## Load data

In [1]:
from pathlib import Path
import pandas as pd

# Load FoodDisease dataset
usecols = ['food_entity', 'disease_entity', 'sentence', 'is_cause', 'is_treat']
fd_df = pd.read_csv(
    Path('../data/food_disease.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usecols).rename({'food_entity': 'term1', 'disease_entity': 'term2'}, axis=1)
fd_df['sentence'] = fd_df['sentence'].map(lambda x: x.lower())
fd_df = fd_df[fd_df.apply(lambda x: x['term1'] in x['sentence'] and x['term2'] in x['sentence'], axis=1)]
label_cols = ['is_cause', 'is_treat']
fd_df['is_cause'] = fd_df['is_cause'].astype(float).astype(int)
fd_df['is_treat'] = fd_df['is_treat'].astype(float).astype(int)
fd_df

Unnamed: 0,term1,term2,sentence,is_cause,is_treat
0,bombax ceiba l. (bombacaceae) fruits,urinary stones,"interestingly, many indian tribes use bombax c...",0,1
1,ultra-processed food,obesity,ultra-processed food consumption has been asso...,1,0
2,salt,osteoporosis,salt has notoriously been blamed for causing a...,1,0
3,broiler chicken,footpad dermatitis,in broiler chicken flocks when animal-based me...,0,0
4,sfp,diabetes,"in short, this study demonstrated that sfp cou...",0,1
...,...,...,...,...,...
603,nuts,atherosclerosis,"thus, the inclusion of food sources of dietary...",0,1
604,persea americana,wilting,glasshouse pathogenicity tests with 'reed' avo...,0,0
605,apple juice,asthma,intake of beverages with excess free fructose ...,1,0
606,coffee,pd,"however, some evidence exists regarding a pote...",0,1


In [2]:
usedcols = ['sentence', 'term1', 'term2']

df_caus = pd.read_csv(
    Path('..', 'data', 'crowd_truth_cause.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usedcols
)
df_caus["is_cause"] = 1
df_caus["is_treat"] = 0
df_treat = pd.read_csv(
    Path('..', 'data', 'crowd_truth_treat.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usedcols
)
df_treat["is_treat"] = 1
df_treat["is_cause"] = 0
ct_df = df_caus.append(df_treat, ignore_index=True)
ct_df

  ct_df = df_caus.append(df_treat, ignore_index=True)


Unnamed: 0,term1,term2,sentence,is_cause,is_treat
0,AUTISM,TANTRUM,"The limited data suggest that, in children wit...",1,0
1,SLEEP PROBLEM,FAMILY STRESS,SLEEP PROBLEMs are associated with difficult b...,1,0
2,CEREBELLAR ATAXIA,DYSFUNCTION OF THE CEREBELLUM,The term CEREBELLAR ATAXIA is employed to indi...,1,0
3,CEREBELLAR DEGENERATION,CHRONIC ETHANOL ABUSE,Non hereditary causes of cerebellar degenerati...,1,0
4,HEART PROBLEM,ARTHRITIS,The disorder can present with a migratory ture...,1,0
...,...,...,...,...,...
7963,PARKINSON'S DISEASE,AMANTADINE,A 61 year old man with PARKINSON'S DISEASE (PD...,0,1
7964,DEPRESSION,IMIPRAMINE,With successful treatment of the patient's dep...,0,1
7965,ANGI,BEPRIDIL,Five of 15 patients receiving bepridil did not...,0,1
7966,HEMOPHILIA A,FACTOR VIII,The development of antibodies to factor VIII i...,0,1


In [3]:
fd_df['source'] = 'FoodDisease'
ct_df['source'] = 'CrowdTruth'
df = pd.concat([fd_df, ct_df])

df['prp_sent'] = df['sentence']
df['prp_sent'] = df.apply(lambda x: x['prp_sent'].replace(x['term1'], '[TERM1]'), axis=1)
df['prp_sent'] = df.apply(lambda x: x['prp_sent'].replace(x['term2'], '[TERM2]'), axis=1)
df

Unnamed: 0,term1,term2,sentence,is_cause,is_treat,source,prp_sent
0,bombax ceiba l. (bombacaceae) fruits,urinary stones,"interestingly, many indian tribes use bombax c...",0,1,FoodDisease,"interestingly, many indian tribes use [TERM1] ..."
1,ultra-processed food,obesity,ultra-processed food consumption has been asso...,1,0,FoodDisease,[TERM1] consumption has been associated with s...
2,salt,osteoporosis,salt has notoriously been blamed for causing a...,1,0,FoodDisease,[TERM1] has notoriously been blamed for causin...
3,broiler chicken,footpad dermatitis,in broiler chicken flocks when animal-based me...,0,0,FoodDisease,in [TERM1] flocks when animal-based measures r...
4,sfp,diabetes,"in short, this study demonstrated that sfp cou...",0,1,FoodDisease,"in short, this study demonstrated that [TERM1]..."
...,...,...,...,...,...,...,...
7963,PARKINSON'S DISEASE,AMANTADINE,A 61 year old man with PARKINSON'S DISEASE (PD...,0,1,CrowdTruth,A 61 year old man with [TERM1] (PD) developed ...
7964,DEPRESSION,IMIPRAMINE,With successful treatment of the patient's dep...,0,1,CrowdTruth,With successful treatment of the patient's dep...
7965,ANGI,BEPRIDIL,Five of 15 patients receiving bepridil did not...,0,1,CrowdTruth,Five of 15 patients receiving bepridil did not...
7966,HEMOPHILIA A,FACTOR VIII,The development of antibodies to factor VIII i...,0,1,CrowdTruth,The development of antibodies to factor VIII i...


## Feature extraction

In [4]:
# !pip install transformers evaluate datasets

In [5]:
from transformers import AutoTokenizer, AutoModel, pipeline

# Idea: use bert embeddings as features for SVC
# Here using shortest dep path makes sense because it gives context between the 2 entities in question

# https://github.com/EmilyAlsentzer/clinicalBERT
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# TODO get embeddings for each input sentence and SDP of each input sentence
# then do usual sklearn crossvalidation classification
pipe('is dis life?')

[[[0.4879814684391022,
   0.566674530506134,
   -0.5663884282112122,
   -0.3814820945262909,
   -0.33466288447380066,
   -0.1778324544429779,
   0.47919297218322754,
   0.11976409703493118,
   0.5079109072685242,
   -0.5354055166244507,
   -0.38260048627853394,
   0.16573670506477356,
   -0.8754660487174988,
   0.22716443240642548,
   -0.6127524971961975,
   0.016451658681035042,
   0.06366363912820816,
   0.006360404193401337,
   -0.02363014407455921,
   -0.08432858437299728,
   -0.047699470072984695,
   0.03774499148130417,
   -0.19593286514282227,
   -0.3779665529727936,
   -0.3851042091846466,
   -0.12227645516395569,
   0.43293309211730957,
   0.5590320229530334,
   -0.11865845322608948,
   0.4241071939468384,
   0.014696234837174416,
   0.32160547375679016,
   -0.17747652530670166,
   0.27901414036750793,
   -0.5282014012336731,
   0.6054015159606934,
   -0.20767387747764587,
   0.38292789459228516,
   0.19679689407348633,
   0.0334688164293766,
   0.1322387456893921,
   0.562116

## Finetuned classifier

In [8]:
# Default encoding
print(tokenizer.encode('[TERM1]'))
print(tokenizer.encode('[TERM2]'))

[101, 164, 1858, 1475, 166, 102]
[101, 164, 1858, 1477, 166, 102]


In [9]:
# Create special tokens for relation candidates (not sure if needed, might be experimented with)
# Not sure if this destroyes some pretrained weights
# https://github.com/huggingface/tokenizers/issues/247

# special_tokens_dict = {'additional_special_tokens': ['[TERM1]','[TERM2]']}
# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# res = model.resize_token_embeddings(len(tokenizer))
# print(res)
# print(tokenizer.encode('[TERM1]'))
# print(tokenizer.encode('[TERM2]'))

In [10]:
for i in range(1):
    sent = df['prp_sent'].iloc[i]
    print(sent)
    print(tokenizer.encode(sent))

interestingly, many indian tribes use [TERM1] as a traditional medicine for the treatment of [TERM2].
[101, 5426, 1193, 117, 1242, 1107, 10359, 6872, 1329, 164, 1858, 1475, 166, 1112, 170, 2361, 5182, 1111, 1103, 3252, 1104, 164, 1858, 1477, 166, 119, 102]


In [11]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['term1', 'term2', 'sentence', 'is_cause', 'is_treat', 'source', 'prp_sent', '__index_level_0__'],
    num_rows: 8563
})

In [12]:
# https://huggingface.co/docs/transformers/training

from transformers import AutoModelForSequenceClassification, pipeline
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# TODO check compute_metrics, check dataset
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=2, problem_type="multi_label_classification"
)
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

IndexError: Invalid key: 374 is out of bounds for size 0