## Load data

In [65]:
from pathlib import Path
import pandas as pd

# Load FoodDisease dataset
usecols = ['food_entity', 'disease_entity', 'sentence', 'is_cause', 'is_treat']
fd_df = pd.read_csv(
    Path('../data/food_disease.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usecols).rename({'food_entity': 'term1', 'disease_entity': 'term2'}, axis=1)
fd_df['sentence'] = fd_df['sentence'].map(lambda x: x.lower())
fd_df

Unnamed: 0,term1,term2,sentence,is_cause,is_treat
0,bombax ceiba l. (bombacaceae) fruits,urinary stones,"interestingly, many indian tribes use bombax c...",0.0,1.0
1,ultra-processed food,obesity,ultra-processed food consumption has been asso...,1.0,0.0
2,salt,osteoporosis,salt has notoriously been blamed for causing a...,1.0,0.0
3,broiler chicken,footpad dermatitis,in broiler chicken flocks when animal-based me...,0.0,0.0
4,sfp,diabetes,"in short, this study demonstrated that sfp cou...",0.0,1.0
...,...,...,...,...,...
603,nuts,atherosclerosis,"thus, the inclusion of food sources of dietary...",0.0,1.0
604,persea americana,wilting,glasshouse pathogenicity tests with 'reed' avo...,0.0,0.0
605,apple juice,asthma,intake of beverages with excess free fructose ...,1.0,0.0
606,coffee,pd,"however, some evidence exists regarding a pote...",0.0,1.0


In [66]:
usedcols = ['sentence', 'term1', 'term2']

df_caus = pd.read_csv(
    Path('..', 'data', 'crowd_truth_cause.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usedcols
)
df_caus["is_cause"] = 1
df_caus["is_treat"] = 0
df_treat = pd.read_csv(
    Path('..', 'data', 'crowd_truth_treat.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usedcols
)
df_treat["is_treat"] = 1
df_treat["is_cause"] = 0
ct_df = df_caus.append(df_treat, ignore_index=True)
ct_df

  ct_df = df_caus.append(df_treat, ignore_index=True)


Unnamed: 0,term1,term2,sentence,is_cause,is_treat
0,AUTISM,TANTRUM,"The limited data suggest that, in children wit...",1,0
1,SLEEP PROBLEM,FAMILY STRESS,SLEEP PROBLEMs are associated with difficult b...,1,0
2,CEREBELLAR ATAXIA,DYSFUNCTION OF THE CEREBELLUM,The term CEREBELLAR ATAXIA is employed to indi...,1,0
3,CEREBELLAR DEGENERATION,CHRONIC ETHANOL ABUSE,Non hereditary causes of cerebellar degenerati...,1,0
4,HEART PROBLEM,ARTHRITIS,The disorder can present with a migratory ture...,1,0
...,...,...,...,...,...
7963,PARKINSON'S DISEASE,AMANTADINE,A 61 year old man with PARKINSON'S DISEASE (PD...,0,1
7964,DEPRESSION,IMIPRAMINE,With successful treatment of the patient's dep...,0,1
7965,ANGI,BEPRIDIL,Five of 15 patients receiving bepridil did not...,0,1
7966,HEMOPHILIA A,FACTOR VIII,The development of antibodies to factor VIII i...,0,1


In [67]:
fd_df['source'] = 'FoodDisease'
ct_df['source'] = 'CrowdTruth'
df = pd.concat([fd_df, ct_df])

df['prp_sent'] = df['sentence']
df['prp_sent'] = df.apply(lambda x: x['prp_sent'].replace(x['term1'], '[TERM1]'), axis=1)
df['prp_sent'] = df.apply(lambda x: x['prp_sent'].replace(x['term2'], '[TERM2]'), axis=1)
df

Unnamed: 0,term1,term2,sentence,is_cause,is_treat,source,prp_sent
0,bombax ceiba l. (bombacaceae) fruits,urinary stones,"interestingly, many indian tribes use bombax c...",0.0,1.0,FoodDisease,"interestingly, many indian tribes use [TERM1] ..."
1,ultra-processed food,obesity,ultra-processed food consumption has been asso...,1.0,0.0,FoodDisease,[TERM1] consumption has been associated with s...
2,salt,osteoporosis,salt has notoriously been blamed for causing a...,1.0,0.0,FoodDisease,[TERM1] has notoriously been blamed for causin...
3,broiler chicken,footpad dermatitis,in broiler chicken flocks when animal-based me...,0.0,0.0,FoodDisease,in [TERM1] flocks when animal-based measures r...
4,sfp,diabetes,"in short, this study demonstrated that sfp cou...",0.0,1.0,FoodDisease,"in short, this study demonstrated that [TERM1]..."
...,...,...,...,...,...,...,...
7963,PARKINSON'S DISEASE,AMANTADINE,A 61 year old man with PARKINSON'S DISEASE (PD...,0,1.0,CrowdTruth,A 61 year old man with [TERM1] (PD) developed ...
7964,DEPRESSION,IMIPRAMINE,With successful treatment of the patient's dep...,0,1.0,CrowdTruth,With successful treatment of the patient's dep...
7965,ANGI,BEPRIDIL,Five of 15 patients receiving bepridil did not...,0,1.0,CrowdTruth,Five of 15 patients receiving bepridil did not...
7966,HEMOPHILIA A,FACTOR VIII,The development of antibodies to factor VIII i...,0,1.0,CrowdTruth,The development of antibodies to factor VIII i...


## Load model

In [1]:
# !pip install transformers



In [77]:
from transformers import AutoTokenizer, AutoModel

# https://github.com/EmilyAlsentzer/clinicalBERT
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

print(tokenizer.encode('[TERM1]'))
print(tokenizer.encode('[TERM2]'))

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[101, 164, 1858, 1475, 166, 102]
[101, 164, 1858, 1477, 166, 102]


In [81]:
# Create special tokens for relation candidates (not sure if needed, might be experimented with)
# Not sure if this destroyes some pretrained weights
# https://github.com/huggingface/tokenizers/issues/247

special_tokens_dict = {'additional_special_tokens': ['[TERM1]','[TERM2]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(28998, 768)

In [82]:
print(tokenizer.encode('[TERM1]'))
print(tokenizer.encode('[TERM2]'))

[101, 28996, 102]
[101, 28997, 102]


In [61]:
for i in range(5):
    sent = df['prp_sent'].iloc[i]
    print(sent)
    print(tokenizer.encode(sent))

interestingly, many indian tribes use TERM1 as a traditional medicine for the treatment of TERM2.
[101, 5426, 1193, 117, 1242, 1107, 10359, 6872, 1329, 1858, 1475, 1112, 170, 2361, 5182, 1111, 1103, 3252, 1104, 1858, 1477, 119, 102]
TERM1 consumption has been associated with several health outcomes such as TERM2, hypertension, cardiovascular disease and cancer.
[101, 1858, 1475, 8160, 1144, 1151, 2628, 1114, 1317, 2332, 13950, 1216, 1112, 1858, 1477, 117, 177, 24312, 23826, 1988, 117, 3621, 25575, 3653, 1105, 4182, 119, 102]
TERM1 has notoriously been blamed for causing an increase in the urinary excretion of calcium, and thus is a considered a risk factor for TERM2.
[101, 1858, 1475, 1144, 14140, 1193, 1151, 11289, 1111, 3989, 1126, 2773, 1107, 1103, 190, 9324, 1616, 4252, 13782, 2116, 1104, 15355, 117, 1105, 2456, 1110, 170, 1737, 170, 3187, 5318, 1111, 1858, 1477, 119, 102]
in TERM1 flocks when animal-based measures related to TERM2, hock burns, body lesions and arthritis are identi

In [3]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          