## Load data

In [3]:
from pathlib import Path
import pandas as pd


usecols = ['food_entity', 'disease_entity', 'sentence', 'is_cause', 'is_treat']
fd_df = pd.read_csv(
    Path('../data/food_disease.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usecols).rename({'food_entity': 'term1', 'disease_entity': 'term2'}, axis=1)
fd_df['sentence'] = fd_df['sentence'].map(lambda x: x.lower())
fd_df = fd_df[fd_df.apply(lambda x: x['term1'] in x['sentence'] and x['term2'] in x['sentence'], axis=1)]
label_cols = ['is_cause', 'is_treat']
fd_df['is_cause'] = fd_df['is_cause'].astype(float).astype(int)
fd_df['is_treat'] = fd_df['is_treat'].astype(float).astype(int)
# fd_df

In [None]:
df_cause = pd.read_csv(
    Path('../data/crowd_truth_cause.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=['sentence', 'term1', 'term2']
)
df_cause['is_cause'] = 1
df_cause['is_treat'] = 0
df_treat = pd.read_csv(
    Path('../data/crowd_truth_treat.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=['sentence', 'term1', 'term2']
)
df_treat['is_treat'] = 1
df_treat['is_cause'] = 0
ct_df = df_cause.append(df_treat, ignore_index=True)
# ct_df

In [None]:
fd_df['source'] = 'FoodDisease'
ct_df['source'] = 'CrowdTruth'
df = pd.concat([fd_df, ct_df])

term1 = 'term1'
term2 = 'term2'

df['prp_sent'] = df['sentence']
df['prp_sent'] = df.apply(lambda x: x['prp_sent'].replace(x['term1'], term1), axis=1)
df['prp_sent'] = df.apply(lambda x: x['prp_sent'].replace(x['term2'], term2), axis=1)
df = df[df['prp_sent'].apply(lambda x: term1 in x and term2 in x)]
# df

In [None]:
# !pip install spacy networkx
import spacy
import networkx as nx

nlp = spacy.load('en_core_web_sm')    
doc = nlp(df['sentence'].iloc[0])

def shortest_dep_path(sentence):
    doc = nlp(sentence)
    edges = []
    for token in doc:
        for child in token.children:
            edges.append((
                '{0}'.format(token.lemma_),
                '{0}'.format(child.lemma_)))
    graph = nx.Graph(edges)
    entity1 = term1
    entity2 = term2
    try:
        return nx.shortest_path(graph, source=entity1, target=entity2)
    except Exception as e:
        return []

df['sdp'] = df['prp_sent'].apply(shortest_dep_path)
df = df[df['sdp'].apply(len) > 0]
df['sdp_joined'] = df['sdp'].apply(lambda x: ' '.join(x))
# df

In [20]:
df.to_csv(Path('../data/milestone3/preprocessed.csv'), sep=';', index=False)

In [2]:
import pandas as pd
from pathlib import Path

df = pd.read_csv(Path('../data/milestone3/preprocessed.csv'), sep=';')
df

Unnamed: 0,term1,term2,sentence,is_cause,is_treat,source,prp_sent,sdp,sdp_joined
0,bombax ceiba l. (bombacaceae) fruits,urinary stones,"interestingly, many indian tribes use bombax c...",0,1,FoodDisease,"interestingly, many indian tribes use term1 as...","['term1', 'use', 'as', 'medicine', 'for', 'tre...",term1 use as medicine for treatment of term2
1,ultra-processed food,obesity,ultra-processed food consumption has been asso...,1,0,FoodDisease,term1 consumption has been associated with sev...,"['term1', 'consumption', 'associate', 'with', ...",term1 consumption associate with outcome as term2
2,salt,osteoporosis,salt has notoriously been blamed for causing a...,1,0,FoodDisease,term1 has notoriously been blamed for causing ...,"['term1', 'blame', 'for', 'term2']",term1 blame for term2
3,broiler chicken,footpad dermatitis,in broiler chicken flocks when animal-based me...,0,0,FoodDisease,in term1 flocks when animal-based measures rel...,"['term1', 'flock', 'in', 'identify', 'burn', '...","term1 flock in identify burn , term2"
4,sfp,diabetes,"in short, this study demonstrated that sfp cou...",0,1,FoodDisease,"in short, this study demonstrated that term1 c...","['term1', 'develop', 'as', 'food', 'or', 'prev...",term1 develop as food or prevention of term2
...,...,...,...,...,...,...,...,...,...
7973,PARKINSON'S DISEASE,AMANTADINE,A 61 year old man with PARKINSON'S DISEASE (PD...,0,1,CrowdTruth,A 61 year old man with term1 (PD) developed su...,"['term1', 'with', 'man', 'develop', 'after', '...",term1 with man develop after initiation of tre...
7974,DEPRESSION,IMIPRAMINE,With successful treatment of the patient's dep...,0,1,CrowdTruth,With successful treatment of the patient's dep...,"['term1', 'of', 'treatment', 'with', 'term2']",term1 of treatment with term2
7975,ANGI,BEPRIDIL,Five of 15 patients receiving bepridil did not...,0,1,CrowdTruth,Five of 15 patients receiving bepridil did not...,"['term1', 'of', 'onset', 'to', 'minute', 'with...",term1 of onset to minute with term2
7976,HEMOPHILIA A,FACTOR VIII,The development of antibodies to factor VIII i...,0,1,CrowdTruth,The development of antibodies to factor VIII i...,"['term1', 'with', 'protocol', 'combine', 'term2']",term1 with protocol combine term2


## BERT features with traditional classifier

In [3]:
# !pip install transformers evaluate datasets
from transformers import AutoTokenizer, AutoModel, pipeline

# Idea: use bert embeddings as features for SVC
# Here using shortest dep path makes sense because it gives context between the 2 entities in question

# https://github.com/EmilyAlsentzer/clinicalBERT
tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
model = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer, binary_output=True)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from tqdm.auto import tqdm
import joblib


sent_embeddings = pipe(list(df['prp_sent']))
sdp_embeddings = pipe(list(df['sdp_joined']))

joblib.dump(Path('../data/milestone3/sent_embeddings.pkl'), sent_embeddings)
joblib.dump(Path('../data/milestone3/sdp_embeddings.pkl'), sdp_embeddings)

In [4]:
import joblib
from pathlib import Path

# sent_embeddings = joblib.load(Path('../data/milestone3/sent_embeddings.pkl'))
sdp_embeddings = joblib.load(Path('../data/milestone3/sdp_embeddings.pkl'))

In [9]:
print(len(sdp_embeddings))
print(len(sdp_embeddings[0]))
print(len(sdp_embeddings[0][0]))
print(len(sdp_embeddings[0][0][0]))

7978
1
12
768


In [12]:
# Use embedding of [CLS] token to get approximate sentence representation
sent_cls_embeddings = [x[0][0] for x in sdp_embeddings]
# sdp_cls_embeddings = [x[0][0] for x in sent_embeddings]

In [None]:
# TODO train svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm, datasets
import matplotlib.pyplot as plt
import numpy as np

## BERT with classification head

In [None]:
# https://huggingface.co/docs/transformers/training

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import evaluate
from datasets import Dataset
import numpy as np

dataset = Dataset.from_pandas(df)
dataset

# TODO fix training pipeling (check compute_metrics, check dataset)
model = AutoModelForSequenceClassification.from_pretrained(
    'emilyalsentzer/Bio_ClinicalBERT',
    num_labels=2, problem_type='multi_label_classification'
)
tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)

training_args = TrainingArguments(output_dir='test_trainer', evaluation_strategy='epoch')
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

loading configuration file config.json from cache at C:\Users\alexs/.cache\huggingface\hub\models--emilyalsentzer--Bio_ClinicalBERT\snapshots\41943bf7f983007123c758373c5246305cc536ec\config.json
Model config BertConfig {
  "_name_or_path": "emilyalsentzer/Bio_ClinicalBERT",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at C:\Users\alexs/.cache\huggingface\hub\models--emilyalsentzer--Bio_ClinicalBERT\snapshots\41943bf7f983007123c7583

IndexError: Invalid key: 374 is out of bounds for size 0