## Getting the Training Data

In [10]:
import pandas as pd

# Column names of the combined table, in the field order of DRUG-AE.rel.
# NOTE(review): "Start_Offset_Dose"/"End_Offset_Dose" hold the two fields that
# directly follow the adverse effect and appear to be its offsets, not a dose's.
# The names are kept unchanged so downstream cells continue to work.
columns = ["PubMed_ID", "Sentence", "Adverse_Effect", "Start_Offset_Dose",
           "End_Offset_Dose", "Drug", "Start_Offset_Drug", "End_Offset_Drug"]


def _parse_relation_line(line, line_no):
    """Split one pipe-delimited DRUG-AE.rel record into its eight fields.

    Raises:
        ValueError: if the record does not contain exactly ``len(columns)`` fields.
    """
    fields = line.split("|")
    if len(fields) != len(columns):
        raise ValueError(
            f"data/DRUG-AE.rel line {line_no}: expected {len(columns)} "
            f"'|'-separated fields, got {len(fields)}"
        )
    return fields


def _parse_negative_line(line, line_no):
    """Split one ADE-NEG.txt record into a row with empty relation fields.

    Only the FIRST "NEG" marker is treated as the separator, so a sentence that
    itself contains "NEG" no longer breaks parsing, and the whitespace around
    the marker is stripped from both the ID and the sentence.

    Raises:
        ValueError: if the record contains no "NEG" marker.
    """
    pubmed_id, marker, sentence = line.partition("NEG")
    if not marker:
        raise ValueError(f"data/ADE-NEG.txt line {line_no}: missing 'NEG' marker")
    padding = [""] * (len(columns) - 2)  # negatives carry no effect/drug/offsets
    return [pubmed_id.strip(), sentence.strip(), *padding]


data = []

# Positive examples: one drug / adverse-effect relation per line.
# The encoding is pinned so parsing does not depend on the platform default.
with open("data/DRUG-AE.rel", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if line:  # tolerate blank lines (e.g. a trailing newline at EOF)
            data.append(_parse_relation_line(line, line_no))

# Negative examples: sentences without any annotated relation.
with open("data/ADE-NEG.txt", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if line:
            data.append(_parse_negative_line(line, line_no))

# Create a pandas DataFrame from the parsed data
df = pd.DataFrame(data, columns=columns)

In [11]:
# Preview the first rows to sanity-check that the fields landed in the right columns.
df.head()

Unnamed: 0,PubMed_ID,Sentence,Adverse_Effect,Start_Offset_Dose,End_Offset_Dose,Drug,Start_Offset_Drug,End_Offset_Drug
0,10030778,Intravenous azithromycin-induced ototoxicity.,ototoxicity,43,54,azithromycin,22,34
1,10048291,"Immobilization, while Paget's bone disease was...",increased calcium-release,960,985,dihydrotachysterol,908,926
2,10048291,Unaccountable severe hypercalcemia in a patien...,hypercalcemia,31,44,dihydrotachysterol,94,112
3,10082597,METHODS: We report two cases of pseudoporphyri...,pseudoporphyria,620,635,naproxen,646,654
4,10082597,METHODS: We report two cases of pseudoporphyri...,pseudoporphyria,620,635,oxaprozin,659,668


We only need the `Sentence`, `Adverse_Effect`, and `Drug` columns.

In [13]:
# Narrow the frame to the sentence text plus the two annotated entity columns.
selected_columns = ['Sentence', 'Adverse_Effect', 'Drug']
df = df[selected_columns]
df.head()

Unnamed: 0,Sentence,Adverse_Effect,Drug
0,Intravenous azithromycin-induced ototoxicity.,ototoxicity,azithromycin
1,"Immobilization, while Paget's bone disease was...",increased calcium-release,dihydrotachysterol
2,Unaccountable severe hypercalcemia in a patien...,hypercalcemia,dihydrotachysterol
3,METHODS: We report two cases of pseudoporphyri...,pseudoporphyria,naproxen
4,METHODS: We report two cases of pseudoporphyri...,pseudoporphyria,oxaprozin


Now we need to split the data into training and validation sets.

In [16]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# The same sentence occurs on several rows (one per drug / adverse-effect pair),
# so a plain row-level split can place one sentence in both train and validation
# and inflate validation scores. Splitting by sentence keeps every copy of a
# sentence on one side. Note that test_size now applies to unique sentences,
# so the row-level validation share is only approximately 15%.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=101)
train_idx, val_idx = next(splitter.split(df, groups=df["Sentence"]))

# GroupShuffleSplit yields indices in their original order; shuffle the rows so
# the resulting frames are randomly ordered, as train_test_split's output was.
train_df = df.iloc[train_idx].sample(frac=1, random_state=101)
val_df = df.iloc[val_idx].sample(frac=1, random_state=101)

In [17]:
from transformers import AutoModel, AutoTokenizer

# Load the pretrained BioBERT tokenizer and encoder by checkpoint name.
model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def _encode_sentences(frame):
    """Tokenize the "Sentence" column of ``frame`` with truncation and padding enabled."""
    sentences = frame["Sentence"].tolist()
    return tokenizer(sentences, truncation=True, padding=True)


# Encode both splits with identical tokenizer settings.
train_encodings = _encode_sentences(train_df)
val_encodings = _encode_sentences(val_df)