# **Drug Drug Interaction**

## **Import Libraries**

In [72]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [73]:
import pandas as pd
import os
import xml.etree.ElementTree as ET
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## **Dataset**

#### Parse the Dataset


In [74]:
def parse_ddi_corpus(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    for sentence in root.iter('sentence'):
        sent_text = sentence.attrib['text']
        entities = sentence.findall('entity')
        pairs = sentence.findall('pair')

        if len(entities) == 1:
            data.append([entities[0].attrib['text'], 'NULL', sent_text, 'False'])
        else:
            for pair in pairs:
                e1 = pair.attrib['e1']
                e2 = pair.attrib['e2']
                interaction = pair.attrib['ddi']

                e1_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e1)
                e2_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e2)

                data.append([e1_text, e2_text, sent_text, interaction])

    df = pd.DataFrame(data, columns=['Drug1', 'Drug2', 'Sentence', 'Interaction'])
    return df

def parse_all_ddi_files(directory_path):
        all_data = []

        for root, dirs, files in os.walk(directory_path):
            for file in files:
                if file.endswith('.xml'):
                    file_path = os.path.join(root, file)
                    df = parse_ddi_corpus(file_path)
                    all_data.append(df)

        combined_df = pd.concat(all_data, ignore_index=True)
        return combined_df

directory_path = '/kaggle/input/ddicorpus/DDICorpus/Train/DrugBank'
directory_path_2 = '/kaggle/input/ddicorpus/DDICorpus/Train/MedLine' 
directory_path_test = '/kaggle/input/ddicorpus/DDICorpus/Test/Test for DDI Extraction task/DrugBank'
directory_path_test_2 = '/kaggle/input/ddicorpus/DDICorpus/Test/Test for DDI Extraction task/MedLine'

In [None]:
df_1 = parse_all_ddi_files(directory_path)
df_2 = parse_all_ddi_files(directory_path_2)
df = pd.concat([df_1, df_2], ignore_index=True)


test_df_1 = parse_all_ddi_files(directory_path_test)
test_df_2 = parse_all_ddi_files(directory_path_test_2)
test_df = pd.concat([test_df_1, test_df_2], ignore_index=True)
df.head()
test_df.head()

#### Drop the rows with Drug1 or Drug2 as NULL


In [76]:
df = df[(df['Drug1'] != 'NULL') & (df['Drug2'] != 'NULL')]
test_df = test_df[(test_df['Drug1'] != 'NULL') & (test_df['Drug2'] != 'NULL')]
df


Unnamed: 0,Drug1,Drug2,Sentence,Interaction
0,Ketoconazole,Itraconazole,"Ketoconazole/Itraconazole, Macrolides, Includi...",false
1,Ketoconazole,Macrolides,"Ketoconazole/Itraconazole, Macrolides, Includi...",false
2,Ketoconazole,Erythromycin,"Ketoconazole/Itraconazole, Macrolides, Includi...",false
3,Itraconazole,Macrolides,"Ketoconazole/Itraconazole, Macrolides, Includi...",false
4,Itraconazole,Erythromycin,"Ketoconazole/Itraconazole, Macrolides, Includi...",false
...,...,...,...,...
29558,filipin,amphotericin B,Both the toxicity of filipin and the therapeut...,false
29559,filipin,polyene antibiotics,Both the toxicity of filipin and the therapeut...,false
29560,amphotericin B,polyene antibiotics,Both the toxicity of filipin and the therapeut...,false
29561,filipin,amphotericin B,filipin was more potent in lysing human red bl...,false


#### Convert 'Interaction' column to binary labels (1 for true, 0 for false)


In [78]:
df['Interaction'] = df['Interaction'].apply(lambda x: 1 if x == 'true' else 0)
test_df['Interaction'] = test_df['Interaction'].apply(lambda x: 1 if x == 'true' else 0)

## **Model**

#### Load BioBERT and tokenizer

In [77]:
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Custom Dataset class for the DDI data

In [None]:
class DDIDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        sentence = f"{row['Sentence'].replace(row['Drug1'], '[Drug1]').replace(row['Drug2'], '[Drug2]')}"
        inputs = self.tokenizer(sentence, padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label = row['Interaction']  # Use the binary labels (0 or 1)
        return input_ids, attention_mask, label


In [None]:
train_dataset = DDIDataset(df, tokenizer)
test_dataset = DDIDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

#### Training function

In [None]:
def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


#### Evaluation function

In [None]:
def evaluate(model, test_loader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [item.to(device) for item in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    return accuracy, report


#### Train the model

In [None]:
# Initialize the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss}")


#### Evaluate the model

In [81]:
accuracy, report = evaluate(model, test_loader, device)
print(f"Test Accuracy: {accuracy}")
print(report)

Epoch 1/3, Loss: 0.1710170605734621
Epoch 2/3, Loss: 0.09338661233484895
Epoch 3/3, Loss: 0.07077675234118506
Test Accuracy: 0.928446466060182
              precision    recall  f1-score   support

           0       0.98      0.93      0.96      4737
           1       0.74      0.90      0.81       979

    accuracy                           0.93      5716
   macro avg       0.86      0.92      0.88      5716
weighted avg       0.94      0.93      0.93      5716



## Save the model

In [82]:
output_dir = "saved_biobert_model"
os.makedirs(output_dir, exist_ok=True)
# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('saved_biobert_model/tokenizer_config.json',
 'saved_biobert_model/special_tokens_map.json',
 'saved_biobert_model/vocab.txt',
 'saved_biobert_model/added_tokens.json',
 'saved_biobert_model/tokenizer.json')