## Import Libraries

In [9]:
import pickle
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from joblib import load
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Evaluate the model performance
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import joblib


## Import Models

Load the saved fine-tuned BioBERT model and tokenizer

In [10]:
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_model = AutoModelForSequenceClassification.from_pretrained("./model/saved_biobert_model/")  # Replace with your model path

Load the Classification Model 

In [12]:
logistic_model = load("./model/results/svm_model.pkl")
random_forest_model = load("./model/results/xgb_model.pkl")

device = "mps" if torch.backends.mps.is_available() else "cpu"
biobert_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Load the Word2Vec Model

In [13]:
model_path = "./model/model.bin"

# Load the model with memory mapping
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

print("Model loaded successfully!")

word_vector = model['drugs']  

Model loaded successfully!


In [15]:
def get_biobert_prediction(drug1, drug2, sentence, model, tokenizer, device):
    # Replace drug names with placeholders for consistency
    sentence = sentence.replace(drug1, "[Drug1]").replace(drug2, "[Drug2]")
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1).cpu().numpy()  # Get probability scores
    return probs[0]

In [16]:
def get_classic_model_prediction(drug1, drug2, model, vectorizer):
    # Vectorize each drug name
    drug1_vec = vectorizer.transform([drug1]).toarray()
    drug2_vec = vectorizer.transform([drug2]).toarray()
    combined_vector = list(drug1_vec[0]) + list(drug2_vec[0])  # Concatenate vectors for both drugs
    probs = model.predict_proba([combined_vector])[0]  # Get probability prediction
    return probs

## Load Dataset

In [17]:
def parse_ddi_corpus(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    for sentence in root.iter('sentence'):
        sent_text = sentence.attrib['text']
        entities = sentence.findall('entity')
        pairs = sentence.findall('pair')

        if len(entities) == 1:
            data.append([entities[0].attrib['text'], 'NULL', sent_text, 'False'])
        else:
            for pair in pairs:
                e1 = pair.attrib['e1']
                e2 = pair.attrib['e2']
                interaction = pair.attrib['ddi']

                e1_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e1)
                e2_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e2)

                data.append([e1_text, e2_text, sent_text, interaction])

    df = pd.DataFrame(data, columns=['Drug1', 'Drug2', 'Sentence', 'Interaction'])
    return df

def parse_all_ddi_files(directory_paths):
    all_data = []

    for directory_path in directory_paths:
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                if file.endswith('.xml'):
                    file_path = os.path.join(root, file)
                    df = parse_ddi_corpus(file_path)
                    all_data.append(df)

    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

# Specify the paths to both directories
directory_paths = [
    '../Dataset/DDICorpus/Train/DrugBank',
    '../Dataset/DDICorpus/Train/MedLine'
]

df = parse_all_ddi_files(directory_paths)

# Filter out rows where 'Drug1' or 'Drug2' is 'NULL'
df = df[(df['Drug1'] != 'NULL') & (df['Drug2'] != 'NULL')]

# Define the new directory path for the test data
# new_directory_paths = [
#     '../Dataset/DDICorpus/Test/Test for DDI Extraction task/DrugBank',
#     '../Dataset/DDICorpus/Test/Test for DDI Extraction task/MedLine'
# ]

# # Parse and combine data from the new directory
# new_test_df = parse_all_ddi_files(new_directory_paths)

# Filter out rows where 'Drug1' or 'Drug2' is 'NULL' (if needed)
# test_df = new_test_df[(new_test_df['Drug1'] != 'NULL') & (new_test_df['Drug2'] != 'NULL')]

train_df=df
# # Display the first few rows of the new test set
# test_df.head()

In [18]:
train_df.head()

Unnamed: 0,Drug1,Drug2,Sentence,Interaction
0,calcium,EMCYT,"Milk, milk products, and calcium-rich foods or...",True
1,allopurinol,ampicillin,The concurrent administration of allopurinol a...,True
2,allopurinol,ampicillin,The concurrent administration of allopurinol a...,False
3,ampicillin,ampicillin,The concurrent administration of allopurinol a...,False
4,ampicillin,allopurinol,It is not known whether this potentiation of a...,True


Load TwoSides Dataset as Test Dataset.

In [35]:
## Load the TwoSidesCombined.csv file 
file_path = '../Dataset/twosides_generated_sentences.csv'
test_df = pd.read_csv(file_path)

test_df 

# could you change interaction from 1/0 to True/False
test_df['Interaction'] = test_df['Interaction'].map({1: "true", 0: "false"})

In [36]:
test_df 

## Randomly select 27792 rows from the test_df
test_df = test_df.sample(n=27792, random_state=42)
test_df 

Unnamed: 0,drug_1_concept_name,drug_2_concept_name,generated_sentence,Interaction
87672,salmeterol,Fenofibrate,salmeterol and Fenofibrate do not interact thr...,false
348340,ginger root,belimumab,ginger root does not affect the biotransformat...,false
363610,valacyclovir,Procarbazine,valacyclovir administered alongside Procarbazi...,true
415440,Mechlorethamine,Morphine,Mechlorethamine and Morphine may be co-adminis...,false
34268,ELOSULFASE ALFA,Dalteparin,ELOSULFASE ALFA does not impact the efficacy a...,false
...,...,...,...,...
2856,Melphalan,Potassium Aspartate,Melphalan and Potassium Aspartate do not inter...,true
282208,tegaserod,Amiodarone,tegaserod and Amiodarone together do not typic...,true
136256,Midazolam,Petrolatum,Midazolam administered with Petrolatum has not...,true
256765,Bromazepam,Aprotinin,Bromazepam does not alter the pharmacodynamics...,false


In [31]:
def get_drug_embeddings(drug, model):
    drug_vectors=np.array([])
    if drug in model:
        drug_vectors = model[drug]
    else:
        pass
    return drug_vectors

In [32]:
def get_embeddings(drug1, drug2, expected_dim=200):
    embedding1 = get_drug_embeddings(drug1, model)  
    embedding2 = get_drug_embeddings(drug2, model)  

    if isinstance(embedding1, dict):
        embedding1 = np.array(list(embedding1.values()))
    if isinstance(embedding2, dict):
        embedding2 = np.array(list(embedding2.values()))

    if embedding1 is None or embedding1.size == 0:
        embedding1 = np.zeros(expected_dim)  # Pad missing embeddings
    if embedding2 is None or embedding2.size == 0:
        embedding2 = np.zeros(expected_dim)

    if embedding1.shape[0] != expected_dim:
        embedding1 = np.pad(embedding1, (0, expected_dim - embedding1.shape[0]), 'constant')

    if embedding2.shape[0] != expected_dim:
        embedding2 = np.pad(embedding2, (0, expected_dim - embedding2.shape[0]), 'constant')

    combined_embedding = np.concatenate((embedding1, embedding2))

    return combined_embedding

stacking_features = []
valid_indices = []  

for index, row in train_df.iterrows():
    drug1, drug2 = row['Drug1'], row['Drug2'] 
    try:
        drug_embeddings = get_embeddings(drug1, drug2)

        biobert_probs = get_biobert_prediction(drug1, drug2, row['Sentence'], biobert_model, biobert_tokenizer, device)

        reshaped_embeddings = drug_embeddings.reshape(1, -1)  # Reshape for single sample

        logistic_probs = logistic_model.predict(reshaped_embeddings)
        random_forest_probs = random_forest_model.predict(reshaped_embeddings)

        # Combine predictions as stacking features
        stacking_features.append(list(biobert_probs) + logistic_probs.tolist() + random_forest_probs.tolist())

        # Record the index as valid
        valid_indices.append(index)

    except ValueError as e:
        print(f"Error processing row {index}: {e}")

X_stacking = pd.DataFrame(stacking_features)
y_stacking = train_df.loc[valid_indices, 'Interaction'].reset_index(drop=True)

In [33]:
# Initialize the meta-classifier
stacking_model = LogisticRegression()

# Train the stacking model
stacking_model.fit(X_stacking, y_stacking)

# Predictions on the test data (optional if you already have a test set)
y_pred = stacking_model.predict(X_stacking)
print("Stacking Model Accuracy:", accuracy_score(y_stacking, y_pred))
print(classification_report(y_stacking, y_pred))

Stacking Model Accuracy: 0.9770437535981578
              precision    recall  f1-score   support

       false       0.98      0.99      0.99     23771
        true       0.93      0.91      0.92      4021

    accuracy                           0.98     27792
   macro avg       0.96      0.95      0.95     27792
weighted avg       0.98      0.98      0.98     27792



In [37]:
def get_embeddings(drug1, drug2, expected_dim=200):
    embedding1 = get_drug_embeddings(drug1, model)  # Fetch embedding for drug1
    embedding2 = get_drug_embeddings(drug2, model)  # Fetch embedding for drug2

    # Convert to numpy array if needed
    if isinstance(embedding1, dict):
        embedding1 = np.array(list(embedding1.values()))
    if isinstance(embedding2, dict):
        embedding2 = np.array(list(embedding2.values()))

    # Ensure embeddings are valid arrays
    if embedding1 is None or embedding1.size == 0:
        embedding1 = np.zeros(expected_dim)  # Pad missing embeddings
    if embedding2 is None or embedding2.size == 0:
        embedding2 = np.zeros(expected_dim)

    # Ensure embeddings have the correct shape
    if embedding1.shape[0] != expected_dim:
        embedding1 = np.pad(embedding1, (0, expected_dim - embedding1.shape[0]), 'constant')

    if embedding2.shape[0] != expected_dim:
        embedding2 = np.pad(embedding2, (0, expected_dim - embedding2.shape[0]), 'constant')

    # Concatenate the two embeddings to form a 400-d vector
    combined_embedding = np.concatenate((embedding1, embedding2))

    return combined_embedding
stacking_features = []
errors = []

# Process each test row
for index, row in test_df.iterrows():
    drug1, drug2 = row['drug_1_concept_name'], row['drug_2_concept_name']
    
    try:
        # Get embeddings
        drug_embeddings = get_embeddings(drug1, drug2)
        if drug_embeddings.size == 0:
            raise ValueError("Empty embeddings for drug pair.")

        # Generate predictions from individual models
        biobert_probs = get_biobert_prediction(drug1, drug2, row['generated_sentence'], biobert_model, biobert_tokenizer, device)

        reshaped_embeddings = drug_embeddings.reshape(1, -1)  # Reshape for single sample
        logistic_probs = logistic_model.predict(reshaped_embeddings)
        random_forest_probs = random_forest_model.predict(reshaped_embeddings)

        # Combine all predictions for stacking
        stacking_features.append(list(biobert_probs) + logistic_probs.tolist() + random_forest_probs.tolist())

    except ValueError as e:
        print(f"Error processing row {index}: {e}")
        errors.append(index)  # Track rows that caused errors

# Remove corresponding labels from y_stacking for rows with errors
test_df_cleaned = test_df.drop(errors)
X_stacking_test = pd.DataFrame(stacking_features)
y_stacking_test = test_df_cleaned['Interaction'].values  # Binary labels

# Make final predictions with the stacking model
stacking_predictions = stacking_model.predict(X_stacking_test)


print("Classification Report:")
print(classification_report(y_stacking_test, stacking_predictions))
print("Accuracy:", accuracy_score(y_stacking_test, stacking_predictions))

Classification Report:
              precision    recall  f1-score   support

       false       0.50      0.98      0.66     13904
        true       0.52      0.02      0.04     13888

    accuracy                           0.50     27792
   macro avg       0.51      0.50      0.35     27792
weighted avg       0.51      0.50      0.35     27792

Accuracy: 0.5009715025906736


In [14]:
# Save the trained stacking model
joblib.dump(stacking_model, "stacking_model.pkl")

print("Models saved successfully!")

Models saved successfully!


In [16]:
from xgboost import XGBClassifier  # Import XGBClassifier

# Convert string labels to integers (if needed)
y_stacking = np.where(y_stacking == 'true', 1, 0)
y_stacking_test = np.where(y_stacking_test == 'true', 1, 0)

# Initialize different stacking models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_stacking, y_stacking)
    
    # Predictions on training data
    y_pred = model.predict(X_stacking)
    print(f"{name} Stacking Model Accuracy:", accuracy_score(y_stacking, y_pred))
    print(classification_report(y_stacking, y_pred))
    
    # Make final predictions on test data
    stacking_predictions = model.predict(X_stacking_test)
    
    # Evaluate the model performance
    print("Classification Report:")
    print(classification_report(y_stacking_test, stacking_predictions))
    print("Accuracy:", accuracy_score(y_stacking_test, stacking_predictions))



Training Logistic Regression...


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [17]:
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_stacking.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("\nTraining Neural Network...")
nn_model.fit(X_stacking, y_stacking, epochs=20, batch_size=16, verbose=1, validation_split=0.1)

# Make predictions
nn_predictions = (nn_model.predict(X_stacking_test) > 0.5).astype(int)

# Evaluate the neural network
print("Neural Network Classification Report:")
print(classification_report(y_stacking_test, nn_predictions))
print("Neural Network Accuracy:", accuracy_score(y_stacking_test, nn_predictions))



Training Neural Network...
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 487us/step - accuracy: 0.9977 - loss: 0.0313 - val_accuracy: 1.0000 - val_loss: 3.8771e-06
Epoch 2/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 403us/step - accuracy: 1.0000 - loss: 3.5819e-06 - val_accuracy: 1.0000 - val_loss: 6.1397e-07
Epoch 3/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 402us/step - accuracy: 1.0000 - loss: 6.8093e-07 - val_accuracy: 1.0000 - val_loss: 1.4341e-07
Epoch 4/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 402us/step - accuracy: 1.0000 - loss: 3.3799e-07 - val_accuracy: 1.0000 - val_loss: 3.7141e-08
Epoch 5/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 403us/step - accuracy: 1.0000 - loss: 1.4414e-07 - val_accuracy: 1.0000 - val_loss: 9.5523e-09
Epoch 6/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 401us/step - accuracy: 1.0000 - loss: 3.5127e-08 - val_accuracy: 1