In [None]:
import pickle
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from joblib import load
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Load the saved fine-tuned BioBERT model and tokenizer
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/full-dataset-fine-tuned-ddi/saved_biobert_model/")  # Replace with your model path

logistic_model = load("/kaggle/input/fork-of-pipeline2/svm_model.pkl")
random_forest_model = load("/kaggle/input/fork-of-pipeline2/xgb_model.pkl")

# Set up device for BioBERT (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
biobert_model.to(device)

In [None]:
def get_biobert_prediction(drug1, drug2, sentence, model, tokenizer, device):
    # Replace drug names with placeholders for consistency
    sentence = sentence.replace(drug1, "[Drug1]").replace(drug2, "[Drug2]")
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1).cpu().numpy()  # Get probability scores
    return probs[0]

In [None]:
def get_classic_model_prediction(drug1, drug2, model, vectorizer):
    # Vectorize each drug name
    drug1_vec = vectorizer.transform([drug1]).toarray()
    drug2_vec = vectorizer.transform([drug2]).toarray()
    combined_vector = list(drug1_vec[0]) + list(drug2_vec[0])  # Concatenate vectors for both drugs
    probs = model.predict_proba([combined_vector])[0]  # Get probability prediction
    return probs

In [None]:
def parse_ddi_corpus(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    for sentence in root.iter('sentence'):
        sent_text = sentence.attrib['text']
        entities = sentence.findall('entity')
        pairs = sentence.findall('pair')

        if len(entities) == 1:
            data.append([entities[0].attrib['text'], 'NULL', sent_text, 'False'])
        else:
            for pair in pairs:
                e1 = pair.attrib['e1']
                e2 = pair.attrib['e2']
                interaction = pair.attrib['ddi']

                e1_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e1)
                e2_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e2)

                data.append([e1_text, e2_text, sent_text, interaction])

    df = pd.DataFrame(data, columns=['Drug1', 'Drug2', 'Sentence', 'Interaction'])
    return df

def parse_all_ddi_files(directory_paths):
    all_data = []

    for directory_path in directory_paths:
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                if file.endswith('.xml'):
                    file_path = os.path.join(root, file)
                    df = parse_ddi_corpus(file_path)
                    all_data.append(df)

    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

# Specify the paths to both directories
directory_paths = [
    '/kaggle/input/ddicorpus/DDICorpus/Train/DrugBank',
    '/kaggle/input/ddicorpus/DDICorpus/Train/MedLine'
]

df = parse_all_ddi_files(directory_paths)

# Filter out rows where 'Drug1' or 'Drug2' is 'NULL'
df = df[(df['Drug1'] != 'NULL') & (df['Drug2'] != 'NULL')]

# Define the new directory path for the test data
new_directory_paths = [
    '/kaggle/input/ddicorpus/DDICorpus/Test/Test for DDI Extraction task/DrugBank',
    '/kaggle/input/ddicorpus/DDICorpus/Test/Test for DDI Extraction task/MedLine'
]

# Parse and combine data from the new directory
new_test_df = parse_all_ddi_files(new_directory_paths)

# Filter out rows where 'Drug1' or 'Drug2' is 'NULL' (if needed)
test_df = new_test_df[(new_test_df['Drug1'] != 'NULL') & (new_test_df['Drug2'] != 'NULL')]

train_df=df
# Display the first few rows of the new test set
test_df.head()

Unnamed: 0,Drug1,Drug2,Sentence,Interaction
0,Pilocarpine,beta adrenergic antagonists,Pilocarpine should be administered with cautio...,True
3,atropine,ipratropium,These effects should be considered when antich...,False
4,acetylsalicylic acid,calcium,While no formal drug interaction studies have ...,False
5,acetylsalicylic acid,conjugated estrogens,While no formal drug interaction studies have ...,False
6,acetylsalicylic acid,hydroxychloroquine sulfate,While no formal drug interaction studies have ...,False


In [None]:
model_path = "/kaggle/input/word2vec-embeddings/model.bin"

# Load the model with memory mapping
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

print("Model loaded successfully!")

word_vector = model['drugs']  

Model loaded successfully!


In [None]:
def get_drug_embeddings(drug, model):
    drug_vectors=np.array([])
    if drug in model:
        drug_vectors = model[drug]
    else:
        pass
    return drug_vectors

In [None]:
def get_embeddings(drug1, drug2, expected_dim=200):
    embedding1 = get_drug_embeddings(drug1, model)  
    embedding2 = get_drug_embeddings(drug2, model)  

    if isinstance(embedding1, dict):
        embedding1 = np.array(list(embedding1.values()))
    if isinstance(embedding2, dict):
        embedding2 = np.array(list(embedding2.values()))

    if embedding1 is None or embedding1.size == 0:
        embedding1 = np.zeros(expected_dim)  # Pad missing embeddings
    if embedding2 is None or embedding2.size == 0:
        embedding2 = np.zeros(expected_dim)

    if embedding1.shape[0] != expected_dim:
        embedding1 = np.pad(embedding1, (0, expected_dim - embedding1.shape[0]), 'constant')

    if embedding2.shape[0] != expected_dim:
        embedding2 = np.pad(embedding2, (0, expected_dim - embedding2.shape[0]), 'constant')

    combined_embedding = np.concatenate((embedding1, embedding2))

    return combined_embedding

stacking_features = []
valid_indices = []  

for index, row in train_df.iterrows():
    drug1, drug2 = row['Drug1'], row['Drug2'] 
    try:
        drug_embeddings = get_embeddings(drug1, drug2)

        biobert_probs = get_biobert_prediction(drug1, drug2, row['Sentence'], biobert_model, biobert_tokenizer, device)

        reshaped_embeddings = drug_embeddings.reshape(1, -1)  # Reshape for single sample

        logistic_probs = logistic_model.predict(reshaped_embeddings)
        random_forest_probs = random_forest_model.predict(reshaped_embeddings)

        # Combine predictions as stacking features
        stacking_features.append(list(biobert_probs) + logistic_probs.tolist() + random_forest_probs.tolist())

        # Record the index as valid
        valid_indices.append(index)

    except ValueError as e:
        print(f"Error processing row {index}: {e}")

X_stacking = pd.DataFrame(stacking_features)
y_stacking = train_df.loc[valid_indices, 'Interaction'].reset_index(drop=True)


In [None]:
# Initialize the meta-classifier
stacking_model = LogisticRegression()

# Train the stacking model
stacking_model.fit(X_stacking, y_stacking)

# Predictions on the test data (optional if you already have a test set)
y_pred = stacking_model.predict(X_stacking)
print("Stacking Model Accuracy:", accuracy_score(y_stacking, y_pred))
print(classification_report(y_stacking, y_pred))


Stacking Model Accuracy: 0.9770437535981578
              precision    recall  f1-score   support

       false       0.98      0.99      0.99     23771
        true       0.93      0.91      0.92      4021

    accuracy                           0.98     27792
   macro avg       0.96      0.95      0.95     27792
weighted avg       0.98      0.98      0.98     27792



In [None]:
def get_embeddings(drug1, drug2, expected_dim=200):
    embedding1 = get_drug_embeddings(drug1, model)  # Fetch embedding for drug1
    embedding2 = get_drug_embeddings(drug2, model)  # Fetch embedding for drug2

    # Convert to numpy array if needed
    if isinstance(embedding1, dict):
        embedding1 = np.array(list(embedding1.values()))
    if isinstance(embedding2, dict):
        embedding2 = np.array(list(embedding2.values()))

    # Ensure embeddings are valid arrays
    if embedding1 is None or embedding1.size == 0:
        embedding1 = np.zeros(expected_dim)  # Pad missing embeddings
    if embedding2 is None or embedding2.size == 0:
        embedding2 = np.zeros(expected_dim)

    # Ensure embeddings have the correct shape
    if embedding1.shape[0] != expected_dim:
        embedding1 = np.pad(embedding1, (0, expected_dim - embedding1.shape[0]), 'constant')

    if embedding2.shape[0] != expected_dim:
        embedding2 = np.pad(embedding2, (0, expected_dim - embedding2.shape[0]), 'constant')

    # Concatenate the two embeddings to form a 400-d vector
    combined_embedding = np.concatenate((embedding1, embedding2))

    return combined_embedding
stacking_features = []
errors = []

# Process each test row
for index, row in test_df.iterrows():
    drug1, drug2 = row['Drug1'], row['Drug2']
    
    try:
        # Get embeddings
        drug_embeddings = get_embeddings(drug1, drug2)
        if drug_embeddings.size == 0:
            raise ValueError("Empty embeddings for drug pair.")

        # Generate predictions from individual models
        biobert_probs = get_biobert_prediction(drug1, drug2, row['Sentence'], biobert_model, biobert_tokenizer, device)

        reshaped_embeddings = drug_embeddings.reshape(1, -1)  # Reshape for single sample
        logistic_probs = logistic_model.predict(reshaped_embeddings)
        random_forest_probs = random_forest_model.predict(reshaped_embeddings)

        # Combine all predictions for stacking
        stacking_features.append(list(biobert_probs) + logistic_probs.tolist() + random_forest_probs.tolist())

    except ValueError as e:
        print(f"Error processing row {index}: {e}")
        errors.append(index)  # Track rows that caused errors

# Remove corresponding labels from y_stacking for rows with errors
test_df_cleaned = test_df.drop(errors)
X_stacking_test = pd.DataFrame(stacking_features)
y_stacking_test = test_df_cleaned['Interaction'].values  # Binary labels

# Make final predictions with the stacking model
stacking_predictions = stacking_model.predict(X_stacking_test)

# Evaluate the model performance
from sklearn.metrics import classification_report, accuracy_score

print("Classification Report:")
print(classification_report(y_stacking_test, stacking_predictions))
print("Accuracy:", accuracy_score(y_stacking_test, stacking_predictions))


In [11]:
import joblib

# Save the trained stacking model
joblib.dump(stacking_model, "stacking_model.pkl")

print("Models saved successfully!")


Models saved successfully!


In [None]:
# Convert string labels to integers (if needed)
y_stacking = np.where(y_stacking == 'true', 1, 0)
y_stacking_test = np.where(y_stacking_test == 'true', 1, 0)

# Initialize different stacking models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_stacking, y_stacking)
    
    # Predictions on training data
    y_pred = model.predict(X_stacking)
    print(f"{name} Stacking Model Accuracy:", accuracy_score(y_stacking, y_pred))
    print(classification_report(y_stacking, y_pred))
    
    # Make final predictions on test data
    stacking_predictions = model.predict(X_stacking_test)
    
    # Evaluate the model performance
    print("Classification Report:")
    print(classification_report(y_stacking_test, stacking_predictions))
    print("Accuracy:", accuracy_score(y_stacking_test, stacking_predictions))


In [None]:
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_stacking.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("\nTraining Neural Network...")
nn_model.fit(X_stacking, y_stacking, epochs=20, batch_size=16, verbose=1, validation_split=0.1)

# Make predictions
nn_predictions = (nn_model.predict(X_stacking_test) > 0.5).astype(int)

# Evaluate the neural network
print("Neural Network Classification Report:")
print(classification_report(y_stacking_test, nn_predictions))
print("Neural Network Accuracy:", accuracy_score(y_stacking_test, nn_predictions))
