## Import Libraries

In [None]:
import pickle
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from joblib import load
import pandas as pd
import os
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [16]:
device = "mps" if torch.backends.mps.is_available() else "cpu"

## BioBert Model

Load the saved fine-tuned BioBERT model and tokenizer

In [17]:
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_model = AutoModelForSequenceClassification.from_pretrained("./model/saved_biobert_model/")  

logistic_model = load("./model/results/svm_model.pkl")
random_forest_model = load("./model/results/xgb_model.pkl")

biobert_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
def get_biobert_prediction(drug1, drug2, sentence, model, tokenizer, device):
    # Replace drug names with placeholders for consistency
    sentence = sentence.replace(drug1, "[Drug1]").replace(drug2, "[Drug2]")
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1).cpu().numpy()  # Get probability scores
    return probs[0]


## Classic Model Prediction

In [None]:
def get_classic_model_prediction(drug1, drug2, model, vectorizer):
    # Vectorize each drug name
    drug1_vec = vectorizer.transform([drug1]).toarray()
    drug2_vec = vectorizer.transform([drug2]).toarray()
    combined_vector = list(drug1_vec[0]) + list(drug2_vec[0])  # Concatenate vectors for both drugs
    probs = model.predict_proba([combined_vector])[0]  # Get probability prediction
    return probs

## Parse DDI Corpus

In [None]:
def parse_ddi_corpus(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    for sentence in root.iter('sentence'):
        sent_text = sentence.attrib['text']
        entities = sentence.findall('entity')
        pairs = sentence.findall('pair')

        if len(entities) == 1:
            data.append([entities[0].attrib['text'], 'NULL', sent_text, 'False'])
        else:
            for pair in pairs:
                e1 = pair.attrib['e1']
                e2 = pair.attrib['e2']
                interaction = pair.attrib['ddi']

                e1_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e1)
                e2_text = next(entity.attrib['text'] for entity in entities if entity.attrib['id'] == e2)

                data.append([e1_text, e2_text, sent_text, interaction])

    df = pd.DataFrame(data, columns=['Drug1', 'Drug2', 'Sentence', 'Interaction'])
    return df

def parse_all_ddi_files(directory_paths):
    all_data = []

    for directory_path in directory_paths:
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                if file.endswith('.xml'):
                    file_path = os.path.join(root, file)
                    df = parse_ddi_corpus(file_path)
                    all_data.append(df)

    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

# Specify the paths to both directories
directory_paths = [
    '../Dataset/DDICorpus/Train/DrugBank',
    '../Dataset/DDICorpus/Train/MedLine'
]

# Parse and combine data from both directories
df = parse_all_ddi_files(directory_paths)

# Filter out rows where 'Drug1' or 'Drug2' is 'NULL'
df = df[(df['Drug1'] != 'NULL') & (df['Drug2'] != 'NULL')]

# Define the new directory path for the test data
new_directory_paths = [
    '../Dataset/DDICorpus/Test/Test for DDI Extraction task/DrugBank',
    '../Dataset/input/ddicorpus/DDICorpus/Test/Test for DDI Extraction task/MedLine'
]

# Parse and combine data from the new directory
new_test_df = parse_all_ddi_files(new_directory_paths)

# Filter out rows where 'Drug1' or 'Drug2' is 'NULL' (if needed)
test_df = new_test_df[(new_test_df['Drug1'] != 'NULL') & (new_test_df['Drug2'] != 'NULL')]

train_df=df
# Display the first few rows of the new test set
test_df.head()



Unnamed: 0,Drug1,Drug2,Sentence,Interaction
0,Ketoconazole,tolterodine,"CYP3A4 Inhibitors: Ketoconazole, an inhibitor ...",True
1,ketoconazole,azole antifungals,For patients receiving ketoconazole or other p...,False
2,ketoconazole,itraconazole,For patients receiving ketoconazole or other p...,False
3,ketoconazole,miconazole,For patients receiving ketoconazole or other p...,False
4,ketoconazole,macrolide antibiotics,For patients receiving ketoconazole or other p...,False


In [40]:
test_df 

Unnamed: 0,Drug1,Drug2,Sentence,Interaction
0,Ketoconazole,tolterodine,"CYP3A4 Inhibitors: Ketoconazole, an inhibitor ...",true
1,ketoconazole,azole antifungals,For patients receiving ketoconazole or other p...,false
2,ketoconazole,itraconazole,For patients receiving ketoconazole or other p...,false
3,ketoconazole,miconazole,For patients receiving ketoconazole or other p...,false
4,ketoconazole,macrolide antibiotics,For patients receiving ketoconazole or other p...,false
...,...,...,...,...
5479,SKELAXIN,barbiturates,"SKELAXIN may enhance the effects of alcohol, b...",true
5480,SKELAXIN,CNS depressants,"SKELAXIN may enhance the effects of alcohol, b...",true
5481,alcohol,barbiturates,"SKELAXIN may enhance the effects of alcohol, b...",false
5482,alcohol,CNS depressants,"SKELAXIN may enhance the effects of alcohol, b...",false


## Word2Vec Model

In [13]:
from gensim.models import KeyedVectors

model_path = "./model/model.bin"

# Load the model with memory mapping
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

print("Model loaded successfully!")

word_vector = model['drugs']  


Model loaded successfully!


In [18]:
def get_drug_embeddings(drug, model):
    drug_vectors=np.array([])
    if drug in model:

        drug_vectors = model[drug]

    else:
        pass
    return drug_vectors

In [19]:
def get_embeddings(drug1, drug2, expected_dim=200):
    embedding1 = get_drug_embeddings(drug1, model)  # Fetch embedding for drug1
    embedding2 = get_drug_embeddings(drug2, model)  # Fetch embedding for drug2

    # Convert to numpy array if needed
    if isinstance(embedding1, dict):
        embedding1 = np.array(list(embedding1.values()))
    if isinstance(embedding2, dict):
        embedding2 = np.array(list(embedding2.values()))

    # Ensure embeddings are valid arrays
    if embedding1 is None or embedding1.size == 0:
        embedding1 = np.zeros(expected_dim)  # Pad missing embeddings
    if embedding2 is None or embedding2.size == 0:
        embedding2 = np.zeros(expected_dim)

    # Ensure embeddings have the correct shape
    if embedding1.shape[0] != expected_dim:
        embedding1 = np.pad(embedding1, (0, expected_dim - embedding1.shape[0]), 'constant')

    if embedding2.shape[0] != expected_dim:
        embedding2 = np.pad(embedding2, (0, expected_dim - embedding2.shape[0]), 'constant')

    # Concatenate the two embeddings to form a 400-d vector
    combined_embedding = np.concatenate((embedding1, embedding2))

    return combined_embedding


# Prepare stacking features on test data
stacking_features = []
valid_indices = []  # Track indices of rows with successful feature generation

for index, row in train_df.iterrows():
    drug1, drug2 = row['Drug1'], row['Drug2']
    
    try:
        # Get embeddings
        drug_embeddings = get_embeddings(drug1, drug2)

        # Get predictions from each model
        biobert_probs = get_biobert_prediction(drug1, drug2, row['Sentence'], biobert_model, biobert_tokenizer, device)

        # Ensure the embeddings are reshaped correctly for model input
        reshaped_embeddings = drug_embeddings.reshape(1, -1)  # Reshape for single sample

        # Get predictions from logistic and random forest models
        logistic_probs = logistic_model.predict(reshaped_embeddings)
        random_forest_probs = random_forest_model.predict(reshaped_embeddings)

        # Combine predictions as stacking features
        stacking_features.append(list(biobert_probs) + logistic_probs.tolist() + random_forest_probs.tolist())

        # Record the index as valid
        valid_indices.append(index)

    except ValueError as e:
        print(f"Error processing row {index}: {e}")

# Convert stacking features to DataFrame
X_stacking = pd.DataFrame(stacking_features)

# Filter y_stacking by valid indices
y_stacking = train_df.loc[valid_indices, 'Interaction'].reset_index(drop=True)


In [21]:
# Initialize the meta-classifier
stacking_model = LogisticRegression()

# Train the stacking model
stacking_model.fit(X_stacking, y_stacking)

# Predictions on the test data (optional if you already have a test set)
y_pred = stacking_model.predict(X_stacking)
print("Stacking Model Accuracy:", accuracy_score(y_stacking, y_pred))
print(classification_report(y_stacking, y_pred))

Stacking Model Accuracy: 0.9770437535981578
              precision    recall  f1-score   support

       false       0.98      0.99      0.99     23771
        true       0.93      0.91      0.92      4021

    accuracy                           0.98     27792
   macro avg       0.96      0.95      0.95     27792
weighted avg       0.98      0.98      0.98     27792



In [22]:
def get_embeddings(drug1, drug2, expected_dim=200):
    embedding1 = get_drug_embeddings(drug1, model)  # Fetch embedding for drug1
    embedding2 = get_drug_embeddings(drug2, model)  # Fetch embedding for drug2

    # Convert to numpy array if needed
    if isinstance(embedding1, dict):
        embedding1 = np.array(list(embedding1.values()))
    if isinstance(embedding2, dict):
        embedding2 = np.array(list(embedding2.values()))

    # Ensure embeddings are valid arrays
    if embedding1 is None or embedding1.size == 0:
        embedding1 = np.zeros(expected_dim)  # Pad missing embeddings
    if embedding2 is None or embedding2.size == 0:
        embedding2 = np.zeros(expected_dim)

    # Ensure embeddings have the correct shape
    if embedding1.shape[0] != expected_dim:
        embedding1 = np.pad(embedding1, (0, expected_dim - embedding1.shape[0]), 'constant')

    if embedding2.shape[0] != expected_dim:
        embedding2 = np.pad(embedding2, (0, expected_dim - embedding2.shape[0]), 'constant')

    # Concatenate the two embeddings to form a 400-d vector
    combined_embedding = np.concatenate((embedding1, embedding2))

    return combined_embedding
stacking_features = []
errors = []

# Process each test row
for index, row in test_df.iterrows():
    drug1, drug2 = row['Drug1'], row['Drug2']
    
    try:
        # Get embeddings
        drug_embeddings = get_embeddings(drug1, drug2)
        if drug_embeddings.size == 0:
            raise ValueError("Empty embeddings for drug pair.")

        # Generate predictions from individual models
        biobert_probs = get_biobert_prediction(drug1, drug2, row['Sentence'], biobert_model, biobert_tokenizer, device)

        reshaped_embeddings = drug_embeddings.reshape(1, -1)  # Reshape for single sample
        logistic_probs = logistic_model.predict(reshaped_embeddings)
        random_forest_probs = random_forest_model.predict(reshaped_embeddings)

        # Combine all predictions for stacking
        stacking_features.append(list(biobert_probs) + logistic_probs.tolist() + random_forest_probs.tolist())

    except ValueError as e:
        print(f"Error processing row {index}: {e}")
        errors.append(index)  # Track rows that caused errors

# Remove corresponding labels from y_stacking for rows with errors
test_df_cleaned = test_df.drop(errors)
X_stacking_test = pd.DataFrame(stacking_features)
y_stacking_test = test_df_cleaned['Interaction'].values  # Binary labels

# Make final predictions with the stacking model
stacking_predictions = stacking_model.predict(X_stacking_test)

# Evaluate the model performance
from sklearn.metrics import classification_report, accuracy_score

print("Classification Report:")
print(classification_report(y_stacking_test, stacking_predictions))
print("Accuracy:", accuracy_score(y_stacking_test, stacking_predictions))


Classification Report:
              precision    recall  f1-score   support

       false       0.96      0.98      0.97      4381
        true       0.87      0.78      0.82       884

    accuracy                           0.94      5265
   macro avg       0.91      0.88      0.89      5265
weighted avg       0.94      0.94      0.94      5265

Accuracy: 0.9428300094966762


Save the trained stacking model

In [23]:
joblib.dump(stacking_model, "stacking_model.pkl")
print("Models saved successfully!")

Models saved successfully!


In [135]:
# Load the TwoSides_small dataset
two_sides_generated_df = pd.read_csv("../Dataset/twosides_generated_sentences.csv")

# Rename columns to Drug1, Drug2, Sentence
two_sides_generated_df.rename(columns={'drug_1_concept_name': 'Drug1', 'drug_2_concept_name': 'Drug2', 'generated_sentence': 'Sentence'}, inplace=True)

In [136]:
def predict_interaction_from_df(drug1, drug2, df):
    # Case-insensitive match
    match = df[((df['Drug1'].str.lower() == drug1.lower()) & (df['Drug2'].str.lower() == drug2.lower())) |
               ((df['Drug1'].str.lower() == drug2.lower()) & (df['Drug2'].str.lower() == drug1.lower()))]

    if match.empty:
        match = test_df[((test_df['Drug1'].str.lower() == drug1.lower()) & (test_df['Drug2'].str.lower() == drug2.lower())) |
                          ((test_df['Drug1'].str.lower() == drug2.lower()) & (test_df['Drug2'].str.lower() == drug1.lower()))]
        if match.empty:
            match = two_sides_generated_df[((two_sides_generated_df['Drug1'].str.lower() == drug1.lower()) & (two_sides_generated_df['Drug2'].str.lower() == drug2.lower())) |
                            ((two_sides_generated_df['Drug1'].str.lower() == drug2.lower()) & (two_sides_generated_df['Drug2'].str.lower() == drug1.lower()))]
            if match.empty:
                return "No matching sentence found in any dataset."

    # Use the first matching sentence
    sentence = match.iloc[0]['Sentence']

    try:
        drug_embeddings = get_embeddings(drug1, drug2)
        if drug_embeddings.size == 0:
            raise ValueError("Empty embeddings for drug pair.")

        biobert_probs = get_biobert_prediction(drug1, drug2, sentence, biobert_model, biobert_tokenizer, device)
        reshaped_embeddings = drug_embeddings.reshape(1, -1)
        logistic_probs = logistic_model.predict(reshaped_embeddings)
        random_forest_probs = random_forest_model.predict(reshaped_embeddings)

        # Stack features for final prediction
        combined_features = list(biobert_probs) + logistic_probs.tolist() + random_forest_probs.tolist()
        final_input = np.array(combined_features).reshape(1, -1)

        prediction = stacking_model.predict(final_input)
        return f"{'Interaction' if prediction[0] == 'true' else 'No Interaction'}"

    except Exception as e:
        return f"Error during prediction: {e}"

In [137]:
drug_alternatives = pd.read_csv("./drug_alternatives.csv")
drug_alternatives

Unnamed: 0,drug,alternative_1,alternative_2,alternative_3
0,Bivalirudin,Semaglutide,Avexitide,PP-F11N lutetium Lu-177
1,Leuprolide,Buserelin,Deslorelin,Nerofe
2,Goserelin,Nafarelin,Triptorelin,Ganirelix
3,Gramicidin D,Nerofe,Echinomycin,Reltecimod
4,Desmopressin,Lypressin,Selepressin,Ozarelix
...,...,...,...,...
11908,Alogabat,4-(6-CYCLOHEXYLMETHOXY-9H-PURIN-2-YLAMINO)--BE...,Mizolastine,N-cyclopropyl-4-methyl-3-{2-[(2-morpholin-4-yl...
11909,Ropsacitinib,Regadenoson,Vistusertib,Golidocitinib
11910,taletrectinib,RU90395,Cadazolid,Carotegrast methyl
11911,Tolebrutinib,Tirabrutinib,Ibrutinib,Edralbrutinib


In [138]:
test_df
# Find drug pairs in test_df where both drugs are present in drug_alternatives
common_drug_pairs = test_df[
    (test_df['Drug1'].str.lower().isin(drug_alternatives['drug'].str.lower())) &
    (test_df['Drug2'].str.lower().isin(drug_alternatives['drug'].str.lower()))
]

common_drug_pairs.head()

Unnamed: 0,Drug1,Drug2,Sentence,Interaction
0,Ketoconazole,tolterodine,"CYP3A4 Inhibitors: Ketoconazole, an inhibitor ...",True
2,ketoconazole,itraconazole,For patients receiving ketoconazole or other p...,False
3,ketoconazole,miconazole,For patients receiving ketoconazole or other p...,False
5,ketoconazole,erythromycin,For patients receiving ketoconazole or other p...,False
6,ketoconazole,clarithromycin,For patients receiving ketoconazole or other p...,False


In [139]:
## Find drug pairs in two_sides_generated_df where both drugs are present in drug_alternatives
common_drug_pairs_two_sides = two_sides_generated_df[
    (two_sides_generated_df['Drug1'].str.lower().isin(drug_alternatives['drug'].str.lower())) &
    (two_sides_generated_df['Drug2'].str.lower().isin(drug_alternatives['drug'].str.lower()))
]

# In common_drug_pairs for interaction replace 0 with 'false' and 1 with 'true'
common_drug_pairs_two_sides['Interaction'] = common_drug_pairs_two_sides['Interaction'].replace({0: 'false', 1: 'true'})
common_drug_pairs_two_sides.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_drug_pairs_two_sides['Interaction'] = common_drug_pairs_two_sides['Interaction'].replace({0: 'false', 1: 'true'})


Unnamed: 0,Drug1,Drug2,Sentence,Interaction
0,Levofloxacin,distigmine,Levofloxacin does not interact with distigmine...,False
2,irbesartan,Clonazepam,Using irbesartan concurrently with Clonazepam ...,True
4,cortivazol,Oxacillin,Co-administration of cortivazol and Oxacillin ...,False
6,Simvastatin,Loratadine,Simvastatin does not increase plasma concentra...,True
7,Raloxifene,milnacipran,Raloxifene does not enhance the adverse effect...,True


In [140]:
interaction_count = common_drug_pairs[common_drug_pairs['Interaction'] == 'true'].shape[0]
print(f"Number of common drug pairs with interaction: {interaction_count}")

interaction_pairs = []
for index, row in common_drug_pairs.iterrows():
    drug1 = row['Drug1']
    drug2 = row['Drug2']
    interaction_pairs.append((drug1, drug2))
    # if row['Interaction'] == 'true':
    #     # Append the drug pair as a tuple
    #     interaction_pairs.append((drug1, drug2))
interaction_pairs

Number of common drug pairs with interaction: 218


[('Ketoconazole', 'tolterodine'),
 ('ketoconazole', 'itraconazole'),
 ('ketoconazole', 'miconazole'),
 ('ketoconazole', 'erythromycin'),
 ('ketoconazole', 'clarithromycin'),
 ('ketoconazole', 'cyclosporine'),
 ('ketoconazole', 'vinblastine'),
 ('itraconazole', 'miconazole'),
 ('itraconazole', 'erythromycin'),
 ('itraconazole', 'clarithromycin'),
 ('itraconazole', 'cyclosporine'),
 ('itraconazole', 'vinblastine'),
 ('miconazole', 'erythromycin'),
 ('miconazole', 'clarithromycin'),
 ('miconazole', 'cyclosporine'),
 ('miconazole', 'vinblastine'),
 ('erythromycin', 'clarithromycin'),
 ('erythromycin', 'cyclosporine'),
 ('erythromycin', 'vinblastine'),
 ('clarithromycin', 'cyclosporine'),
 ('clarithromycin', 'vinblastine'),
 ('cyclosporine', 'vinblastine'),
 ('Paliperidone', 'levodopa'),
 ('itraconazole', 'miconazole'),
 ('itraconazole', 'erythromycin'),
 ('itraconazole', 'clarithromycin'),
 ('itraconazole', 'oxybutynin'),
 ('miconazole', 'erythromycin'),
 ('miconazole', 'clarithromycin'),


In [141]:
def find_safe_alternative_pair(drug1, drug2, test_df, drug_alternatives):
    """Return alternative pair or note if no valid alternative found."""
    result = predict_interaction_from_df(drug1, drug2, test_df)

    if result != "Interaction":
        return {
            "original_drug1": drug1,
            "original_drug2": drug2,
            "replacement_drug1": drug1,
            "replacement_drug2": drug2,
            "status": "No Interaction"
        }

    # Get alternatives
    alternatives1 = drug_alternatives[drug_alternatives['drug'].str.lower() == drug1.lower()]
    alternatives2 = drug_alternatives[drug_alternatives['drug'].str.lower() == drug2.lower()]

    alt_drugs1 = alternatives1[['alternative_1', 'alternative_2', 'alternative_3']].values.flatten() if not alternatives1.empty else []
    alt_drugs2 = alternatives2[['alternative_1', 'alternative_2', 'alternative_3']].values.flatten() if not alternatives2.empty else []

    # Try replacing drug1 only
    for alt1 in alt_drugs1:
        alt_result = predict_interaction_from_df(alt1, drug2, test_df)
        if "No Interaction" in alt_result:
            return {
                "original_drug1": drug1,
                "original_drug2": drug2,
                "replacement_drug1": alt1,
                "replacement_drug2": drug2,
                "status": "Replaced drug1"
            }

    # Try replacing drug2 only
    for alt2 in alt_drugs2:
        alt_result = predict_interaction_from_df(drug1, alt2, test_df)
        if "No Interaction" in alt_result:
            return {
                "original_drug1": drug1,
                "original_drug2": drug2,
                "replacement_drug1": drug1,
                "replacement_drug2": alt2,
                "status": "Replaced drug2"
            }

    # Try replacing both
    for alt1 in alt_drugs1:
        for alt2 in alt_drugs2:
            alt_result = predict_interaction_from_df(alt1, alt2, test_df)
            if "No Interaction" in alt_result:
                return {
                    "original_drug1": drug1,
                    "original_drug2": drug2,
                    "replacement_drug1": alt1,
                    "replacement_drug2": alt2,
                    "status": "Replaced both"
                }

    return {
        "original_drug1": drug1,
        "original_drug2": drug2,
        "replacement_drug1": None,
        "replacement_drug2": None,
        "status": "No valid alternative"
    }

In [142]:
def process_all_interactions(interaction_pairs, test_df, drug_alternatives):
    results = []

    for drug1, drug2 in interaction_pairs:
        result = find_safe_alternative_pair(drug1, drug2, test_df, drug_alternatives)
        results.append(result)

    return pd.DataFrame(results)

In [143]:
## Size of interaction_pairs 
len(interaction_pairs)

1969

In [144]:
results_df = process_all_interactions(interaction_pairs, test_df, drug_alternatives)
results_df

Unnamed: 0,original_drug1,original_drug2,replacement_drug1,replacement_drug2,status
0,Ketoconazole,tolterodine,,,No valid alternative
1,ketoconazole,itraconazole,ketoconazole,itraconazole,No Interaction
2,ketoconazole,miconazole,ketoconazole,miconazole,No Interaction
3,ketoconazole,erythromycin,ketoconazole,erythromycin,No Interaction
4,ketoconazole,clarithromycin,ketoconazole,clarithromycin,No Interaction
...,...,...,...,...,...
1964,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction
1965,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction
1966,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction
1967,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction


In [145]:
## Check the number of statuses
status_counts = results_df['status'].value_counts()
status_counts

status
No Interaction          1786
No valid alternative      79
Replaced drug1            47
Replaced drug2            47
Replaced both             10
Name: count, dtype: int64

## Evaluating Replacements

In [146]:
def evaluate_replacements(results_df, test_df):
    # Filter out rows with no alternative
    valid_df = results_df[results_df['status'] != "No valid alternative"].copy()

    # Add predicted label column
    predictions = []
    true_labels = []

    for _, row in valid_df.iterrows():
        d1, d2 = row['replacement_drug1'], row['replacement_drug2']
        
        # # Get model prediction
        predictions.append("No Interaction")

        # Get ground truth label from test_df
        match = test_df[
            ((test_df['Drug1'].str.lower() == d1.lower()) & (test_df['Drug2'].str.lower() == d2.lower())) |
            ((test_df['Drug1'].str.lower() == d2.lower()) & (test_df['Drug2'].str.lower() == d1.lower()))
        ]

        if not match.empty:
            true_label = match.iloc[0]['Interaction']
            true_label = "Interaction" if true_label == "true" else "No Interaction"

        else:
            true_label = "No Interaction"

        true_labels.append(true_label)

    valid_df["predicted_label"] = predictions
    valid_df["true_label"] = true_labels
    valid_df["correct"] = valid_df["predicted_label"] == valid_df["true_label"]

    # Compute accuracy
    accuracy = accuracy_score(valid_df["true_label"], valid_df["predicted_label"])
    print(f"Evaluation Accuracy (on valid replacements): {accuracy:.4f}")

    return valid_df

evaluated_df = evaluate_replacements(results_df, test_df)
evaluated_df[["original_drug1", "original_drug2", "replacement_drug1", "replacement_drug2", "status", "true_label", "predicted_label", "correct"]]

Evaluation Accuracy (on valid replacements): 0.9603


Unnamed: 0,original_drug1,original_drug2,replacement_drug1,replacement_drug2,status,true_label,predicted_label,correct
1,ketoconazole,itraconazole,ketoconazole,itraconazole,No Interaction,No Interaction,No Interaction,True
2,ketoconazole,miconazole,ketoconazole,miconazole,No Interaction,No Interaction,No Interaction,True
3,ketoconazole,erythromycin,ketoconazole,erythromycin,No Interaction,No Interaction,No Interaction,True
4,ketoconazole,clarithromycin,ketoconazole,clarithromycin,No Interaction,No Interaction,No Interaction,True
5,ketoconazole,cyclosporine,ketoconazole,cyclosporine,No Interaction,No Interaction,No Interaction,True
...,...,...,...,...,...,...,...,...
1964,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction,No Interaction,No Interaction,True
1965,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction,No Interaction,No Interaction,True
1966,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction,No Interaction,No Interaction,True
1967,ezetimibe,ezetimibe,ezetimibe,ezetimibe,No Interaction,No Interaction,No Interaction,True


## **Conclusion**

In [12]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

# Convert string labels to integers (if needed)
y_stacking = np.where(y_stacking == 'true', 1, 0)
y_stacking_test = np.where(y_stacking_test == 'true', 1, 0)

# Initialize different stacking models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_stacking, y_stacking)
    
    # Predictions on training data
    y_pred = model.predict(X_stacking)
    print(f"{name} Stacking Model Accuracy:", accuracy_score(y_stacking, y_pred))
    print(classification_report(y_stacking, y_pred))
    
    # Make final predictions on test data
    stacking_predictions = model.predict(X_stacking_test)
    
    # Evaluate the model performance
    print("Classification Report:")
    print(classification_report(y_stacking_test, stacking_predictions))
    print("Accuracy:", accuracy_score(y_stacking_test, stacking_predictions))



Training Logistic Regression...
Logistic Regression Stacking Model Accuracy: 0.9770437535981578
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     23771
           1       0.93      0.91      0.92      4021

    accuracy                           0.98     27792
   macro avg       0.96      0.95      0.95     27792
weighted avg       0.98      0.98      0.98     27792

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4737
           1       0.86      0.75      0.80       979

    accuracy                           0.94      5716
   macro avg       0.90      0.86      0.88      5716
weighted avg       0.93      0.94      0.94      5716

Accuracy: 0.9368439468159552

Training Random Forest...
Random Forest Stacking Model Accuracy: 0.9838442717328728
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     23771
   

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_stacking.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("\nTraining Neural Network...")
nn_model.fit(X_stacking, y_stacking, epochs=20, batch_size=16, verbose=1, validation_split=0.1)

# Make predictions
nn_predictions = (nn_model.predict(X_stacking_test) > 0.5).astype(int)

# Evaluate the neural network
print("Neural Network Classification Report:")
print(classification_report(y_stacking_test, nn_predictions))
print("Neural Network Accuracy:", accuracy_score(y_stacking_test, nn_predictions))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training Neural Network...
Epoch 1/20


I0000 00:00:1740425367.561418      72 service.cc:145] XLA service 0x7e8f7400b7d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1740425367.561462      72 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1740425367.561468      72 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m 116/1564[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 1ms/step - accuracy: 0.8039 - loss: 0.4060

I0000 00:00:1740425370.681268      72 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9508 - loss: 0.1180 - val_accuracy: 0.9781 - val_loss: 0.0549
Epoch 2/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9766 - loss: 0.0575 - val_accuracy: 0.9784 - val_loss: 0.0538
Epoch 3/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9758 - loss: 0.0549 - val_accuracy: 0.9784 - val_loss: 0.0499
Epoch 4/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9770 - loss: 0.0500 - val_accuracy: 0.9781 - val_loss: 0.0497
Epoch 5/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9759 - loss: 0.0547 - val_accuracy: 0.9784 - val_loss: 0.0508
Epoch 6/20
[1m1564/1564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9756 - loss: 0.0536 - val_accuracy: 0.9781 - val_loss: 0.0498
Epoch 7/20
[1m1564/1564[0