In [None]:
# !pip install torch_geometric

In [None]:
#### PART 2 and 3 of assignment:

### both notebook are built with colab cause I didn't have GPU on my local machine
#### Since the problem, seem bipartite to me, I took
#### this approach instead of treating it as heterogenous graphs
#### also specific in the problem set
#### the interest was to find score/probability of drug-disease interacting
####
#### I have added in function to get recomendation of drugs when given a disease
#### I have trained model as a classifier with binary entropy loss as a metric with L2 regularization loss
#### Since I read a paper on LightGCN hadn't got an opportunity to implemented,
#### in bipartite graphs(*https://arxiv.org/abs/2002.02126) I choose this one,
#### Though it would have been better if the y variable was a score instead of 1 and 0 classes


#### I have only used one archtecture. With more time we can implement graphsage and gcn too
#### where we can train different models on different
#### seeds(It would take more training time, depending on data size, number of models and epochs etc. )
#### the end result would be taken as average or probability from all models or voting from different models for particular interaction


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
import os
import pickle


In [2]:
class LightGCN(nn.Module):
    def __init__(self, disease_embeddings, drug_embeddings, num_layers):
        super(LightGCN, self).__init__()

        # Convert numpy arrays to torch tensors if they aren't already
        if not isinstance(disease_embeddings, torch.Tensor):
            disease_embeddings = torch.FloatTensor(disease_embeddings)
        if not isinstance(drug_embeddings, torch.Tensor):
            drug_embeddings = torch.FloatTensor(drug_embeddings)

        # Create embedding layers and register as parameters
        self.num_diseases = disease_embeddings.shape[0]
        self.num_drugs = drug_embeddings.shape[0]
        self.embedding_dim = disease_embeddings.shape[1]

        # Register embeddings as parameters
        self.disease_embedding = nn.Parameter(disease_embeddings)
        self.drug_embedding = nn.Parameter(drug_embeddings)
        self.num_layers = num_layers

    def forward(self, edge_index):
        # Get embeddings
        diseases_emb = self.disease_embedding
        drugs_emb = self.drug_embedding
        all_emb = torch.cat([diseases_emb, drugs_emb])

        # Storage for embeddings at each layer
        embs = [all_emb]

        # Compute adjacency matrix
        adj = torch.zeros((all_emb.shape[0], all_emb.shape[0]), device=edge_index.device)
        adj[edge_index[0], edge_index[1]] = 1
        adj[edge_index[1], edge_index[0]] = 1

        # Compute degree matrix
        degree = adj.sum(dim=1)
        degree_sqrt = torch.sqrt(degree + 1e-12)
        degree_matrix_inv_sqrt = torch.diag(1.0 / degree_sqrt)

        # Normalize adjacency matrix
        norm_adj = degree_matrix_inv_sqrt @ adj @ degree_matrix_inv_sqrt

        # Message passing layers
        for _ in range(self.num_layers):
            all_emb = norm_adj @ all_emb
            embs.append(all_emb)

        # Final embeddings are mean of all layers
        final_embs = torch.stack(embs, dim=0).mean(dim=0)

        diseases_emb_final, drugs_emb_final = torch.split(final_embs, [self.num_diseases, self.num_drugs])
        return diseases_emb_final, drugs_emb_final

    def predict(self, disease_indices, drug_indices, edge_index):
        diseases_emb_final, drugs_emb_final = self.forward(edge_index)
        disease_emb = diseases_emb_final[disease_indices]
        drug_emb = drugs_emb_final[drug_indices]

        # Compute prediction scores
        predictions = (disease_emb * drug_emb).sum(dim=1)
        return torch.sigmoid(predictions)

def prepare_data(ground_truth, node_embeddings, test_size=0.2, random_state=42):
    """
    Prepare train and test datasets using the ground truth data and initial embeddings
    """
    # Create mapping dictionaries for diseases and drugs
    disease_to_idx = {disease: idx for idx, disease in enumerate(ground_truth['target'].unique())}
    drug_to_idx = {drug: idx for idx, drug in enumerate(ground_truth['source'].unique())}

    # Extract embeddings from node_embeddings DataFrame
    drug_embeddings = []
    disease_embeddings = []

    # Get the embedding columns
    embedding_cols = [col for col in node_embeddings.columns if col.startswith('embedding_')]

    # Create embeddings matrices
    for drug in drug_to_idx:
        drug_embedding = node_embeddings[node_embeddings['id'] == drug][embedding_cols].values[0]
        drug_embeddings.append(drug_embedding)

    for disease in disease_to_idx:
        disease_embedding = node_embeddings[node_embeddings['id'] == disease][embedding_cols].values[0]
        disease_embeddings.append(disease_embedding)

    drug_embeddings = np.stack(drug_embeddings)
    disease_embeddings = np.stack(disease_embeddings)

    # Split the data into train and test
    train_df, test_df = train_test_split(ground_truth, test_size=test_size,
                                        random_state=random_state, stratify=ground_truth['y'])

    # Create edge indices for training (using only positive edges)
    train_edge_index = []
    for _, row in train_df[train_df['y'] == 1].iterrows():
        drug_idx = drug_to_idx[row['source']]
        disease_idx = disease_to_idx[row['target']]
        train_edge_index.append([drug_idx, disease_idx + len(drug_to_idx)])
        train_edge_index.append([disease_idx + len(drug_to_idx), drug_idx])

    train_edge_index = torch.tensor(train_edge_index, dtype=torch.long).t()

    # Prepare train pairs and labels (including both positive and negative examples)
    train_pairs = []
    train_labels = []
    for _, row in train_df.iterrows():
        drug_idx = drug_to_idx[row['source']]
        disease_idx = disease_to_idx[row['target']]
        train_pairs.append([disease_idx, drug_idx])
        train_labels.append(row['y'])

    # Prepare test pairs and labels
    test_pairs = []
    test_labels = []
    for _, row in test_df.iterrows():
        drug_idx = drug_to_idx[row['source']]
        disease_idx = disease_to_idx[row['target']]
        test_pairs.append([disease_idx, drug_idx])
        test_labels.append(row['y'])

    return {
        'train_edge_index': train_edge_index,
        'train_pairs': torch.tensor(train_pairs, dtype=torch.long),
        'train_labels': torch.tensor(train_labels, dtype=torch.float),
        'test_pairs': torch.tensor(test_pairs, dtype=torch.long),
        'test_labels': torch.tensor(test_labels, dtype=torch.float),
        'disease_to_idx': disease_to_idx,
        'drug_to_idx': drug_to_idx,
        'disease_embeddings': disease_embeddings,
        'drug_embeddings': drug_embeddings
    }


In [None]:
######

#### for Preprocessing Data
#### Since the knowledge embedding with pykeen would have taken more time
#### I use topology embedding from the bedding file
#### edges used were from grouth truth file and only nodes present in ground truth file were used
#### after that pre-processing I have saved it in recomendation_pipeline_initial_node_embedding
#### After that I use prepare data function to make it in format to use in pyG pipeline
#####

In [3]:
# Read your data
ground_truth = pd.read_csv('/content/Ground Truth.csv')
node_embeddings = pd.read_csv('/content/recomendation_pipeline_initial_node_embeddings.csv')

# Prepare data
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = prepare_data(ground_truth, node_embeddings, test_size=0.2) ###  80/20 split

FileNotFoundError: [Errno 2] No such file or directory: '/content/Ground Truth.csv'

In [7]:
### how does prepared data look
print(data.keys())
### checking dimensions

print(data['disease_embeddings'].shape)
print(data['drug_embeddings'].shape)
print(len(data['disease_to_idx']))
print(len(data['drug_to_idx']))

dict_keys(['train_edge_index', 'train_pairs', 'train_labels', 'test_pairs', 'test_labels', 'disease_to_idx', 'drug_to_idx', 'disease_embeddings', 'drug_embeddings'])
(1554, 128)
(2231, 128)
1554
2231


In [10]:
## just a small check to see if the there is a class imbalance, one class is larger than other but difference is not huge

zeros = np.count_nonzero(data['train_labels'] == 0)
ones = np.count_nonzero(data['train_labels'] == 1)

print(f"Number of 0s: {zeros}")
print(f"Number of 1s: {ones}")

Number of 0s: 13449
Number of 1s: 9245


In [11]:
## same with checking test set

zeros = np.count_nonzero(data['test_labels'] == 0)
ones = np.count_nonzero(data['test_labels'] == 1)

print(f"Number of 0s: {zeros}")
print(f"Number of 1s: {ones}")

Number of 0s: 3362
Number of 1s: 2312


In [12]:

def train_epoch(model, train_edge_index, train_pairs, train_labels, optimizer, batch_size, device):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    num_batches = (len(train_pairs) + batch_size - 1) // batch_size

    # Shuffle training data
    indices = torch.randperm(len(train_pairs))
    train_pairs = train_pairs[indices]
    train_labels = train_labels[indices]

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(train_pairs))

        batch_pairs = train_pairs[start_idx:end_idx].to(device)
        batch_labels = train_labels[start_idx:end_idx].to(device)

        optimizer.zero_grad()

        # Get predictions
        predictions = model.predict(
            batch_pairs[:, 0],
            batch_pairs[:, 1],
            train_edge_index.to(device)
        )

        # Binary cross entropy loss
        loss = F.binary_cross_entropy(predictions, batch_labels)

        # Add L2 regularization
        l2_reg = 0
        for param in model.parameters():
            l2_reg += torch.norm(param, p=2)
        loss += 0.0001 * l2_reg

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / num_batches

def evaluate(model, train_edge_index, test_pairs, test_labels, device):
    """Evaluate the model"""
    model.eval()
    with torch.no_grad():
        test_predictions = model.predict(
            test_pairs[:, 0].to(device),
            test_pairs[:, 1].to(device),
            train_edge_index.to(device)
        )

        # Calculate metrics
        auc_roc = roc_auc_score(test_labels.cpu(), test_predictions.cpu())
        precision, recall, _ = precision_recall_curve(test_labels.cpu(), test_predictions.cpu())
        auc_pr = auc(recall, precision)

        return {
            'AUC-ROC': auc_roc,
            'AUC-PR': auc_pr
        }

def train_and_evaluate(model, data, num_epochs=100, batch_size=128, lr=0.001, device='cuda'):
    """Train and evaluate the model"""
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # Train
        train_loss = train_epoch(
            model,
            data['train_edge_index'],
            data['train_pairs'],
            data['train_labels'],
            optimizer,
            batch_size,
            device
        )

        # Evaluate
        if (epoch + 1) % 10 == 0:
            metrics = evaluate(
                model,
                data['train_edge_index'],
                data['test_pairs'],
                data['test_labels'],
                device
            )
            print(f'Epoch {epoch+1}: Loss = {train_loss:.4f}, '
                  f'Test AUC-ROC = {metrics["AUC-ROC"]:.4f}, '
                  f'Test AUC-PR = {metrics["AUC-PR"]:.4f}')



In [None]:

#######

### I was initially thinking of this approach but I think this would work well when y is a interaction score instead of 1/0

#######


# def bpr_loss(disease_emb_final, disease_emb, pos_drug_emb_final, pos_drug_emb,
#              neg_drug_emb_final, neg_drug_emb, lambda_reg=0.001):
#     """
#     Compute BPR loss with regularization
#     """
#     # Regularization loss
#     reg_loss = lambda_reg * (disease_emb.norm(2).pow(2) +
#                             pos_drug_emb.norm(2).pow(2) +
#                             neg_drug_emb.norm(2).pow(2))

#     # BPR loss
#     pos_scores = torch.mul(disease_emb_final, pos_drug_emb_final).sum(dim=-1)
#     neg_scores = torch.mul(disease_emb_final, neg_drug_emb_final).sum(dim=-1)

#     bpr_loss = torch.mean(torch.nn.functional.softplus(-(pos_scores - neg_scores)))

#     return bpr_loss + reg_loss

# def train_epoch_bpr(model, train_edge_index, train_pairs, train_labels, optimizer, batch_size, device):
#     """Train for one epoch using BPR loss with existing negative edges"""
#     model.train()
#     total_loss = 0
#     num_batches = (len(train_pairs) + batch_size - 1) // batch_size

#     # Get positive and negative pairs
#     pos_mask = train_labels == 1
#     neg_mask = train_labels == 0
#     pos_pairs = train_pairs[pos_mask]
#     neg_pairs = train_pairs[neg_mask]

#     # Ensure equal number of positive and negative pairs per batch
#     min_pairs = min(len(pos_pairs), len(neg_pairs))
#     pos_pairs = pos_pairs[:min_pairs]
#     neg_pairs = neg_pairs[:min_pairs]

#     # Shuffle both positive and negative pairs
#     pos_indices = torch.randperm(len(pos_pairs))
#     neg_indices = torch.randperm(len(neg_pairs))
#     pos_pairs = pos_pairs[pos_indices]
#     neg_pairs = neg_pairs[neg_indices]

#     for i in range(0, len(pos_pairs), batch_size):
#         batch_end = min(i + batch_size, len(pos_pairs))

#         # Get batch of positive and negative pairs
#         batch_pos_pairs = pos_pairs[i:batch_end].to(device)
#         batch_neg_pairs = neg_pairs[i:batch_end].to(device)

#         optimizer.zero_grad()

#         # Get embeddings
#         disease_emb_final, drug_emb_final = model(train_edge_index.to(device))
#         disease_emb = model.disease_embedding
#         drug_emb = model.drug_embedding

#         # Get relevant embeddings for positive and negative samples
#         batch_disease_emb_final = disease_emb_final[batch_pos_pairs[:, 0]]
#         batch_disease_emb = disease_emb[batch_pos_pairs[:, 0]]

#         batch_pos_drug_emb_final = drug_emb_final[batch_pos_pairs[:, 1]]
#         batch_pos_drug_emb = drug_emb[batch_pos_pairs[:, 1]]

#         batch_neg_drug_emb_final = drug_emb_final[batch_neg_pairs[:, 1]]
#         batch_neg_drug_emb = drug_emb[batch_neg_pairs[:, 1]]

#         # Calculate BPR loss
#         loss = bpr_loss(
#             batch_disease_emb_final, batch_disease_emb,
#             batch_pos_drug_emb_final, batch_pos_drug_emb,
#             batch_neg_drug_emb_final, batch_neg_drug_emb
#         )

#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     return total_loss / ((len(pos_pairs) + batch_size - 1) // batch_size)

# def calculate_recall_at_k(predictions, ground_truth, k_values):
#     """Calculate Recall@k for multiple k values"""
#     recalls = {}
#     predictions_topk = predictions.topk(max(k_values))[1]

#     for k in k_values:
#         predictions_k = set(predictions_topk[:k].tolist())
#         ground_truth_set = set(ground_truth.tolist())
#         recall = len(predictions_k.intersection(ground_truth_set)) / len(ground_truth_set)
#         recalls[f'recall@{k}'] = recall

#     return recalls

# def evaluate_with_recall(model, train_edge_index, test_pairs, test_labels, device, k_values=[5, 10, 20]):
#     """Evaluate the model using Recall@k metrics"""
#     model.eval()
#     metrics = {}

#     with torch.no_grad():
#         disease_emb_final, drug_emb_final = model(train_edge_index.to(device))

#         # Calculate Recall@k
#         # Group test data by disease
#         test_data = pd.DataFrame({
#             'disease_idx': test_pairs[:, 0].cpu().numpy(),
#             'drug_idx': test_pairs[:, 1].cpu().numpy(),
#             'label': test_labels.cpu().numpy()
#         })

#         recall_values = {f'recall@{k}': 0.0 for k in k_values}
#         num_diseases = 0

#         for disease_idx in test_data['disease_idx'].unique():
#             # Get positive drugs for this disease in test set
#             pos_drugs = test_data[
#                 (test_data['disease_idx'] == disease_idx) &
#                 (test_data['label'] == 1)
#             ]['drug_idx'].values

#             if len(pos_drugs) == 0:
#                 continue

#             # Calculate scores for all drugs
#             disease_emb = disease_emb_final[disease_idx].unsqueeze(0)
#             scores = torch.mm(disease_emb, drug_emb_final.t()).squeeze()

#             # Calculate recall@k
#             recalls = calculate_recall_at_k(scores, torch.tensor(pos_drugs), k_values)

#             for k, recall in recalls.items():
#                 recall_values[k] += recall

#             num_diseases += 1

#         # Average recall values
#         for k in recall_values:
#             recall_values[k] /= num_diseases
#             metrics[k] = recall_values[k]

#     return metrics

# def train_and_evaluate(model, data, num_epochs=100, batch_size=128, lr=0.001, device='cuda'):
#     """Train and evaluate the model using BPR loss and recall metrics"""
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     k_values = [5, 10, 20,100]  # Values for recall@k

#     for epoch in range(num_epochs):
#         # Train
#         train_loss = train_epoch_bpr(
#             model,
#             data['train_edge_index'],
#             data['train_pairs'],
#             data['train_labels'],
#             optimizer,
#             batch_size,
#             device
#         )

#         # Evaluate
#         if (epoch + 1) % 10 == 0:
#             metrics = evaluate_with_recall(
#                 model,
#                 data['train_edge_index'],
#                 data['test_pairs'],
#                 data['test_labels'],
#                 device,
#                 k_values
#             )

#             print(f'Epoch {epoch+1}: Loss = {train_loss:.4f}')
#             for k in k_values:
#                 print(f'Test Recall@{k} = {metrics[f"recall@{k}"]:.4f}')
#             print()

In [13]:

# Initialize model with intital node embeddings(their features)
model = LightGCN(
    disease_embeddings=data['disease_embeddings'],
    drug_embeddings=data['drug_embeddings'],
    num_layers=4
).to(device)



train_and_evaluate(model, data, num_epochs=60, batch_size=100, device=device)

Epoch 10: Loss = 0.4152, Test AUC-ROC = 0.9063, Test AUC-PR = 0.8563
Epoch 20: Loss = 0.2788, Test AUC-ROC = 0.9372, Test AUC-PR = 0.9062
Epoch 30: Loss = 0.2236, Test AUC-ROC = 0.9451, Test AUC-PR = 0.9189
Epoch 40: Loss = 0.1890, Test AUC-ROC = 0.9481, Test AUC-PR = 0.9232
Epoch 50: Loss = 0.1622, Test AUC-ROC = 0.9493, Test AUC-PR = 0.9244
Epoch 60: Loss = 0.1406, Test AUC-ROC = 0.9494, Test AUC-PR = 0.9233


In [8]:
def get_pair_score(model, drug_id, disease_id, data, device):
    """
    Get interaction score for a specific drug-disease pair

    Args:
        model: Trained LightGCN model
        drug_id: Drug identifier (e.g., 'CHEMBL123456')
        disease_id: Disease identifier (e.g., 'MONDO:0007186')
        data: Dictionary containing model data and mappings
        device: torch device

    Returns:
        dict: Dictionary containing the score and relevant information
    """
    model.eval()

    # Check if drug and disease exist in our mappings
    if drug_id not in data['drug_to_idx']:
        raise ValueError(f"Drug ID {drug_id} not found in training data")
    if disease_id not in data['disease_to_idx']:
        raise ValueError(f"Disease ID {disease_id} not found in training data")

    # Get indices
    drug_idx = data['drug_to_idx'][drug_id]
    disease_idx = data['disease_to_idx'][disease_id]

    with torch.no_grad():
        # Get embeddings
        disease_emb_final, drug_emb_final = model(data['train_edge_index'].to(device))

        # Get specific embeddings
        disease_emb = disease_emb_final[disease_idx].unsqueeze(0)
        drug_emb = drug_emb_final[drug_idx].unsqueeze(0)

        # Calculate score
        score = torch.mm(disease_emb, drug_emb.t()).squeeze()
        probability = torch.sigmoid(score)

        return {
            'drug_id': drug_id,
            'disease_id': disease_id,
            'raw_score': score.item(),
            'probability': probability.item(),
            'disease_emb':list(disease_emb),
            'drug_emb':list(drug_emb)
        }

In [9]:
# Get score for a specific pair
score_info = get_pair_score(
    model=model,
    drug_id="CHEMBL.COMPOUND:CHEMBL30",
    disease_id="MONDO:0007186",
    data=data,
    device=device
)

print("score for the pair")
print(f"Raw score: {score_info['raw_score']:.4f}")
print(f"Probability: {score_info['probability']:.4f}")

score for the pair
Raw score: 4.3494
Probability: 0.9873


In [16]:
score_info

{'drug_id': 'CHEMBL.COMPOUND:CHEMBL30',
 'disease_id': 'MONDO:0007186',
 'raw_score': 4.349397659301758,
 'probability': 0.9872501492500305,
 'disease_emb': [tensor([-0.4440, -0.2155,  0.4541, -0.4702,  0.2211,  0.2181, -0.1467,  0.4214,
           0.1603, -0.1287,  0.1681,  0.1274, -0.2727,  0.1877,  0.0130, -0.4701,
          -0.1617,  0.1998, -0.1888, -0.4296, -0.1751,  0.4310,  0.1423, -0.0080,
           0.2045,  0.4471, -0.4535,  0.4224,  0.2488,  0.4404,  0.4259,  0.2253,
          -0.2045,  0.4637,  0.1788, -0.4454, -0.1613, -0.2043, -0.1976,  0.4428,
          -0.0108, -0.0942, -0.3114, -0.1989,  0.1862,  0.1291, -0.4229,  0.4614,
          -0.4687,  0.0755, -0.1765,  0.1390,  0.4331,  0.1520,  0.4555,  0.4051,
          -0.1752,  0.3089, -0.2247,  0.1028, -0.1814, -0.0258,  0.4645,  0.2008,
           0.4602, -0.1847,  0.2151, -0.1212, -0.4654, -0.4600, -0.1984,  0.0292,
           0.1335,  0.1657, -0.4628,  0.1336,  0.0020,  0.1881, -0.1761,  0.3263,
           0.4602,  0.46

In [17]:
#################################################################################################

#### This was related to one of the personal projects, (Bipartite graph problem)
#### Can we get a list of drugs for a specific disease?
####
#### the next part is additional caveat for generating recommendation list
####
#### While in comments, you may see recall@K and bpr as an evaluation metric instead of Roc-auc this was when we would have scorees
#### this was because I thought it be better if we would like to use same pipeline for recommendation system when y is a score
####


#################################################################################################

In [18]:
def generate_recommendations_for_disease(model, disease_id, ground_truth, data, device, top_k=20):
    """
    Generate ranked drug recommendations for a specific disease

    Args:
        model: Trained LightGCN model
        disease_id: Disease identifier (e.g., 'MONDO:0007186')
        ground_truth: Original ground truth DataFrame
        data: Dictionary containing model data and mappings
        device: torch device
        top_k: Number of recommendations to return

    Returns:
        DataFrame with ranked drug recommendations and their scores
    """
    model.eval()

    # Get disease index
    disease_idx = data['disease_to_idx'][disease_id]

    # Get existing positive interactions
    existing_interactions = set(
        ground_truth[
            (ground_truth['target'] == disease_id) &
            (ground_truth['y'] == 1)
        ]['source'].values
    )

    with torch.no_grad():
        # Get embeddings
        disease_emb_final, drug_emb_final = model(data['train_edge_index'].to(device))

        # Get disease embedding
        disease_emb = disease_emb_final[disease_idx].unsqueeze(0)

        # Calculate scores for all drugs
        scores = torch.mm(disease_emb, drug_emb_final.t()).squeeze()
        # scores = torch.sigmoid(scores)  # Convert to probabilities or raw scores

        # Convert to numpy for easier handling
        scores = scores.cpu().numpy()

        # Create recommendations DataFrame
        recommendations = []
        for drug_id, drug_idx in data['drug_to_idx'].items():
            if drug_id not in existing_interactions:  # Exclude existing positive interactions
                recommendations.append({
                    'drug_id': drug_id,
                    'score': scores[drug_idx],
                })

        recommendations_df = pd.DataFrame(recommendations)

        # Sort by score and get top k
        recommendations_df = recommendations_df.sort_values('score', ascending=False).head(top_k)
        recommendations_df = recommendations_df.reset_index(drop=True)

        # Add rank column
        recommendations_df['rank'] = recommendations_df.index + 1

        # Reorder columns
        recommendations_df = recommendations_df[['rank', 'drug_id', 'score']]

    return recommendations_df


In [None]:
###############################################################

### getting recommendation for drugs for a particular disease #

###############################################################

In [19]:
disease_id = 'MONDO:0007186'  # Replace with your disease ID
recommendations = generate_recommendations_for_disease(
    model=model,
    disease_id=disease_id,
    ground_truth=ground_truth,
    data=data,
    device=device,
    top_k=20
)
print(recommendations)


####
#### This is from one model
#### with more models we can can take recommendations as drugs who showed up
#### in top 20 or top n and then rank them on importance(frequency of them appearing in the top 20 or top n list)

####

    rank                        drug_id      score
0      1   CHEMBL.COMPOUND:CHEMBL396778  17.558647
1      2  CHEMBL.COMPOUND:CHEMBL1200823  16.249205
2      3     CHEMBL.COMPOUND:CHEMBL1351  16.147923
3      4  CHEMBL.COMPOUND:CHEMBL2104164  15.868414
4      5  CHEMBL.COMPOUND:CHEMBL1569487  15.491830
5      6  CHEMBL.COMPOUND:CHEMBL3322001  14.949764
6      7   CHEMBL.COMPOUND:CHEMBL279229  14.141775
7      8   CHEMBL.COMPOUND:CHEMBL309821  13.904785
8      9     CHEMBL.COMPOUND:CHEMBL1078  13.006432
9     10   CHEMBL.COMPOUND:CHEMBL589583  12.979447
10    11                    CHEBI:36047  12.814536
11    12   CHEMBL.COMPOUND:CHEMBL152067  12.802544
12    13   CHEMBL.COMPOUND:CHEMBL376488  12.713521
13    14   CHEMBL.COMPOUND:CHEMBL203125  12.694095
14    15   CHEMBL.COMPOUND:CHEMBL116438  12.666769
15    16  CHEMBL.COMPOUND:CHEMBL2110816  12.269520
16    17   CHEMBL.COMPOUND:CHEMBL162036  12.225706
17    18     CHEMBL.COMPOUND:CHEMBL1072  12.128423
18    19                    CHE

In [None]:

###############################################

#### save it and use it later with function ###

################################################

In [None]:


def save_model_pickle(model, data, save_path='lightgcn_model_2.pkl'):
    # Move model to CPU for saving
    model = model.cpu()

    # Create save dictionary
    save_dict = {
        'model_state': model.state_dict(),
        'model_config': {
            'num_diseases': model.num_diseases,
            'num_drugs': model.num_drugs,
            'embedding_dim': model.embedding_dim,
            'num_layers': model.num_layers
        },
        'mappings': {
            'drug_to_idx': data['drug_to_idx'],
            'disease_to_idx': data['disease_to_idx']
        },
        'train_edge_index': data['train_edge_index'].cpu()
    }

    # Save using pickle
    with open(save_path, 'wb') as f:
        pickle.dump(save_dict, f)

    print(f"Model saved to {save_path}")

def load_model_pickle(load_path='lightgcn_model_2.pkl', device='cuda'):
    # Load the pickle file
    with open(load_path, 'rb') as f:
        save_dict = pickle.load(f)

    # Initialize model with saved configuration
    model = LightGCN(
        disease_embeddings=torch.randn(save_dict['model_config']['num_diseases'],
                                     save_dict['model_config']['embedding_dim']),
        drug_embeddings=torch.randn(save_dict['model_config']['num_drugs'],
                                  save_dict['model_config']['embedding_dim']),
        num_layers=save_dict['model_config']['num_layers']
    )

    # Load model state
    model.load_state_dict(save_dict['model_state'])

    # Move model to specified device
    model = model.to(device)

    # Prepare data dictionary
    data = {
        'drug_to_idx': save_dict['mappings']['drug_to_idx'],
        'disease_to_idx': save_dict['mappings']['disease_to_idx'],
        'train_edge_index': save_dict['train_edge_index'].to(device)
    }

    return model, data

In [21]:
# After training your model
save_model_pickle(
    model=model,
    data=data,
    save_path='lightgcn_model_2.pkl'
)

Model saved to lightgcn_model_2.pkl


In [6]:
# Load the model
model, data = load_model_pickle(
    load_path='lightgcn_model_2.pkl',
    device=device
)


In [23]:
# test that it saved and works alright after load

score_info = get_pair_score(
    model=model,
    drug_id="CHEMBL.COMPOUND:CHEMBL30",
    disease_id="MONDO:0007186",
    data=data,
    device=device
)

print("score for the pair")
print(f"Raw score: {score_info['raw_score']:.4f}")
print(f"Probability: {score_info['probability']:.4f}")

score for the pair
Raw score: 4.3494
Probability: 0.9873
