In [23]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from node2vec import Node2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import graph_prep

## Load Data, Generate Graph & Embeddings

In [24]:
# Available preprocessed datasets; the first entry is the one loaded below.
data = ['preprocessed_34_10.tsv', 'preprocessed_42_10.tsv']

# Tab-separated file whose first column is used as the DataFrame index.
file_path = f"../data/{data[0]}"
df = pd.read_csv(file_path, index_col=0, sep='\t')

In [25]:
# Build the graph from the loaded DataFrame via the project helper
# (presumably a drug-gene bipartite networkx graph, going by the helper's
# name — TODO confirm against graph_prep).
G = graph_prep.nx_drug_gene_bipartite(df)
# The node2vec / deepwalk embeddings are computed in their own cells below,
# so the calls are intentionally not repeated here.

Node2Vec

In [26]:
# Compute embeddings for graph G with the project's node2vec helper; the result
# is later passed to graph_prep.edge_features to build edge feature vectors.
node2vec_embeddings = graph_prep.node2vec_embedding(G)

Computing transition probabilities:   0%|          | 0/1637 [00:00<?, ?it/s]

Generating walks (CPU: 4): 100%|██████████| 75/75 [00:14<00:00,  5.29it/s]




DeepWalk

In [27]:
# Compute embeddings for graph G with the project's deepwalk helper.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# `deepwalk_embeddings` may not have been assigned in that session — re-run
# this cell to completion before relying on it.
deepwalk_embeddings = graph_prep.deepwalk_embedding(G)

Computing transition probabilities:   0%|          | 0/1637 [00:00<?, ?it/s]


Generating walks (CPU: 4): 100%|██████████| 75/75 [00:13<00:00,  5.45it/s]





KeyboardInterrupt: 

## Node2Vec

In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import xgboost as xgb
import torch
import torch.nn as nn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc
from scipy.stats import loguniform, randint
import matplotlib.pyplot as plt

In [None]:
# Directory for saving results (pickled models, metrics CSVs and plots are
# written here by the cells below).
save_dir = '../res/node2vec'
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Result stores, each keyed by '<model>_<sampling>':
#   metrics  -> dict of scalar scores
#   models   -> fitted estimator
#   roc_data -> output of roc_curve
#   prc_data -> output of precision_recall_curve
metrics, models, roc_data, prc_data = {}, {}, {}, {}

In [None]:
# Search spaces sampled by the randomized hyperparameter search.

# Logistic Regression: inverse regularisation strength.
lr_param_dist = dict(C=loguniform(0.001, 1000))

# XGBoost: learning rate, ensemble size, tree shape and sampling fractions.
xgb_param_dist = dict(
    learning_rate=loguniform(0.01, 0.2),
    n_estimators=randint(50, 1000),
    max_depth=randint(3, 10),
    min_child_weight=randint(1, 10),
    gamma=loguniform(0.001, 1),
    subsample=loguniform(0.5, 1),
    colsample_bytree=loguniform(0.5, 1),
)

# SVM: penalty parameter and kernel coefficient.
svm_param_dist = dict(
    C=loguniform(0.001, 1000),
    gamma=loguniform(0.001, 1),
)

In [None]:
# Softmax Layer Neural Network
class SoftmaxNN(nn.Module):
    """Single linear layer with two outputs followed by a softmax.

    The forward pass returns class probabilities (each row sums to 1),
    not raw logits.
    """

    def __init__(self, input_size):
        super().__init__()
        # Maps an `input_size`-dimensional feature vector to two class scores.
        self.fc = nn.Linear(in_features=input_size, out_features=2)

    def forward(self, x):
        """Return softmax probabilities over dimension 1 of the layer output."""
        scores = self.fc(x)
        return torch.softmax(scores, dim=1)

In [None]:
# Function for hyperparameter tuning
def tune_hyperparameters(clf, param_dist, X_train, y_train, n_iter=100, cv=5,
                         scoring='roc_auc', n_jobs=-1, random_state=None):
    """Tune `clf` with a randomized search and return the best estimator.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn compatible estimator.
    param_dist : dict
        Parameter names mapped to distributions (or lists) to sample from.
    X_train, y_train : array-like
        Training features and labels used for the cross-validated search.
    n_iter : int, default 100
        Number of parameter settings sampled.
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str, default 'roc_auc'
        Metric used to rank the sampled settings.
    n_jobs : int, default -1
        Parallel jobs for the search (-1 uses all cores).
    random_state : int or None, default None
        Seed for the parameter sampling; pass an int to make the search
        reproducible. None keeps the original non-deterministic behaviour.

    Returns
    -------
    estimator
        `best_estimator_` of the fitted search.
    """
    random_search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_

In [None]:
# Function for training and evaluating the model
def train_evaluate_model(clf, X_train, X_test, y_train, y_test):
    """Fit `clf` on the training split and score it on the test split.

    Parameters
    ----------
    clf : estimator
        Classifier exposing `fit`, `predict` and `predict_proba`.
    X_train, X_test : array-like
        Feature matrices for training and evaluation.
    y_train, y_test : array-like
        Binary labels for training and evaluation.

    Returns
    -------
    metrics : dict
        'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC' and 'AUC-PRC'.
    roc_data : tuple
        Output of `roc_curve(y_test, y_probs)`.
    prc_data : tuple
        Output of `precision_recall_curve(y_test, y_probs)`.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is used as the score for the curves.
    y_probs = clf.predict_proba(X_test)[:, 1]

    roc_data = roc_curve(y_test, y_probs)
    prc_data = precision_recall_curve(y_test, y_probs)
    precision, recall, _ = prc_data

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        # plot_metrics reads both AUC keys, so they must be present here.
        'AUC-ROC': roc_auc_score(y_test, y_probs),
        # auc(x, y) needs a monotonic x: recall is monotonic, precision is not.
        'AUC-PRC': auc(recall, precision),
    }

    print("Metrics: \n")
    print(metrics)
    # Print the scalar areas rather than the raw curve arrays.
    print("AUC-ROC: ")
    print(metrics['AUC-ROC'])
    print("AUC-PRC: ")
    print(metrics['AUC-PRC'])

    return metrics, roc_data, prc_data


# Function for training and evaluating the neural network
def train_evaluate_nn(model, X_train, X_test, y_train, y_test, epochs=100, learning_rate=0.001):
    """Train `model` with full-batch Adam and score it on the test split.

    `model` is expected to return class probabilities (as SoftmaxNN does, via
    its softmax in `forward`). The loss is therefore the negative
    log-likelihood of the log-probabilities. `nn.CrossEntropyLoss` would apply
    a second softmax on top of the model's own and flatten the gradients.

    Parameters
    ----------
    model : nn.Module
        Network mapping a float32 feature batch to per-class probabilities.
    X_train, X_test : np.ndarray
        Feature matrices; cast to float32.
    y_train, y_test : np.ndarray
        Integer class labels; `y_train` is cast to int64 for the loss.
    epochs : int, default 100
        Number of full-batch optimisation steps.
    learning_rate : float, default 0.001
        Adam learning rate.

    Returns
    -------
    dict
        'Accuracy', 'Precision', 'Recall' and 'F1-Score' on the test split.
    """
    # Convert data to PyTorch tensors
    X_train_torch = torch.tensor(X_train.astype(np.float32))
    X_test_torch = torch.tensor(X_test.astype(np.float32))
    y_train_torch = torch.tensor(y_train.astype(np.int64))

    # NLLLoss consumes log-probabilities; the model already applied softmax.
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (one full-batch step per epoch)
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        probabilities = model(X_train_torch)
        # Clamp away exact zeros so log() stays finite.
        log_probabilities = torch.log(probabilities.clamp_min(1e-12))
        loss = criterion(log_probabilities, y_train_torch)
        loss.backward()
        optimizer.step()

    # Evaluate the model: predicted class is the most probable one
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_torch).argmax(dim=1).numpy()

    # Calculate metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    }

    return metrics

In [None]:
# Function for plotting metrics
def plot_metrics(roc_data, prc_data, metrics, title, save_dir):
    """Plot ROC and precision-recall curves side by side, save and show them.

    Parameters
    ----------
    roc_data : tuple
        (fpr, tpr, thresholds) as returned by `roc_curve`.
    prc_data : tuple
        (precision, recall, thresholds) as returned by `precision_recall_curve`.
    metrics : dict
        Scores for the model. 'AUC-ROC' / 'AUC-PRC' are used for the legends
        when present; otherwise the areas are computed from the curves.
    title : str
        Used in both subplot titles and as the PNG file name.
    save_dir : str
        Directory the figure is written to as '<title>.png'.
    """
    fpr, tpr, _ = roc_data
    precision, recall, _ = prc_data

    # Fall back to the curve areas so a metrics dict without the AUC keys
    # does not raise KeyError.
    roc_auc = metrics["AUC-ROC"] if "AUC-ROC" in metrics else auc(fpr, tpr)
    prc_auc = metrics["AUC-PRC"] if "AUC-PRC" in metrics else auc(recall, precision)

    fig, axs = plt.subplots(1, 2, figsize=(12, 6))

    # ROC Curve
    axs[0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc:.2f}')
    axs[0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    axs[0].set_title(f'{title} - ROC Curve')
    axs[0].set_xlabel('False Positive Rate')
    axs[0].set_ylabel('True Positive Rate')
    axs[0].legend(loc='lower right')

    # Precision-Recall Curve
    axs[1].plot(recall, precision, color='darkorange', lw=2, label=f'PRC AUC = {prc_auc:.2f}')
    axs[1].set_title(f'{title} - Precision-Recall Curve')
    axs[1].set_xlabel('Recall')
    axs[1].set_ylabel('Precision')
    axs[1].legend(loc='lower left')

    plt.tight_layout()
    plt.savefig(save_dir + '/' + title + '.png')
    plt.show()
    # The function is called once per model; release the figure each time.
    plt.close(fig)

In [None]:
# Evaluate each train/test sampling strategy on Node2Vec edge features
for sampling in ('negative', 'stratified', 'down'):
    # Edge lists and labels for this sampling strategy
    edges_train, edges_test, y_train, y_test = graph_prep.edge_train_test_split(df, method=sampling)

    # One feature vector per edge, built from the Node2Vec embeddings
    X_train = np.array([graph_prep.edge_features(e, node2vec_embeddings) for e in edges_train])
    X_test = np.array([graph_prep.edge_features(e, node2vec_embeddings) for e in edges_test])
    y_train, y_test = np.array(y_train), np.array(y_test)

    print(f"############# Node2Vec + LR + {sampling} #############")
    # Logistic Regression: tune, evaluate, then file every result under one key
    lr_key = f'logistic_regression_{sampling}'
    lr_clf = tune_hyperparameters(LogisticRegression(max_iter=100000), lr_param_dist, X_train, y_train)
    lr_metrics, lr_roc_data, lr_prc_data = train_evaluate_model(lr_clf, X_train, X_test, y_train, y_test)
    models[lr_key] = lr_clf
    metrics[lr_key] = lr_metrics
    roc_data[lr_key] = lr_roc_data
    prc_data[lr_key] = lr_prc_data

    # --- Disabled experiments (kept for reference; currently not executed) ---
    # print("############# Node2Vec + XGB + " + sampling + " #############\n")
    # # Train and evaluate XGBoost
    # xgb_clf = tune_hyperparameters(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_param_dist, X_train, y_train)
    # xgb_metrics, xgb_roc_data, xgb_prc_data = train_evaluate_model(xgb_clf, X_train, X_test, y_train, y_test)
    # models['xgboost_' + sampling] = xgb_clf
    # metrics['xgboost_' + sampling] = xgb_metrics
    # roc_data['xgboost_' + sampling] = xgb_roc_data
    # prc_data['xgboost_' + sampling] = xgb_prc_data
    #
    # print("############# Node2Vec + SVM + " + sampling + " #############\n")
    # # Train and evaluate SVM
    # svm_clf = tune_hyperparameters(SVC(probability=True), svm_param_dist, X_train, y_train)
    # svm_metrics, svm_roc_data, svm_prc_data = train_evaluate_model(svm_clf, X_train, X_test, y_train, y_test)
    # models['svm_' + sampling] = svm_clf
    # metrics['svm_' + sampling] = svm_metrics
    # roc_data['svm_' + sampling] = svm_roc_data
    # prc_data['svm_' + sampling] = svm_prc_data
    #
    # print("############# Node2Vec + NN + " + sampling + " #############\n")
    # # Train and evaluate Softmax Neural Network
    # nn_model = SoftmaxNN(X_train.shape[1])
    # nn_metrics = train_evaluate_nn(nn_model, X_train, X_test, y_train, y_test)
    # models['softmax_nn_' + sampling] = nn_model
    # metrics['softmax_nn_' + sampling] = nn_metrics

# Save models and metrics: one pickle and one single-row CSV per model key
for model_key in models:
    # The context manager closes (and flushes) the pickle file even if dump raises;
    # the bare open() used before left the handle open.
    with open(f'{save_dir}/{model_key}_model.pkl', 'wb') as model_file:
        pickle.dump(models[model_key], model_file)
    pd.DataFrame([metrics[model_key]]).to_csv(f'{save_dir}/{model_key}_metrics.csv', index=False)

# Draw and save the ROC / precision-recall figure for every model with curve data
for model_key, model_roc in roc_data.items():
    plot_metrics(model_roc, prc_data[model_key], metrics[model_key], model_key, save_dir)