# Prerequisites


In [None]:
import pandas as pd
import torch
import torch_geometric as pyg

# Download and preprocess the data (optional: you can skip this step and directly load the GDA_df.csv file).

In [None]:
# Download and preprocess (optional step; needs an API key for the data source)
import os

from data_acquisition_processing import get_data

# Read the key from the DISGENET_API_KEY environment variable so the secret is
# never typed into (and saved with) the notebook. The original placeholder is
# kept as the fallback, so behaviour is unchanged when the variable is unset.
api_key = os.environ.get("DISGENET_API_KEY", "Disgenet-api_key")
df = get_data(api_key=api_key, disease_type="cancer")

In [None]:
# Read the already-preprocessed table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='GDA_df.csv')

# Initialization and Data Splitting

In [None]:
from graph_preparation import prepare_homogeneous_graph

# Seed Python, NumPy and PyTorch RNGs before any stochastic step. Without this
# the random edge split and the sampled negative edges differ on every run, so
# the reported metrics cannot be reproduced.
SEED = 42
pyg.seed_everything(SEED)

graph = prepare_homogeneous_graph(df)

# Randomly hold out 10% of the edges for validation and 10% for testing.
# - is_undirected=True: the graph is treated as undirected by the transform.
# - add_negative_train_samples / neg_sampling_ratio=1.0: negative edges are
#   sampled for training too, one per positive edge.
# - split_labels=True: positives and negatives are stored in separate
#   pos_edge_label* / neg_edge_label* attributes (see the printed Data objects).
split = pyg.transforms.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=True,
    neg_sampling_ratio=1.0,
    split_labels=True)
train_data, val_data, test_data = split(graph)

print(f'Train data: {train_data}')
print(f'Val data: {val_data}')
print(f'Test data: {test_data}')

Train data: Data(x=[11277, 48], edge_index=[2, 53006], pos_edge_label=[26503], pos_edge_label_index=[2, 26503], neg_edge_label=[26503], neg_edge_label_index=[2, 26503])
Val data: Data(x=[11277, 48], edge_index=[2, 53006], pos_edge_label=[3312], pos_edge_label_index=[2, 3312], neg_edge_label=[3312], neg_edge_label_index=[2, 3312])
Test data: Data(x=[11277, 48], edge_index=[2, 59630], pos_edge_label=[3312], pos_edge_label_index=[2, 3312], neg_edge_label=[3312], neg_edge_label_index=[2, 3312])


In [None]:
# --- Model dimensions -------------------------------------------------------
# Input width is taken from the graph's node features rather than hard-coded.
input_dim = graph.num_node_features
hidden_dim, output_dim = 128, 64
dropout = 0.2

# --- Optimisation settings --------------------------------------------------
# lr is the learning rate and wd the weight decay handed to the optimizer.
lr, wd = 1e-3, 1e-4
num_epochs = 50

In [None]:
# GCN_DP is the model used below; the other imported model classes are
# presumably drop-in alternatives for experimentation (TODO confirm).
from models import GCN_DP, GCN_MLP, GraphSAGE_MLP, GIN_MLP
from trainer import Trainer

# Build the model from the values set in the hyperparameter cell. `input_dim`
# is used instead of re-reading graph.num_node_features so that every model
# dimension comes from one place and the configured value is not ignored.
model = GCN_DP(input_dim, hidden_dim, output_dim, dropout)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, weight_decay=wd)

# The trainer is given the path 'test.pth' as its save location.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    save_path='test.pth'
)

# Train and Evaluate

In [17]:
# Train on the training split and monitor the validation split, for at most
# `num_epochs` epochs with an early-stopping patience of 10.
trainer.fit(
    train_data,
    val_data,
    num_epochs=num_epochs,
    early_stopping_patience=10,
)

Epoch  Train Loss   Pos Loss   Neg Loss   Val F1   Val AUC   Threshold 
  5      1.2038       0.3052     0.8987     0.8067   0.8832    0.61      
Confusion Matrix:
[[2689  623]
 [ 652 2660]]
Epoch  Train Loss   Pos Loss   Neg Loss   Val F1   Val AUC   Threshold 
  10     1.1812       0.3113     0.8699     0.8206   0.8992    0.61      
Confusion Matrix:
[[2772  540]
 [ 632 2680]]
Epoch  Train Loss   Pos Loss   Neg Loss   Val F1   Val AUC   Threshold 
  15     1.1776       0.3026     0.8750     0.8293   0.9084    0.61      
Confusion Matrix:
[[2736  576]
 [ 558 2754]]
Epoch  Train Loss   Pos Loss   Neg Loss   Val F1   Val AUC   Threshold 
  20     1.1476       0.2365     0.9111     0.8602   0.9343    0.63      
Confusion Matrix:
[[2748  564]
 [ 387 2925]]
Epoch  Train Loss   Pos Loss   Neg Loss   Val F1   Val AUC   Threshold 
  25     1.1185       0.2249     0.8935     0.8691   0.9405    0.63      
Confusion Matrix:
[[2895  417]
 [ 446 2866]]
Epoch  Train Loss   Pos Loss   Neg Loss   Val

In [18]:
# Evaluate on the held-out test split. Per the recorded output below, this
# loads a checkpoint from the trainer's save_path via torch.load and prints
# the test F1, AUC, best threshold and confusion matrix.
trainer.test(test_data)

  checkpoint = torch.load(self.save_path)


Test F1: 0.8777
Test AUC: 0.9368
Best Threshold: 0.60
Confusion Matrix:
[[2911  401]
 [ 408 2904]]
