In [None]:
import os
import warnings

import dgl
import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm as tqdm
from dgl.data import DGLDataset
from dgl.dataloading import GraphDataLoader
from optuna import trial
from optuna.samplers import TPESampler
from sklearn.metrics import roc_curve, auc, f1_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from torch.optim import Adam


# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings(action = 'ignore')
# Seed NumPy and PyTorch with the same fixed value so runs are repeatable.
np.random.seed(seed = 12)
torch.manual_seed(seed = 12)

# GPU Setting

In [None]:
# Show the installed DGL version so the environment can be reproduced.
print(dgl.__version__)

In [None]:
!nvidia-smi

Fri Jun 30 08:08:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The original called the misspelled `torch.cuda.is_availabe`, which raises AttributeError.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data Load

In [None]:
# Load the graph dataset from a CSV folder.
# NOTE(review): 'your_folder_route' looks like a placeholder - point it at the real dataset folder.
dataset = dgl.data.CSVDataset('your_folder_route')
# Number of samples in the dataset (shown as the cell output).
len(dataset)

In [None]:
# Peek at the first sample: indexing the dataset yields a (graph, data) pair.
graph0, data0 = dataset[0]
print(graph0)

In [None]:
# Show the data paired with the first graph (presumably its label - confirm against the CSV schema).
print(data0)

In [None]:
# Add a self-loop to every graph, keeping each graph paired with its data.
self_dataset = [(dgl.add_self_loop(g), d) for g, d in dataset]
print(len(self_dataset))

In [None]:
# Re-inspect the first sample, now taken from the self-looped copy of the dataset.
graph0, data0 = self_dataset[0]
print(graph0)

In [None]:
# Show the data paired with the first self-looped graph.
print(data0)

In [None]:
# Node / edge feature widths, read from the first graph; they size the model's input layers.
# The names were misspelled (`in_fetas`, `edge_fetas`) while the model cells read
# `in_feats` / `edge_feats`, so a fresh kernel hit a NameError.
in_feats = graph0.ndata['feat'].shape[1]
edge_feats = graph0.edata['feat'].shape[1]

# Train / Test Split, Batch

In [None]:
# Label of every sample, in dataset order.
labels = np.array([label for _, label in self_dataset])

# Split each class separately so all three partitions keep the class balance.
neg_indices = np.where(labels == 0)[0]
pos_indices = np.where(labels == 1)[0]

train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2
assert np.isclose(train_ratio + val_ratio + test_ratio, 1.0), 'split ratios must sum to 1'

# Fraction of the non-training remainder that goes to validation.
# Derived from the ratios above (previously hard-coded as 0.5, which silently
# ignored any change to val_ratio / test_ratio).
val_share = val_ratio / (val_ratio + test_ratio)


def _split_three_way(indices):
  """Split one class's indices into (train, val, test) parts using the ratios above."""
  train_part, remainder = train_test_split(indices, train_size = train_ratio, random_state = 12)
  val_part, test_part = train_test_split(remainder, train_size = val_share, random_state = 12)
  return train_part, val_part, test_part


train_neg, val_neg, test_neg = _split_three_way(neg_indices)
train_pos, val_pos, test_pos = _split_three_way(pos_indices)

train_indices = np.concatenate([train_neg, train_pos])
val_indices = np.concatenate([val_neg, val_pos])
test_indices = np.concatenate([test_neg, test_pos])

In [None]:
# Materialise each split as a list of (graph, data) samples picked by index.
train_dataset = [self_dataset[i] for i in train_indices]
val_dataset = [self_dataset[i] for i in val_indices]
test_dataset = [self_dataset[i] for i in test_indices]

# Shuffle in place (train, then val, then test) so the classes are interleaved
# instead of sitting in the concatenated negative-then-positive order.
for split in (train_dataset, val_dataset, test_dataset):
  np.random.shuffle(split)


# Modeling

In [None]:
from dgl.nn import EdgeGATConv, AvgPooling

class EdgeGATModel(nn.Module):
  """Graph-level classifier built from two EdgeGATConv layers.

  After each EdgeGATConv layer the output is averaged over the attention heads
  and passed through leaky ReLU. The node embeddings of the second layer are
  mean-pooled per graph and projected to `out_feats` logits by a linear layer.

  Args:
    in_feats: width of the input node features.
    edge_feats: width of the edge features.
    hidden_feats: width of the hidden node embeddings.
    out_feats: number of logits produced per graph. Defaults to 1, the
      single-logit head this notebook trains with BCEWithLogitsLoss.
    num_heads: number of attention heads in each EdgeGATConv layer.
  """

  def __init__(self, in_feats, edge_feats, hidden_feats, out_feats = 1, num_heads = 1):
    super().__init__()
    self.edge_gat1 = EdgeGATConv(in_feats = in_feats,
                                 edge_feats = edge_feats,
                                 out_feats = hidden_feats,
                                 num_heads = num_heads)
    self.edge_gat2 = EdgeGATConv(in_feats = hidden_feats,
                                 edge_feats = edge_feats,
                                 out_feats = hidden_feats,
                                 num_heads = num_heads)
    # `out_feats` used to be a required argument that was then ignored (the
    # head was hard-wired to 1 output); it now sizes the head and defaults to 1.
    self.Linear = nn.Linear(hidden_feats, out_feats)

  def forward(self, graph, node_feats, edge_feats, num_heads = None):
    """Return one row of logits per graph in `graph`.

    Args:
      graph: a DGL graph (or batched graph).
      node_feats: node feature tensor passed to the first layer.
      edge_feats: edge feature tensor passed to both layers.
      num_heads: unused; accepted only so existing call sites keep working.

    Returns:
      The output of the linear head applied to the per-graph mean of the
      final node embeddings.
    """
    # Averaging over dim 1 collapses the attention-head axis
    # (assumes the layer output is (nodes, heads, hidden) - see the DGL docs).
    hidden1 = F.leaky_relu(self.edge_gat1(graph, node_feats, edge_feats).mean(dim = 1))
    hidden2 = F.leaky_relu(self.edge_gat2(graph, hidden1, edge_feats).mean(dim = 1))

    # local_scope keeps the temporary 'h' field from leaking onto the caller's graph.
    with graph.local_scope():
      graph.ndata['h'] = hidden2
      hg = dgl.mean_nodes(graph, 'h')
    return self.Linear(hg)

# Hyper Params (Optuna)

In [None]:
def objective(trial):
  """Optuna objective: train an EdgeGATModel with sampled hyper-parameters.

  Sampled parameters (the stored values are the raw integers, so later cells
  must apply the same transforms):
    hidden_feats  -> 2 ** value
    num_heads     -> value
    learning_rate -> 10 ** value
    num_epochs    -> value (multiples of 10)
    batch_size    -> 2 ** value

  The model is trained for `num_epochs` epochs. After every epoch the mean
  validation loss is reported to the trial so unpromising trials can be pruned.

  Args:
    trial: the `optuna.trial.Trial` supplying the hyper-parameter suggestions.

  Returns:
    Mean validation loss (BCE with logits) after the last epoch.

  Raises:
    ValueError: if `val_dataset` is empty.
    optuna.TrialPruned: if the pruner stops the trial early.
  """
  if len(val_dataset) == 0:
    raise ValueError('val_dataset is empty; cannot compute a validation loss')

  hidden_feats = 2 ** trial.suggest_int('hidden_feats', 3, 5)
  num_heads = trial.suggest_int('num_heads', 2, 5)
  learning_rate = 10 ** trial.suggest_int('learning_rate', -5, -2)
  # `step` is passed by keyword: recent Optuna releases deprecate it as a positional argument.
  num_epochs = trial.suggest_int('num_epochs', 10, 100, step = 10)
  batch_size = 2 ** trial.suggest_int('batch_size', 5, 8)

  train_dataloader = GraphDataLoader(
      train_dataset,
      batch_size = batch_size,
      drop_last = False)

  val_dataloader = GraphDataLoader(
      val_dataset,
      batch_size = batch_size,
      drop_last = False)

  model = EdgeGATModel(in_feats = in_feats,
                       edge_feats = edge_feats,
                       hidden_feats = hidden_feats,
                       out_feats = 1,
                       num_heads = num_heads)

  optimizer = Adam(model.parameters(), lr = learning_rate)
  criterion = nn.BCEWithLogitsLoss()

  average_val_loss = float('inf')
  # Train for the sampled number of epochs. Previously `num_epochs` was sampled
  # but only a single epoch was run, so the parameter had no effect on the score.
  for epoch in range(num_epochs):
    model.train()
    for batched_graph, labels in train_dataloader:
      optimizer.zero_grad()
      pred = model(batched_graph, batched_graph.ndata['feat'], batched_graph.edata['feat'], num_heads)
      # BCEWithLogitsLoss needs identical shapes; align the labels with the logits
      # (assumes one label per graph).
      loss = criterion(pred, labels.float().view_as(pred))
      loss.backward()
      optimizer.step()

    val_loss = 0.0
    num_batches = 0
    model.eval()
    with torch.no_grad():
      for batched_graph, labels in val_dataloader:
        pred = model(batched_graph, batched_graph.ndata['feat'], batched_graph.edata['feat'], num_heads)
        val_loss += criterion(pred, labels.float().view_as(pred)).item()
        num_batches += 1

    average_val_loss = val_loss / num_batches
    # Report once per epoch so the pruner compares trials at matching training stages.
    trial.report(average_val_loss, epoch)
    if trial.should_prune():
      raise optuna.TrialPruned()

  return average_val_loss

In [None]:
# Seeded TPE sampler so the hyper-parameter search is repeatable.
sampler = TPESampler(seed = 12)
# The objective returns a validation loss, hence the 'minimize' direction.
study = optuna.create_study(sampler = sampler,
                            direction = 'minimize',
                            study_name = 'EdgeGAT_OPT')
study.optimize(objective, n_trials = 100)

# Summarise the best trial found by the search.
print('Best Score : ', study.best_value)
print('Best Trial : ', study.best_trial.params)

# Training

In [None]:
# One loader per split, all using the batch size picked by the best trial.
# The trial stores the exponent, so the actual batch size is 2 ** value.
train_dataloader, val_dataloader, test_dataloader = (
    GraphDataLoader(
        split,
        batch_size = 2 ** study.best_trial.params['batch_size'],
        drop_last = False)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Parameters the best trial stores directly.
num_heads = study.best_trial.params['num_heads']
num_epochs = study.best_trial.params['num_epochs']

# Parameters the best trial stores as exponents (base 2 and base 10 respectively).
hidden_feats = pow(2, study.best_trial.params['hidden_feats'])
learning_rate = pow(10, study.best_trial.params['learning_rate'])

In [None]:
# Build the final model with the tuned hyper-parameters.
# `out_feats = 1` is passed explicitly (one logit per graph for BCEWithLogitsLoss);
# the original call omitted this constructor argument.
model = EdgeGATModel(in_feats = in_feats,
                     edge_feats = edge_feats,
                     hidden_feats = hidden_feats,
                     out_feats = 1,
                     num_heads = num_heads)
# `learning_rate` was misspelled as `learning_Rate`, which raised NameError.
optimizer = Adam(model.parameters(), lr = learning_rate)
criterion = nn.BCEWithLogitsLoss()

In [None]:
# `tqdm` is bound to the module (`import tqdm as tqdm`), so the progress-bar class
# is `tqdm.tqdm`; calling the module itself raised TypeError.
progress_bar = tqdm.tqdm(range(num_epochs), desc = 'Model Training')

model.train()
for epoch in progress_bar:
  # Update the label once per epoch rather than after every batch.
  progress_bar.set_description(f'Epoch - {epoch + 1}')
  for batched_graph, labels in train_dataloader:
    optimizer.zero_grad()
    pred = model(batched_graph, batched_graph.ndata['feat'], batched_graph.edata['feat'], num_heads)
    # BCEWithLogitsLoss needs identical shapes; align the labels with the logits
    # (assumes one label per graph).
    loss = criterion(pred, labels.float().view_as(pred))
    loss.backward()
    optimizer.step()

## Validation (Find Optimal Threshold)

In [None]:
from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot as plt

In [None]:
def find_optimal_threshold(model, dataloader):
  """Pick the decision threshold that maximises F1 on `dataloader`.

  Runs the model over every batch, turns the logits into probabilities with a
  sigmoid, computes the precision-recall curve and returns the threshold at
  the highest F1. The curve and the chosen point are also plotted.

  Args:
    model: the trained model; called as
      `model(graph, node_feats, edge_feats, num_heads)` with the notebook-level `num_heads`.
    dataloader: iterable yielding `(batched_graph, labels)` pairs.

  Returns:
    The probability threshold with the highest F1 score.
  """
  model.eval()
  all_pred, all_labels = [], []
  with torch.no_grad():
    for batched_graph, labels in dataloader:
      pred = model(batched_graph, batched_graph.ndata['feat'], batched_graph.edata['feat'], num_heads)
      all_pred.append(pred)
      all_labels.append(labels.float())

  # Flatten to 1-D NumPy arrays so scores and labels line up element-wise.
  scores = torch.sigmoid(torch.cat(all_pred)).flatten().cpu().numpy()
  targets = torch.cat(all_labels).flatten().cpu().numpy()
  precision, recall, threshold = precision_recall_curve(targets, scores)

  # precision / recall have one more entry than `threshold`: the final point has
  # no threshold, so it is excluded from the search (picking it used to raise
  # IndexError on `threshold[ix]`).
  denom = precision[:-1] + recall[:-1]
  # Where precision + recall == 0 define F1 as 0 instead of NaN; np.argmax
  # would otherwise select the NaN entry.
  f1 = np.divide(2 * precision[:-1] * recall[:-1], denom,
                 out = np.zeros_like(denom), where = denom > 0)
  ix = int(np.argmax(f1))
  opt_thr = threshold[ix]
  print(f'Optimal Threshold : {opt_thr:.2f}, F1 Score : {f1[ix]:.2f}')

  plt.plot(recall, precision, marker = ',', label = 'EdgeGAT')
  plt.scatter(recall[ix], precision[ix], marker = 'o', color = 'black', label = 'Optimal')
  plt.title('Precision - Recall Curve')
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.legend()
  plt.show()

  return opt_thr

In [None]:
# Tune the decision threshold on the validation loader, keeping the test split untouched.
best_threshold = find_optimal_threshold(model, val_dataloader)

## Test (Result)

In [None]:
def evaluate(model, dataloader, threshold):
  """Score the model on `dataloader` at a fixed probability threshold.

  Args:
    model: the trained model; called as
      `model(graph, node_feats, edge_feats, num_heads)` with the notebook-level `num_heads`.
    dataloader: iterable yielding `(batched_graph, labels)` pairs.
    threshold: probabilities greater than or equal to this value are predicted as class 1.

  Returns:
    Tuple `(f1, recall, accuracy)`.
  """
  model.eval()
  all_pred, all_labels = [], []
  with torch.no_grad():
    for batched_graph, labels in dataloader:
      pred = model(batched_graph, batched_graph.ndata['feat'], batched_graph.edata['feat'], num_heads)
      all_pred.append(pred)
      all_labels.append(labels.float())

  # Flatten both sides to 1-D. Comparing an (N, 1) prediction column with an
  # (N,) label vector broadcasts to (N, N) and yields a meaningless accuracy.
  probs = torch.sigmoid(torch.cat(all_pred)).flatten().cpu()
  targets = torch.cat(all_labels).flatten().long().cpu()
  # float() accepts a NumPy scalar threshold as well as a Python float.
  pred_labels = (probs >= float(threshold)).long()

  f1 = f1_score(targets.numpy(), pred_labels.numpy())
  recall = recall_score(targets.numpy(), pred_labels.numpy())
  accuracy = (pred_labels == targets).float().mean().item()

  return f1, recall, accuracy

In [None]:
# Final report: score the test loader using the threshold chosen on the validation loader.
f1, recall, accuracy = evaluate(model, test_dataloader, best_threshold)
print(f"Accuracy : {accuracy:.2f}, Recall : {recall:.2f}, F1 : {f1:.2f}")