In [2]:
# Partially adapted from https://github.com/gordicaleksa/pytorch-GAT

# @misc{Gordić2020PyTorchGAT,
#   author = {Gordić, Aleksa},
#   title = {pytorch-GAT},
#   year = {2020},
#   publisher = {GitHub},
#   journal = {GitHub repository},
#   howpublished = {\url{https://github.com/gordicaleksa/pytorch-GAT}},
# }

# Base imports used throughout the notebook: standard library first, then third-party.
import enum
import json
import os

import torch
from google.colab import drive

# Mount Google Drive; every project path used below lives under /content/drive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install torch_geometric
# !pip install igraph
# !apt-get install libcairo2-dev libjpeg-dev libgif-dev
# !pip install pycairo
# !pip install cairocffi
# !pip install GitPython
# !pip install ray[tune]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910476 sha256=3c6d739a26af6f1705b054ad597f24685320a7e289012b30c922256cc0a5ab1f
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geomet

In [4]:
# Check that torch_geometric imports cleanly after the install above
import torch_geometric

In [None]:
# Visualization related imports
import matplotlib.pyplot as plt
import networkx as nx
from networkx.readwrite import json_graph
import igraph as ig

# Main computation libraries
import numpy as np

# Deep learning related imports
import torch
from torch.utils.data import DataLoader, Dataset

# Analysis and Metrics
from sklearn.metrics import f1_score
from ray.air import session
import ray
from ray.tune.schedulers import ASHAScheduler
from ray import tune
from ray.tune.stopper import (CombinedStopper, 
MaximumIterationStopper, TrialPlateauStopper, ExperimentPlateauStopper)
from ray import air

## Data Preprocessing - PPI

In [None]:
# Pre-processed PPI archive published by the Deep Graph Library.
PPI_URL = 'https://data.dgl.ai/dataset/ppi.zip'

# Drive folders for the GAT experiment data; the PPI files get their own sub-folder.
# NOTE(review): this path spells the project folder 'DL4H_final', while the %cd and
# extraction cells in this notebook use 'DL4H_Final' - confirm which one exists on Drive.
DATA_DIR_PATH = os.path.join('/content/drive/MyDrive/DL4H_final/GAT-pt', 'data')
PPI_PATH = os.path.join(DATA_DIR_PATH, 'ppi')


## Recreation of GAT model

In [None]:
# Work from the project root so that the `src` package imported by the following cells can be found.
%cd /content/drive/MyDrive/DL4H_Final/

In [None]:
# The two visualization helpers defined below are intentional no-op placeholders.
# Visualization is switched off in this notebook ('should_visualize': False in the config below).
from torch.hub import download_url_to_file
import zipfile
from src.datasets_dat import *

def plot_in_out_degree_distributions():
    """Placeholder that does nothing and returns None."""
    return None

def visualize_graph():
    """Placeholder that does nothing and returns None."""
    return None

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Options handed to load_graph_data.
config = dict(
    dataset_name=DatasetType.PPI.name,
    should_visualize=False,
    batch_size=1,
    ppi_load_test_only=False,  # small optimization for loading test graphs only, we won't use it here
)

data_loader_train, data_loader_val, data_loader_test = load_graph_data(config, device)

# Pull a single batch from the training loader to inspect what it yields.
node_features, node_labels, edge_index = next(iter(data_loader_train))

# Report the shape and dtype of each element of that batch.
print('*' * 20)
for _batch_part in (node_features, node_labels, edge_index):
    print(_batch_part.shape, _batch_part.dtype)

In [None]:
#@title
from src.GAT_model import *
from ray import air

# Grid search over the number of attention heads per GAT layer on PPI, using Ray Tune.
# train_gat_ppi, PPI_NUM_INPUT_FEATURES and PPI_NUM_CLASSES are presumably provided by the
# star import from src.GAT_model above.

# ASHA compares trials on micro-F1 (higher is better) per reported training iteration;
# grace_period=1 lets it stop a trial as early as one iteration in.
scheduler = ASHAScheduler(
    time_attr="training_iteration",
    metric="micro_f1",
    mode="max",
    grace_period=1,
)

# A trial is also stopped after 10 reported iterations or once its micro-F1 plateaus.
# NOTE(review): the saved results table shows iter=1 for every trial, so train_gat_ppi
# presumably reports only once per trial, which leaves the scheduler and stoppers little to act on.
stopper = CombinedStopper(
    MaximumIterationStopper(max_iter=10),
    TrialPlateauStopper(metric="micro_f1"),
)

# Per-trial resource request. Asking for more than the runtime has keeps trials from ever
# being scheduled, so the original request (1 GPU, 12 CPUs) is capped by what is available.
trial_resources = {
    "gpu": 1 if torch.cuda.is_available() else 0,
    "cpu": min(12, os.cpu_count() or 1),
}

tuner = tune.Tuner(
    tune.with_resources(train_gat_ppi, trial_resources),
    run_config=air.RunConfig(
        name="gnn_exp_heads_per_layer",
        stop=stopper,
        verbose=1,
    ),
    tune_config=tune.TuneConfig(scheduler=scheduler, num_samples=1),
    param_space={
        # Fixed training settings, identical for every trial.
        'num_of_epochs': 200,
        'patience_period': 50,
        'lr': 0.005,
        'weight_decay': 0,
        'should_test': True,
        'force_cpu': False,
        'dataset_name': 'PPI',
        'batch_size': 2,
        'should_visualize': False,
        'enable_tensorboard': False,
        'console_log_freq': 10,
        'checkpoint_freq': 5,
        'ppi_load_test_only': False,
        # GNNs, contrary to CNNs, are often shallow (it ultimately depends on the graph properties)
        "num_of_layers": 3,  # PPI has got 42% of nodes with all 0 features - that's why 3 layers are useful
        # Searched values: 3, 4 or 5 heads for each of the three layers (3 * 3 * 3 = 27 grid points).
        'num_heads_per_layer1': tune.grid_search(list(range(3, 6))),
        "num_heads_per_layer2": tune.grid_search(list(range(3, 6))),  # other values may give even better results from the reported ones
        'num_heads_per_layer3': tune.grid_search(list(range(3, 6))),
        "num_features_per_layer": [PPI_NUM_INPUT_FEATURES, 64, 64, PPI_NUM_CLASSES],  # 64 would also give ~0.975 uF1!
        "add_skip_connection": True,  # skip connection is very important! (keep it otherwise micro-F1 is almost 0)
        "bias": True,  # bias doesn't matter that much
        "dropout": 0.,  # dropout hurts the performance (best to keep it at 0)
    }
)

# Runs every trial; the saved output below reports a total run time of about 2.4 hours.
results = tuner.fit()


0,1
Current time:,2023-05-08 06:10:45
Running for:,02:23:51.33
Memory:,5.4/83.5 GiB

Trial name,status,loc,num_heads_per_layer1,num_heads_per_layer2,num_heads_per_layer3,iter,total time (s),micro_f1
train_gat_ppi_f9577_00000,TERMINATED,172.28.0.12:106316,3,3,3,1,328.027,0.956045
train_gat_ppi_f9577_00001,TERMINATED,172.28.0.12:106316,4,3,3,1,327.455,0.972534
train_gat_ppi_f9577_00002,TERMINATED,172.28.0.12:106316,5,3,3,1,331.971,0.974586
train_gat_ppi_f9577_00003,TERMINATED,172.28.0.12:106316,3,4,3,1,327.032,0.862875
train_gat_ppi_f9577_00004,TERMINATED,172.28.0.12:106316,4,4,3,1,305.342,0.927491
train_gat_ppi_f9577_00005,TERMINATED,172.28.0.12:106316,5,4,3,1,329.335,0.906079
train_gat_ppi_f9577_00006,TERMINATED,172.28.0.12:106316,3,5,3,1,330.507,0.977865
train_gat_ppi_f9577_00007,TERMINATED,172.28.0.12:106316,4,5,3,1,329.201,0.978031
train_gat_ppi_f9577_00008,TERMINATED,172.28.0.12:106316,5,5,3,1,177.317,0.942868
train_gat_ppi_f9577_00009,TERMINATED,172.28.0.12:106316,3,3,4,1,247.575,0.869984


[2m[36m(train_gat_ppi pid=106316)[0m {'num_heads_per_layer': [3, 3, 3], 'num_of_layers': 3, 'num_features_per_layer': [50, 64, 64, 121]}
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 1 to CPU. It has 1767 nodes and 34085 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 2 to CPU. It has 1377 nodes and 31081 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 3 to CPU. It has 2263 nodes and 61907 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 4 to CPU. It has 2339 nodes and 67769 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 5 to CPU. It has 1578 nodes and 37740 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 6 to CPU. It has 1021 nodes and 19237 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 7 to CPU. It has 1823 nodes and 46153 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 8 to CPU. It has 2488 nodes and 72878 edges.
[2m[36m(train_gat_

2023-05-08 06:10:45,924	INFO tune.py:762 -- Total run time: 8631.49 seconds (8631.32 seconds for the tuning loop).


[2m[36m(train_gat_ppi pid=106316)[0m **************************************************
[2m[36m(train_gat_ppi pid=106316)[0m Test micro-F1 = 0.9270046420779637


In [None]:
#@title
from ray import air

# Grid search over the number of attention heads per GAT layer on PPI, using Ray Tune.
# NOTE(review): this cell appears to repeat the hyper-parameter search of the preceding cell
# (same experiment name and search space); consider keeping only one of the two.
# train_gat_ppi, PPI_NUM_INPUT_FEATURES and PPI_NUM_CLASSES must already be defined by an earlier cell.

# ASHA compares trials on micro-F1 (higher is better) per reported training iteration;
# grace_period=1 lets it stop a trial as early as one iteration in.
scheduler = ASHAScheduler(
    time_attr="training_iteration",
    metric="micro_f1",
    mode="max",
    grace_period=1,
)

# A trial is also stopped after 10 reported iterations or once its micro-F1 plateaus.
# NOTE(review): the saved results table shows iter=1 for every trial, so train_gat_ppi
# presumably reports only once per trial, which leaves the scheduler and stoppers little to act on.
stopper = CombinedStopper(
    MaximumIterationStopper(max_iter=10),
    TrialPlateauStopper(metric="micro_f1"),
)

# Per-trial resource request. Asking for more than the runtime has keeps trials from ever
# being scheduled, so the original request (1 GPU, 12 CPUs) is capped by what is available.
trial_resources = {
    "gpu": 1 if torch.cuda.is_available() else 0,
    "cpu": min(12, os.cpu_count() or 1),
}

tuner = tune.Tuner(
    tune.with_resources(train_gat_ppi, trial_resources),
    run_config=air.RunConfig(
        name="gnn_exp_heads_per_layer",
        stop=stopper,
        verbose=1,
    ),
    tune_config=tune.TuneConfig(scheduler=scheduler, num_samples=1),
    param_space={
        # Fixed training settings, identical for every trial.
        'num_of_epochs': 200,
        'patience_period': 50,
        'lr': 0.005,
        'weight_decay': 0,
        'should_test': True,
        'force_cpu': False,
        'dataset_name': 'PPI',
        'batch_size': 2,
        'should_visualize': False,
        'enable_tensorboard': False,
        'console_log_freq': 10,
        'checkpoint_freq': 5,
        'ppi_load_test_only': False,
        # GNNs, contrary to CNNs, are often shallow (it ultimately depends on the graph properties)
        "num_of_layers": 3,  # PPI has got 42% of nodes with all 0 features - that's why 3 layers are useful
        # Searched values: 3, 4 or 5 heads for each of the three layers (3 * 3 * 3 = 27 grid points).
        'num_heads_per_layer1': tune.grid_search(list(range(3, 6))),
        "num_heads_per_layer2": tune.grid_search(list(range(3, 6))),  # other values may give even better results from the reported ones
        'num_heads_per_layer3': tune.grid_search(list(range(3, 6))),
        "num_features_per_layer": [PPI_NUM_INPUT_FEATURES, 64, 64, PPI_NUM_CLASSES],  # 64 would also give ~0.975 uF1!
        "add_skip_connection": True,  # skip connection is very important! (keep it otherwise micro-F1 is almost 0)
        "bias": True,  # bias doesn't matter that much
        "dropout": 0.,  # dropout hurts the performance (best to keep it at 0)
    }
)

# Runs every trial; the saved output below reports a total run time of about 2.4 hours.
results = tuner.fit()


0,1
Current time:,2023-05-08 06:10:45
Running for:,02:23:51.33
Memory:,5.4/83.5 GiB

Trial name,status,loc,num_heads_per_layer1,num_heads_per_layer2,num_heads_per_layer3,iter,total time (s),micro_f1
train_gat_ppi_f9577_00000,TERMINATED,172.28.0.12:106316,3,3,3,1,328.027,0.956045
train_gat_ppi_f9577_00001,TERMINATED,172.28.0.12:106316,4,3,3,1,327.455,0.972534
train_gat_ppi_f9577_00002,TERMINATED,172.28.0.12:106316,5,3,3,1,331.971,0.974586
train_gat_ppi_f9577_00003,TERMINATED,172.28.0.12:106316,3,4,3,1,327.032,0.862875
train_gat_ppi_f9577_00004,TERMINATED,172.28.0.12:106316,4,4,3,1,305.342,0.927491
train_gat_ppi_f9577_00005,TERMINATED,172.28.0.12:106316,5,4,3,1,329.335,0.906079
train_gat_ppi_f9577_00006,TERMINATED,172.28.0.12:106316,3,5,3,1,330.507,0.977865
train_gat_ppi_f9577_00007,TERMINATED,172.28.0.12:106316,4,5,3,1,329.201,0.978031
train_gat_ppi_f9577_00008,TERMINATED,172.28.0.12:106316,5,5,3,1,177.317,0.942868
train_gat_ppi_f9577_00009,TERMINATED,172.28.0.12:106316,3,3,4,1,247.575,0.869984


[2m[36m(train_gat_ppi pid=106316)[0m {'num_heads_per_layer': [3, 3, 3], 'num_of_layers': 3, 'num_features_per_layer': [50, 64, 64, 121]}
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 1 to CPU. It has 1767 nodes and 34085 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 2 to CPU. It has 1377 nodes and 31081 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 3 to CPU. It has 2263 nodes and 61907 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 4 to CPU. It has 2339 nodes and 67769 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 5 to CPU. It has 1578 nodes and 37740 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 6 to CPU. It has 1021 nodes and 19237 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 7 to CPU. It has 1823 nodes and 46153 edges.
[2m[36m(train_gat_ppi pid=106316)[0m Loading train graph 8 to CPU. It has 2488 nodes and 72878 edges.
[2m[36m(train_gat_

2023-05-08 06:10:45,924	INFO tune.py:762 -- Total run time: 8631.49 seconds (8631.32 seconds for the tuning loop).


[2m[36m(train_gat_ppi pid=106316)[0m **************************************************
[2m[36m(train_gat_ppi pid=106316)[0m Test micro-F1 = 0.9270046420779637


In [43]:
import zipfile

# Unpack the Diginetica training archive into the project's datasets folder.
# The context manager closes the archive even if extraction fails part-way through.
with zipfile.ZipFile("/content/drive/MyDrive/SR-GNN/orig_paper/datasets/dataset-train-diginetica.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/drive/MyDrive/DL4H_Final/datasets/")

In [51]:
# Switch to the datasets folder; the next cell runs preprocess.py from here.
# NOTE(review): the saved output below reports /content/drive/MyDrive/SR-GNN/orig_paper/datasets,
# not this path - the output looks stale; confirm the intended working directory.
%cd /content/drive/MyDrive/DL4H_Final/datasets

/content/drive/MyDrive/SR-GNN/orig_paper/datasets


In [55]:
# Preprocess the Diginetica data; the log below shows it being split into train and test sets by date.
!python preprocess.py --dataset=diginetica

Namespace(dataset='diginetica')
-- Starting @ 2023-05-06 19:10:18.918161s
-- Reading data @ 2023-05-06 19:10:32.287916s
Splitting date 1464134400.0
186670
15979
[('4737', 1451606400.0), ('4741', 1451606400.0), ('4742', 1451606400.0)]
[('289', 1464220800.0), ('290', 1464220800.0), ('302', 1464220800.0)]
-- Splitting train set and test set @ 2023-05-06 19:10:35.327564s
43098
719470
60858
[[1], [3, 4], [3]] [1451606400.0, 1451606400.0, 1451606400.0] [2, 5, 4]
[[21553, 20071, 8762, 21566, 6381], [21553, 20071, 8762, 21566], [21553, 20071, 8762]] [1464220800.0, 1464220800.0, 1464220800.0] [21566, 6381, 21566]
avg length:  4.850942344040704
Done.


## Reproduce the results from SR-GNN

In [None]:
# The training cells below launch main.py from the src directory.
%cd /content/drive/MyDrive/DL4H_Final/src

In [6]:
# Train the base GNN model (--model=GNN) on the Diginetica dataset.
!python main.py --dataset=diginetica --model=GNN

Namespace(dataset='diginetica', batchSize=100, hiddenSize=100, epoch=30, lr=0.001, lr_dc=0.1, lr_dc_step=3, l2=1e-05, step=1, patience=10, nonhybrid=False, validation=False, valid_portion=0.1)
-------------------------------------------------------
epoch:  0
start training:  2023-05-06 19:27:05.976381
  A = trans_to_cuda(torch.Tensor(A).float())
[0/7195] Loss: 10.6728
[1440/7195] Loss: 8.8680
[2880/7195] Loss: 6.8838
[4320/7195] Loss: 6.0367
[5760/7195] Loss: 5.6701
	Loss:	50932.078
start predicting:  2023-05-06 19:33:55.264334
Best Result:
	Recall@20:	44.2423	MMR@20:	14.0486	Epoch:	0,	0
-------------------------------------------------------
epoch:  1
start training:  2023-05-06 19:34:17.380135
[0/7195] Loss: 5.1191
[1440/7195] Loss: 5.7019
[2880/7195] Loss: 5.2911
[4320/7195] Loss: 5.4294
[5760/7195] Loss: 5.0805
	Loss:	38185.148
start predicting:  2023-05-06 19:41:01.911713
Best Result:
	Recall@20:	47.3627	MMR@20:	15.1188	Epoch:	1,	1
-------------------------------------------------

## Using Other Attention Layers

In [2]:
# Train the self-attention variant (--model=GCSAN) on the Diginetica dataset.
!python main.py --dataset=diginetica --model=GCSAN

Namespace(dataset='diginetica', batchSize=50, hiddenSize=120, epoch=30, lr=0.001, lr_dc=0.1, lr_dc_step=3, l2=1e-05, step=1, patience=10, nonhybrid=False, validation=False, valid_portion=0.1, dynamic=False)
-------------------------------------------------------
epoch:  0
start training:  2023-05-07 22:10:46.867991
  A = trans_to_cuda(torch.Tensor(A).float())
[0/14390] Loss: 10.6635
[2879/14390] Loss: 8.8186
[5758/14390] Loss: 7.7137
[8637/14390] Loss: 7.4009
[11516/14390] Loss: 5.9949
	Loss:	111584.617
start predicting:  2023-05-07 22:19:14.509211
Best Result:
	Recall@20:	34.5016	MMR@20:	10.0339	Epoch:	0,	0
-------------------------------------------------------
epoch:  1
start training:  2023-05-07 22:19:41.590879
[0/14390] Loss: 6.3254
[2879/14390] Loss: 5.4605
[5758/14390] Loss: 6.2063
[8637/14390] Loss: 5.9369
[11516/14390] Loss: 5.8932
	Loss:	86674.406
start predicting:  2023-05-07 22:28:01.658884
Best Result:
	Recall@20:	40.9313	MMR@20:	12.5781	Epoch:	1,	1
----------------------

In [15]:
# GCSAN for yoochoose1_64
!python main.py --dataset=yoochoose1_64

Namespace(dataset='yoochoose1_64', batchSize=50, hiddenSize=120, epoch=30, lr=0.001, lr_dc=0.1, lr_dc_step=3, l2=1e-05, step=1, patience=10, nonhybrid=False, validation=False, valid_portion=0.1, dynamic=False)
-------------------------------------------------------
epoch:  0
start training:  2023-05-08 07:08:51.777595
  A = trans_to_cuda(torch.Tensor(A).float())
[0/7398] Loss: 10.5207
[1480/7398] Loss: 7.2021
[2960/7398] Loss: 6.7552
[4440/7398] Loss: 5.9518
[5920/7398] Loss: 6.1582
	Loss:	45059.945
start predicting:  2023-05-08 07:17:02.650447
Best Result:
	Recall@20:	60.5460	MMR@20:	23.3308	Epoch:	0,	0
-------------------------------------------------------
epoch:  1
start training:  2023-05-08 07:17:34.470590
[0/7398] Loss: 5.2449
[1480/7398] Loss: 5.5261
[2960/7398] Loss: 4.6513
[4440/7398] Loss: 4.8553
[5920/7398] Loss: 4.7125
	Loss:	35882.973
start predicting:  2023-05-08 07:25:42.613701
Best Result:
	Recall@20:	65.9898	MMR@20:	26.4745	Epoch:	1,	1
--------------------------------

In [9]:
# Train the SRGAN variant on Diginetica; the model summary printed below shows it is built from GATConv layers.
!python main.py --dataset=diginetica --model=SRGAN

2023-05-08 23:26:31,394 main.py[line:37] Namespace(dataset='diginetica', batch_size=100, hidden_size=100, epoch=10, lr=0.001, lr_dc=0.1, lr_dc_step=3, l2=1e-05, top_k=20, patience=10)
2023-05-08 23:26:31,525 main.py[line:52] logging to /content/drive/MyDrive/SR-GNN/geometric/src/../log/diginetica/Namespace(dataset='diginetica', batch_size=100, hidden_size=100, epoch=10, lr=0.001, lr_dc=0.1, lr_dc_step=3, l2=1e-05, top_k=20, patience=10)
2023-05-08 23:26:33,247 main.py[line:67] GNNModel(
  (embedding): Embedding(43097, 100)
  (gat1): GATConv(100, 100, heads=1)
  (gat2): GATConv(100, 100, heads=1)
  (e2s): Embedding2Score(
    (W_1): Linear(in_features=100, out_features=100, bias=True)
    (W_2): Linear(in_features=100, out_features=100, bias=True)
    (q): Linear(in_features=100, out_features=1, bias=True)
    (W_3): Linear(in_features=200, out_features=100, bias=True)
  )
  (loss_function): CrossEntropyLoss()
)
	Loss:	513.313
2023-05-08 23:30:44,712 main.py[line:86] Best Result:
2023-0