In [4]:
import torch
import torch.nn as nn
import copy
import os
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, confusion_matrix, precision_recall_curve, precision_score
from models import binary_cross_entropy, cross_entropy_logits, entropy_logits, RandomLayer
from prettytable import PrettyTable
from domain_adaptator import ReverseLayerF
from tqdm import tqdm
import torch.nn.functional as F


DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



In [5]:
# Import everything the notebook needs in one guarded block.
# Any failure is reported together with the underlying exception, so a missing package or a
# broken install is not mistaken for a wrong runtime type. `Exception` (rather than a bare
# `except:`) leaves KeyboardInterrupt / SystemExit untouched.
try:

  from models import GraphBAN
  from time import time
  from utils import set_seed, graph_collate_func, mkdir,graph_collate_func2
  from configs import get_cfg_defaults
  from dataloader import DTIDataset, MultiDataLoader, DTIDataset2
  from torch.utils.data import DataLoader
  from trainer import Trainer
  from domain_adaptator import Discriminator
  import torch
  import argparse
  import warnings
  import os
  import torch.nn as nn
  import copy
  import numpy as np
  from tqdm import tqdm
  from rdkit.Chem import AllChem
  import torch.nn.functional as F
  import math
  from dgllife.model.gnn import GCN
  import pandas as pd
  from torch.nn.utils.weight_norm import weight_norm
except Exception as import_error:
  print("Error: please change the runtime mode to GPU-enabled type such as T4!\n >>> Runtime/change runtime type/Hardware acceleratore/ T4 GPU")
  # Show what actually failed; later cells will hit NameError if an import is missing.
  print(f"Underlying import failure: {type(import_error).__name__}: {import_error}")


**The transductive and inductive settings live in different YAML files. To run the model in transductive mode, point `cfg_path` at `GraphBAN_None_DA.yaml`; for inductive mode, point it at `GraphBAN_DA.yaml`.**

In [6]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters (for example the number of epochs) come from the YAML file merged below;
# edit that file to change them.
# For transductive analysis, switch to the GraphBAN_None_DA.yaml path instead:
#cfg_path = "/content/GraphBAN/GraphBAN_None_DA.yaml"
cfg_path = "/content/GraphBAN/GraphBAN_DA.yaml"

# Start from the project defaults, overlay the YAML file, then lock the config.
cfg = get_cfg_defaults()
cfg.merge_from_file(cfg_path)
cfg.freeze()

torch.cuda.empty_cache()
warnings.filterwarnings("ignore")
set_seed(cfg.SOLVER.SEED)
mkdir(cfg.RESULT.OUTPUT_DIR)
experiment = None

# Report the run settings; the trailing expression displays the full config as the cell output.
print(
    f"Config yaml: {cfg_path}",
    f"Running on: {device}",
    "Hyperparameters:",
    sep="\n",
)
dict(cfg)

Config yaml: /content/GraphBAN/GraphBAN_DA.yaml
Running on: cuda
Hyperparameters:


{'DRUG': CfgNode({'NODE_IN_FEATS': 75, 'PADDING': True, 'HIDDEN_LAYERS': [128, 128, 128], 'NODE_IN_EMBEDDING': 128, 'MAX_NODES': 290}),
 'PROTEIN': CfgNode({'NUM_FILTERS': [128, 128, 128], 'KERNEL_SIZE': [3, 6, 9], 'EMBEDDING_DIM': 128, 'PADDING': True}),
 'BCN': CfgNode({'HEADS': 2}),
 'DECODER': CfgNode({'NAME': 'MLP', 'IN_DIM': 256, 'HIDDEN_DIM': 512, 'OUT_DIM': 128, 'BINARY': 2}),
 'SOLVER': CfgNode({'MAX_EPOCH': 1, 'BATCH_SIZE': 32, 'NUM_WORKERS': 0, 'LR': 0.0001, 'DA_LR': 5e-05, 'SEED': 10}),
 'RESULT': CfgNode({'OUTPUT_DIR': './result', 'SAVE_MODEL': True}),
 'DA': CfgNode({'TASK': True, 'METHOD': 'CDAN', 'USE': True, 'INIT_EPOCH': 10, 'LAMB_DA': 1, 'RANDOM_LAYER': True, 'ORIGINAL_RANDOM': True, 'RANDOM_DIM': 256, 'USE_ENTROPY': False})}

In [None]:
# This cell is to read BioSNAP dataset in transductive mode.

# Read your custom dataset here. It should be split into three partitions (train, validation and test), for either inductive or transductive analysis, in CSV or parquet format.

#df_train = pd.read_csv("/content/GraphBAN/Data/BioSNAP/transductive/train.csv")
#df_val = pd.read_csv("/content/GraphBAN/Data/BioSNAP/transductive/val.csv")
#df_test = pd.read_csv("/content/GraphBAN/Data/BioSNAP/transductive/test.csv")

In [7]:
# Load the BioSNAP splits for inductive mode: source-domain training data, target-domain
# training data (bound to df_val) and target-domain test data.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/GraphBAN/Data/BioSNAP/inductive/source_train.csv",  # source_train
        "/content/GraphBAN/Data/BioSNAP/inductive/target_train.csv",  # target_train
        "/content/GraphBAN/Data/BioSNAP/inductive/target_test.csv",  # target_test
    )
)

In [8]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(frame.shape for frame in (df_train, df_val, df_test))

((9766, 3), (3628, 3), (907, 3))

In [9]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,SMILES,Protein,Y
0,CC1=CN=C2N1C=CN=C2NCC1=CC=NC=C1,MARSLLLPLQILLLSLALETAGEEAQGDKIIDGAPCARGSHPWQVA...,0
1,FC(F)OC(F)(F)C(F)Cl,MVSAKKVPAIALSAGVSFALLRFLCLAVCLNESPGQNQKEEKLCTE...,1
2,[H][C@@]12C[C@@]3([H])C(=C(O)[C@]1(O)C(=O)C(C(...,MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,0
3,CCCN(CCC)CCC1=C2CC(=O)NC2=CC=C1,MAADLGPWNDTINGTWDGDELGYRCRFNEDFKYVLLPVSYGVVCVP...,0
4,CN\C(NCC1=CC=CC=C1)=N/C,MSQGVRRAGAGQGVAAAVQLLVTLSFLRSVVEAQVTGVLDDCLCDI...,0


In [10]:
# Compute a 1024-bit Morgan fingerprint (radius 2) for every molecule and store it in the
# 'fcfp' column of the train, validation and test frames.
# Run this cell for both transductive and inductive modes.
# NOTE(review): `useFeatures` is not passed, so RDKit's default applies. Per the RDKit docs
# that gives atom-invariant (ECFP-style) bits rather than feature-based (FCFP-style) ones;
# confirm which is intended before relying on the 'fcfp' name.

from rdkit.Chem import AllChem
import sys
import numpy as np
sys.path.append('/usr/local/lib/python3.7/site-packages/')

try:
  from rdkit import Chem
  from rdkit.Chem.Draw import IPythonConsole
except ImportError:
  print('Stopping RUNTIME. Colaboratory will restart automatically. Please run again.')
  exit()
df_list = [df_train,df_val, df_test]
for dfs in df_list:

    x_batch11 = []
    # Iterate the column directly. The previous `dfs.iloc[dfs.index]` treated index labels as
    # row positions, which is only correct for a default 0..n-1 index.
    for item in dfs['SMILES']:

        m1 = Chem.MolFromSmiles(str(item))
        if m1 is None:
            # MolFromSmiles returns None for unparseable input; fail with the offending
            # string instead of an opaque argument error from the fingerprint call.
            raise ValueError(f"RDKit could not parse SMILES: {item!r}")
        fp1 = AllChem.GetMorganFingerprintAsBitVect(m1, radius=2, nBits=1024)
        x_batch11.append(np.array(fp1, dtype=np.float64))
    # One fingerprint array per row, in row order.
    dfs['fcfp'] = x_batch11

In [None]:
# This cell reads the teacher's embeddings that were generated beforehand for the BioSNAP dataset in transductive mode.

#train_emb = pd.read_parquet("/content/GraphBAN/Data/BioSNAP/transductive/teacher_biosnap_transductive_emb256_trainset.parquet")

In [11]:
# Pre-computed teacher embeddings for the BioSNAP source-train split (inductive mode).

train_emb = pd.read_csv(
    filepath_or_buffer="/content/GraphBAN/Data/BioSNAP/inductive/teacher_biosnap_cluster_juststructure_emb_256_source_train.csv"
)

In [None]:
# Scratch check: read the local source_train.csv and display its (rows, columns) shape.
import pandas as pd
b = pd.read_csv(filepath_or_buffer="source_train.csv")
b.shape

(14928, 5)

In [None]:
# If you want to use your own dataset, run this cell to generate the teacher's embeddings and comment out the 'train_emb' cells above.
# The embeddings are stored as a parquet file so that large datasets are handled reliably.

# Add this in a Google Colab cell to install the correct version of Pytorch Geometric.
import torch

def format_pytorch_version(version):
  """Return the part of a PyTorch version string that precedes the first '+'.

  A string without '+' is returned unchanged.
  """
  release, _, _ = version.partition('+')
  return release

# TORCH_version is the installed PyTorch version string as reported by torch;
# TORCH is that string passed through format_pytorch_version (text before the first '+').
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  """Turn a CUDA version string into the tag used in the wheel index URL.

  Args:
    version: CUDA version such as '12.1', or None. `torch.version.cuda` is None on
      CPU-only PyTorch builds.

  Returns:
    'cu' followed by the version with dots removed (e.g. 'cu121'), or 'cpu' when
    `version` is None.
  """
  if version is None:
    # CPU-only build: there is no version string to strip dots from, and the matching
    # wheel index is published under the '+cpu' tag.
    return 'cpu'
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric
!pip install torchmetrics

import teacher

# Settings for the teacher run.
data_file_path = "source_train.csv"  # input data file
output_file_path = "output_embeddings.parquet"  # where the output file is saved
epochs = 200  # number of epochs to run

# Run the teacher, echo whatever it returns, then read the parquet file at the output path
# (presumably written by run_model - verify against teacher.py).
output = teacher.run_model(epochs, output_file_path, data_file_path)
print(output)
train_emb = pd.read_parquet(output_file_path)


Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_scatter-2.1.2%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt23cu121
Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_sparse-0.6.18%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt23cu121
Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting torch-cluster
  Downloading https://data.p

In [12]:
# Collapse every row of the teacher-embedding table into one NumPy vector, keep only that
# 'Array' column, and attach it to the training frame as 'teacher_emb'.
#
# The guard makes the cell safe to re-run: once train_emb holds only 'Array', applying the
# row-wise conversion again would wrap the stored vectors a second time.
if list(train_emb.columns) != ['Array']:
    train_emb = pd.DataFrame(
        {'Array': train_emb.apply(lambda row: np.array(row), axis=1)}
    )

# The assignment below aligns on index labels, so any training row without a matching
# embedding row would silently receive NaN. Fail loudly instead.
missing_rows = ~df_train.index.isin(train_emb.index)
if missing_rows.any():
    raise ValueError(
        f"{int(missing_rows.sum())} of {len(df_train)} training rows have no teacher embedding "
        f"(train_emb has {len(train_emb)} rows); check that both come from the same split."
    )

df_train['teacher_emb'] = train_emb['Array']

In [None]:
# First two training rows, including the columns added by earlier cells.
df_train.head(n=2)

Unnamed: 0,SMILES,Protein,Y,fcfp,teacher_emb
0,CC1=CN=C2N1C=CN=C2NCC1=CC=NC=C1,MARSLLLPLQILLLSLALETAGEEAQGDKIIDGAPCARGSHPWQVA...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.5657851, 0.97027797, 1.1025445, 0.927274, 0..."
1,FC(F)OC(F)(F)C(F)Cl,MVSAKKVPAIALSAGVSFALLRFLCLAVCLNESPGQNQKEEKLCTE...,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7476361, 0.7371368, 0.7153522, 0.73685837, ..."


In [None]:
# First two test rows.
df_test.head(n=2)

Unnamed: 0,SMILES,Protein,Y,fcfp
0,NC1=CC=NC=C1,MVLAQGLLSMALLALCWERSLAGAEETIPLQTLRCYNDYTSHITCR...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,NC1=CC=NC=C1,MAEKAPPGLNRKTSRSTLSLPPEPVDIIRSKTCSRRVKINVGGLNH...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
# Wrap each frame in its dataset class; the training frame uses DTIDataset2, the others DTIDataset.
train_dataset = DTIDataset2(df_train.index.values, df_train)
val_dataset = DTIDataset(df_val.index.values, df_val)
test_dataset = DTIDataset(df_test.index.values, df_test)

# Loader settings for training: shuffled batches, incomplete final batch dropped.
# params2 is used for the training dataset; params1 differs only in its collate function.
params2 = dict(
    batch_size=cfg.SOLVER.BATCH_SIZE,
    shuffle=True,
    num_workers=cfg.SOLVER.NUM_WORKERS,
    drop_last=True,
    collate_fn=graph_collate_func2,
)
params1 = dict(params2, collate_fn=graph_collate_func)

# Source/target loaders are paired in a MultiDataLoader sized to the longer of the two.
source_generator = DataLoader(train_dataset, **params2)
target_generator = DataLoader(val_dataset, **params1)
n_batches = max(len(source_generator), len(target_generator))
multi_generator = MultiDataLoader(
    dataloaders=[source_generator, target_generator],
    n_batches=n_batches,
)
training_generator = DataLoader(train_dataset, **params2)

# Evaluation loaders reuse params1 with shuffling off and every sample kept.
params1.update(shuffle=False, drop_last=False)
val_generator = DataLoader(val_dataset, **params1)
test_generator = DataLoader(test_dataset, **params1)

In [14]:
# Build the model from the config and move it to the selected device.
model = GraphBAN(**cfg)
model = model.to(device)

# Adam over all model parameters, with the learning rate taken from the solver config.
opt = torch.optim.Adam(params=model.parameters(), lr=cfg.SOLVER.LR)

# Turn on cuDNN benchmark mode only when a GPU is available.
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True

In [15]:
# cfg.DA.USE selects the analysis mode: True runs the inductive (domain-adaptation) analysis,
# False runs the transductive one.

# The model optimizer is the same in both modes.
opt = torch.optim.Adam(model.parameters(), lr=cfg.SOLVER.LR)

if cfg.DA.USE:
    # The discriminator input width depends on whether the random layer is enabled.
    domain_dmm = Discriminator(
        input_size=(
            cfg.DA.RANDOM_DIM
            if cfg.DA.RANDOM_LAYER
            else cfg.DECODER.IN_DIM * cfg.DECODER.BINARY
        ),
        n_class=cfg.DECODER.BINARY,
    ).to(device)
    # The discriminator gets its own optimizer and learning rate.
    opt_da = torch.optim.Adam(domain_dmm.parameters(), lr=cfg.SOLVER.DA_LR)
else:
    # Define both names in transductive mode too, so cells that reference them
    # do not raise NameError.
    domain_dmm = None
    opt_da = None
torch.backends.cudnn.benchmark = True

In [None]:
# Run this cell, just for transductive train and prediction


#trainer = Trainer(model, opt, device, training_generator, val_generator, test_generator, opt_da=None, discriminator=None, experiment=None, **cfg)
#result = trainer.train()

# **Expected Result 1**

100%|██████████| 2403/2403 [02:48<00:00, 14.23it/s]


**Training at Epoch 1** with training loss 0.6607703843614829
Validation at Epoch 1 with validation loss 0.6124107770968316  AUROC 0.7655853803644092 AUPRC 0.7644462826852887


**Test at Best Model of Epoch 1** with test loss 0.6113489803788964  AUROC 0.7633377346134166 AUPRC 0.7608600342696579 f1-score 0.7136748315253415 Specificity 0.7110625909752547 Accuracy 0.714727835426907 Thred_optim 0.4407811462879181


In [16]:
# Inductive training and prediction: the trainer receives the paired source/target loader
# together with the discriminator and its optimizer.


trainer = Trainer(
    model,
    opt,
    device,
    multi_generator,
    val_generator,
    test_generator,
    opt_da=opt_da,
    discriminator=domain_dmm,
    experiment=None,
    **cfg,
)
result = trainer.train()

100%|██████████| 305/305 [02:00<00:00,  2.53it/s]


Training at Epoch 1 with model training loss 1.9643451503065765
Validation at Epoch 1 with validation loss 0.7315589843089121  AUROC 0.6056722080528978 AUPRC 0.5826882445602202
Test at Best Model of Epoch 1 with test loss 0.7133082295286244  AUROC 0.6373042886317223 AUPRC 0.6165217661104248 f1-score 0.6818828139220221 Specificity 0.9098901098901099 Accuracy 0.5766262403528115 Thred_optim 0.19310255348682404


# **Expected Result 2**

100%|██████████| 305/305 [02:02<00:00,  2.49it/s]

**Training at Epoch 1** with model training loss 1.9643451503065765
Validation at Epoch 1 with validation loss 0.7315589843089121  AUROC 0.6056722080528978 AUPRC 0.5826882445602202

**Test at Best Model of Epoch 1** with test loss 0.7133082336392896  AUROC 0.6373042886317223 AUPRC 0.6165217661104248 f1-score 0.6818828139220221 Specificity 0.9098901098901099 Accuracy 0.5766262403528115 Thred_optim 0.19310255348682404
