In [1]:
import optuna
from optuna import Trial

from math import sqrt
from typing import Tuple, List

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
#import openbabel
from openbabel import pybel
from PyBioMed.PyMolecule.fingerprint import CalculatePubChemFingerprint,CalculateECFP2Fingerprint
from rdkit import Chem
from rdkit.Chem.rdchem import Atom

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_curve, auc 
from sklearn.metrics import precision_recall_curve


from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as G_Loader 
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import BatchNorm


# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix

from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdMolDescriptors
from rdkit import DataStructs

# Pytorch and Pytorch Geometric
import torch

import torch.nn as nn
from torch.nn import Linear
import torch.optim as optim
import torch.nn.functional as F # activation function
from torch.utils.data import Dataset
from torch.utils.data import DataLoader as V_Loader # dataset management

from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_curve, auc 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score

# performances visualization 
import matplotlib.pyplot as plt
#import seaborn as sns
import statistics
from prettytable import PrettyTable
%run ./my_performances.ipynb 


#%run ./graph_feature.ipynb 
#%run ./dataset_processing.ipynb 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reload the fingerprint arrays saved on disk: one train/validation pair per fold,
# plus a single held-out test array.
#=======================================================
k = 10
final_clean_fingerp_train = []
final_clean_fingerp_val = []
for fold in range(k):
    fold_suffix = str(fold) + '.npy'
    final_clean_fingerp_train.append(np.load('final_clean_fingerp_train' + fold_suffix))
    final_clean_fingerp_val.append(np.load('final_clean_fingerp_val' + fold_suffix))

final_clean_fingerp_test = np.load('final_clean_fingerp_test.npy')

In [3]:
# Saved index arrays for the three partitions, loaded in train / val / test order.
# NOTE(review): presumably positions into the full dataset -- confirm against the
# notebook that wrote these files.
train_idx, val_idx, test_idx = (
    np.load(index_file)
    for index_file in ('train_indices.npy', 'val_indices.npy', 'test_indices.npy')
)

In [4]:
# Load the output labels: one train/validation array per fold, then the test labels.
total_train_targets, total_validation_targets, total_test_targets = [], [], []
for fold in range(k):
    fold_suffix = str(fold) + '.npy'
    total_train_targets.append(np.load('total_train_targets' + fold_suffix))
    total_validation_targets.append(np.load('total_validation_targets' + fold_suffix))

# The placeholder list above is replaced by the array loaded from disk.
total_test_targets = np.load('total_test_targets.npy')

In [5]:
# Wrap each fold's feature and label arrays in mini-batch loaders (vector data).
#======================================================================================
list_data_fingerp_train, list_data_fingerp_val = [], []
list_data_target_train, list_data_target_val = [], []

per_fold_arrays = zip(final_clean_fingerp_train, final_clean_fingerp_val,
                      total_train_targets, total_validation_targets)

for fold_x_train, fold_x_val, fold_y_train, fold_y_val in per_fold_arrays:
    # Features and labels live in separate loaders that share one batch size.
    # No shuffle argument is passed, so the two loaders of a fold only stay
    # sample-aligned as long as both iterate in dataset order.
    list_data_fingerp_train.append(V_Loader(dataset=fold_x_train, batch_size=126))
    list_data_fingerp_val.append(V_Loader(dataset=fold_x_val, batch_size=126))

    list_data_target_train.append(V_Loader(dataset=fold_y_train, batch_size=126))
    list_data_target_val.append(V_Loader(dataset=fold_y_val, batch_size=126))

In [6]:
# Loss function: binary cross-entropy (modelB ends in a sigmoid, so its outputs
# are already in [0, 1]).
criterion = nn.BCELoss()

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_curve, auc 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score

# performances visualization 
import matplotlib.pyplot as plt
#import seaborn as sns
import statistics
import math
from prettytable import PrettyTable
%run ./my_performances.ipynb 


def test_1(v_loaderB,v_target, combined_model):
    combined_model.eval()
    list_pred =[]
    list_targets =[]
    correct = 0
    for data_X2, data_target in zip (v_loaderB,v_target):  # Iterate in batches over the training/test dataset.
            out = combined_model(torch.tensor(data_X2, dtype=torch.float32))
            out_1 = out[:,0]
            
            list_pred.append(out_1.item())
            list_targets.append(data_target.item())
    return list_pred, list_targets

# used to count the train accuracy ,and validation accuracy when in the training mode 
def test(v_loaderB,target_v_loaderB, combined_model):
    combined_model.eval()

    correct = 0
    for data_X2, data_target in zip(v_loaderB,target_v_loaderB):  # Iterate in batches over the training/test dataset.
            out = combined_model(torch.tensor(data_X2, dtype=torch.float32))
            out_1 = out[:,0]
            for i,value in enumerate(out_1) :
                if value > 0.5 :
                    out_1[i] = 1
                else : out_1[i] = 0
            pred = out_1  # Use the class with highest probability.
            correct += int((pred == data_target).sum())  # Check against ground-truth labels.
    return correct / len(v_loaderB.dataset)  # Derive ratio of correct predictions.

In [8]:
def get_optimizer(gnn_model, learning_rate, optimizer_type, weight_decay=1e-4):
    """Build the optimizer selected by an integer code.

    Args:
        gnn_model: torch module whose parameters will be optimised.
        learning_rate: learning rate passed to the optimizer.
        optimizer_type: 1 -> SGD (momentum 0.9), 2 -> Adam, 3 -> Adamax.
        weight_decay: L2 penalty used by Adam and Adamax. The SGD branch does
            not apply it.

    Returns:
        The configured ``torch.optim`` optimizer.

    Raises:
        ValueError: if ``optimizer_type`` is not 1, 2 or 3 (previously this
            surfaced as an UnboundLocalError on the return statement).
    """
    params = gnn_model.parameters()
    if optimizer_type == 1:
        return torch.optim.SGD(params, lr=learning_rate, momentum=0.9)
    if optimizer_type == 2:
        return torch.optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
    if optimizer_type == 3:
        # Honour the caller's weight_decay; it used to be hard-coded to 1e-4
        # here, which equals the default, so default calls are unchanged.
        return torch.optim.Adamax(params, lr=learning_rate, betas=(0.9, 0.999),
                                  eps=1e-08, weight_decay=weight_decay)
    raise ValueError(
        "optimizer_type must be 1 (SGD), 2 (Adam) or 3 (Adamax); got %r" % (optimizer_type,)
    )

In [9]:
class modelB(torch.nn.Module):
    """Fully connected network: three hidden stages followed by a sigmoid output.

    Each hidden stage applies Linear -> BatchNorm1d -> dropout -> ReLU, in that
    order. The final Linear layer is followed by a sigmoid, so every output
    value lies in [0, 1].
    """

    def __init__(self, input_features, output_features, dropout_rateB1, dropout_rateB2, dropout_rateB3,
                 dense_layer1, dense_layer2, dense_layer3):
        """Create the layers.

        Args:
            input_features: size of each input vector.
            output_features: size of the output vector.
            dropout_rateB1, dropout_rateB2, dropout_rateB3: dropout probability
                of hidden stage 1, 2 and 3.
            dense_layer1, dense_layer2, dense_layer3: width of hidden stage 1, 2
                and 3. Each is converted with ``int()``.
        """
        super().__init__()
        # Cast every width once so both sides of each Linear layer and the
        # BatchNorm sizes agree; previously only some uses were cast.
        width1, width2, width3 = int(dense_layer1), int(dense_layer2), int(dense_layer3)

        # Attribute names are part of the checkpoint format (state_dict keys).
        self.lin1 = nn.Linear(input_features, width1)
        self.lin2 = nn.Linear(width1, width2)
        self.lin3 = nn.Linear(width2, width3)
        self.lin4 = nn.Linear(width3, output_features)

        self.bn1 = nn.BatchNorm1d(width1)
        self.bn2 = nn.BatchNorm1d(width2)
        self.bn3 = nn.BatchNorm1d(width3)

        self.dropoutB1 = dropout_rateB1
        self.dropoutB2 = dropout_rateB2
        self.dropoutB3 = dropout_rateB3

    def forward(self, x):
        """Map a batch of input vectors to sigmoid-activated outputs."""
        hidden_stages = (
            (self.lin1, self.bn1, self.dropoutB1),
            (self.lin2, self.bn2, self.dropoutB2),
            (self.lin3, self.bn3, self.dropoutB3),
        )
        for linear, batch_norm, drop_p in hidden_stages:
            x = batch_norm(linear(x))
            # Dropout is only active while the module is in training mode.
            x = F.dropout(x, p=drop_p, training=self.training)
            x = x.relu()
        return torch.sigmoid(self.lin4(x))
    

In [10]:
# Fixed hyper-parameter set used to rebuild the saved networks.
hyper_param_d = dict(
    dropout_rateB1=0.19029920767006717,
    dropout_rateB2=0.12142154817498285,
    dropout_rateB3=0.30892320125476364,
    dense_layer1=180,
    dense_layer2=96,
    dense_layer3=40,
    learning_rate=0.00014475405687104653,
    weight_decay=0.0001839631656485908,
)

In [11]:
# Rebuild one modelB per fold and restore its saved weights.
k = 10
list_modelA = []  # not filled in this cell
list_modelB = []
list_modelC = []  # not filled in this cell

# The architecture is identical for every fold, so resolve it once outside the loop.
input_features = 166   # length of feature data vector
output_features = 1
architecture_keys = ('dropout_rateB1', 'dropout_rateB2', 'dropout_rateB3',
                     'dense_layer1', 'dense_layer2', 'dense_layer3')
architecture_kwargs = {key: hyper_param_d[key] for key in architecture_keys}

# load model B
for i in range(k):
    model_b = modelB(input_features, output_features, **architecture_kwargs)

    PATH = '0.39289model_fingerp' + str(i) + '.pth'
    # map_location lets checkpoints that were saved from a GPU load on a
    # CPU-only machine; nothing in this notebook moves a model or batch to
    # another device.
    model_b.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))

    list_modelB.append(model_b)

In [13]:
nCV = 10  # ten-fold cross-validation

# One sample per batch: test_1 then yields exactly one score per test sample.
v_test_loaderB = V_Loader(dataset=final_clean_fingerp_test, batch_size=1)
v_test_target = V_Loader(dataset=total_test_targets, batch_size=1)

# Score the same test set with every fold model, then split the
# (predictions, targets) pairs into two parallel per-fold lists.
per_fold_outputs = [
    test_1(v_test_loaderB, v_test_target, fold_model)
    for fold_model in list_modelB
]
list_fold_pred = [fold_pred for fold_pred, _ in per_fold_outputs]
list_fold_targets = [fold_targets for _, fold_targets in per_fold_outputs]
    



In [14]:
# Test-set performance of every fold model, then the mean balanced accuracy.
#========================================================================
total_performances = performances(list_fold_pred, list_fold_targets, nCV)

# NOTE(review): entries 1 and 2 are treated as per-fold sensitivity and
# specificity -- confirm the ordering against my_performances.ipynb.
list_bal_acc = [
    (sen + spec) / 2
    for sen, spec in zip(total_performances[1], total_performances[2])
]

statistics.mean(list_bal_acc)

0.69517

In [15]:
import statistics
from prettytable import PrettyTable
# Render the summary tables for the test-set results.
model_title, data_type = 'Test Perf.FCNN', 'MACCS'
perf = total_performances
Create_Tables(perf, model_title, data_type)

+----------------+-----------+-------+-------+-------+-------+-------+---------+---------+-----------+-------+
|   Model Name   | Data Type | m_ACC |  m_SN |  m_SP | m_MCC | m_AUC | m_Kappa | m_AUROC | m_Bal_ACC |  m_F1 |
+----------------+-----------+-------+-------+-------+-------+-------+---------+---------+-----------+-------+
| Test Perf.FCNN |   MACCS   |  0.74 | 0.814 | 0.577 | 0.393 | 0.744 |  0.392  |  0.744  |   0.695   | 0.811 |
+----------------+-----------+-------+-------+-------+-------+-------+---------+---------+-----------+-------+
+----------------+-----------+-------+-------+-------+-------+-------+---------+---------+-----------+-------+
|   Model Name   | Data Type | e_ACC |  e_SN |   SP  | e_MCC | e_AUC | e_Kappa | e_AUROC | e_Bal_ACC |  e_F1 |
+----------------+-----------+-------+-------+-------+-------+-------+---------+---------+-----------+-------+
| Test Perf.FCNN |   MACCS   | 0.017 | 0.034 | 0.039 | 0.031 | 0.011 |   0.03  |  0.011  |   0.014   | 0.811 |
+