In [1]:
import optuna
from optuna import Trial

from math import sqrt
from typing import Tuple, List

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
#import openbabel
from openbabel import pybel
from PyBioMed.PyMolecule.fingerprint import CalculatePubChemFingerprint,CalculateECFP2Fingerprint
from rdkit import Chem
from rdkit.Chem.rdchem import Atom

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_curve, auc 
from sklearn.metrics import precision_recall_curve

from sklearn.model_selection import StratifiedKFold, KFold


from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as G_Loader 
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import BatchNorm


# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix

from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdMolDescriptors
from rdkit import DataStructs


# Pytorch and Pytorch Geometric
import torch

import torch.nn as nn
from torch.nn import Linear
import torch.optim as optim
import torch.nn.functional as F # activation function
from torch.utils.data import Dataset
from torch.utils.data import DataLoader as V_Loader # dataset management


%run ./graph_feature.ipynb 
%run ./dataset_processing.ipynb 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def compute_fingerprint_ECFP2(smiles_list: List[str]) -> np.ndarray:
    """
    Compute ECFP2 fingerprint features for a list of SMILES strings.

    Parameters
    ----------
    smiles_list: List[str]
        The list of SMILES strings.

    Returns
    -------
    np.ndarray
        A 2D int32 array with one row per SMILES string (same order as
        the input) and 1024 columns.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    features = np.zeros((len(smiles_list), 1024), dtype=np.int32)

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input instead of deep inside the fingerprint call.
        if mol is None:
            raise ValueError(f"Could not parse SMILES at position {i}: {smi!r}")
        # Only the first element of the PyBioMed result is stored in the row.
        features[i] = CalculateECFP2Fingerprint(mol)[0]

    return features


def compute_fingerprint_PubChem(smiles_list: List[str]) -> np.ndarray:
    """
    Compute PubChem fingerprint features for a list of SMILES strings.

    Parameters
    ----------
    smiles_list: List[str]
        The list of SMILES strings.

    Returns
    -------
    np.ndarray
        A 2D int32 array with one row per SMILES string (same order as
        the input) and 881 columns.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    features = np.zeros((len(smiles_list), 881), dtype=np.int32)

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input instead of deep inside the fingerprint call.
        if mol is None:
            raise ValueError(f"Could not parse SMILES at position {i}: {smi!r}")
        # The PyBioMed result is written into the row as returned.
        features[i] = CalculatePubChemFingerprint(mol)

    return features

def compute_fingerprint_MACCS(smiles_list: List[str]) -> np.ndarray:
    """
    Compute MACCS key fingerprint features for a list of SMILES strings.

    Parameters
    ----------
    smiles_list: List[str]
        The list of SMILES strings.

    Returns
    -------
    np.ndarray
        A 2D int32 array with one row per SMILES string (same order as
        the input) and 166 columns.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    features = np.zeros((len(smiles_list), 166), dtype=np.int32)

    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input instead of deep inside the fingerprint call.
        if mol is None:
            raise ValueError(f"Could not parse SMILES at position {i}: {smi!r}")
        # Drop the first bit of the generated key vector so that the
        # remaining bits fill the 166-wide row.
        features[i] = list(MACCSkeys.GenMACCSKeys(mol))[1:]

    return features

def compute_descriptor_features(smiles_list: List[str]) -> pd.DataFrame:
    """
    Compute 2D descriptor features for a list of SMILES strings.

    Parameters
    ----------
    smiles_list: List[str]
        The list of SMILES strings.

    Returns
    -------
    pd.DataFrame
        A dataframe produced by the mordred calculator, where each row
        corresponds to the descriptors of one SMILES string, in input order.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    molecular_mols = []
    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; reject it
        # before handing the molecules to the descriptor calculator.
        if mol is None:
            raise ValueError(f"Could not parse SMILES at position {i}: {smi!r}")
        molecular_mols.append(mol)

    # 3D descriptors are skipped (ignore_3D=True).
    descriptor_calc_2D = Calculator(descriptors, ignore_3D=True)
    return descriptor_calc_2D.pandas(molecular_mols)


In [3]:
# Load all SMILES strings together with their labels.
# NOTE(review): get_dataset is not defined in this notebook; presumably it
# comes from one of the %run helper notebooks - confirm.
pd_smiles, pd_labels = get_dataset('DILI-Dataset.csv')

# Hold out 20% of the samples as the test set; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    pd_smiles,
    pd_labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Re-expose the split under names that spell out SMILES vs. label roles.
pd_smiles_train, pd_smiles_test = X_train, X_test
pd_labels_train, pd_labels_test = y_train, y_test

In [5]:
# Materialise the split outputs as plain Python lists (inputs first, then labels).
list_X_smiles, list_X_smiles_test = list(pd_smiles_train), list(pd_smiles_test)
list_y_smiles, list_y_smiles_test = list(pd_labels_train), list(pd_labels_test)

In [6]:
# Shuffle the training SMILES and labels together so that both classes are
# spread randomly, then build the k cross-validation folds.
# The shuffle is seeded (same seed as the train/test split) so the fold
# assignment is reproducible after a kernel restart.
# ========================================================================================
k = 10
X_1, y_1 = shuffle(list_X_smiles, list_y_smiles, random_state=42)
train_data = X_1
train_targets = y_1
# NOTE(review): CF_Validation_version_2 is not defined in this notebook;
# presumably it comes from one of the %run helper notebooks - confirm.
(
    all_train_indices,
    all_val_indices,
    total_train_data,
    total_train_targets,
    total_validation_data,
    total_validation_targets,
) = CF_Validation_version_2(k, train_data, train_targets)

In [7]:
# Display the first value returned by CF_Validation_version_2.
all_train_indices

[1132, 1132, 1132, 1132, 1132, 1132, 1132, 1132, 1133, 1133]

In [8]:
# Persist the cross-validation bookkeeping for train, validation and test.
# NOTE(review): the displayed all_train_indices is a list of ten integers that
# look like per-fold sample counts, and the test entry saved here is just the
# test-set length - so these files seem to hold sizes, not row indices. Confirm
# that downstream readers expect that.
np.save(file='train_indices.npy', arr=all_train_indices)
np.save(file='val_indices.npy', arr=all_val_indices)
np.save(file='test_indices.npy', arr=len(list_X_smiles_test))

In [9]:
# Turn SMILES strings into feature representations (applied per fold after the
# cross-validation split).
def convert2vec(input_data_smiles):
    """
    Compute every feature representation for one list of SMILES strings.

    Parameters
    ----------
    input_data_smiles
        The SMILES strings to convert; passed unchanged to each
        ``compute_*`` helper.

    Returns
    -------
    tuple
        ``(descriptors, ECFP2, PubChem, MACCS)`` - the results of
        ``compute_descriptor_features``, ``compute_fingerprint_ECFP2``,
        ``compute_fingerprint_PubChem`` and ``compute_fingerprint_MACCS``.
    """
    ecfp2_bits = compute_fingerprint_ECFP2(input_data_smiles)
    pubchem_bits = compute_fingerprint_PubChem(input_data_smiles)
    maccs_bits = compute_fingerprint_MACCS(input_data_smiles)
    # Computed last, but returned first.
    descriptor_table = compute_descriptor_features(input_data_smiles)

    return descriptor_table, ecfp2_bits, pubchem_bits, maccs_bits

# Convert grouped SMILES data (train folds, validation folds, or test) into features.
def convert2vec_group(total_data):
    """
    Run ``convert2vec`` over a collection of SMILES groups.

    Parameters
    ----------
    total_data
        A collection of SMILES lists. With more than two entries every
        entry is converted (the train/validation fold case). With two or
        fewer entries only ``total_data[0]`` is converted (the test case);
        any second entry is ignored.

    Returns
    -------
    tuple of four lists
        ``(data_Desc, data_ECFP2, data_PubChem, data_MACCS)``, each holding
        one ``convert2vec`` result per converted group.
    """
    # Pick the groups to convert: all of them for folds, only the first otherwise.
    groups = total_data if len(total_data) > 2 else [total_data[0]]

    data_Desc, data_ECFP2, data_PubChem, data_MACCS = [], [], [], []
    for smiles_group in groups:
        desc, ecfp2, pubchem, maccs = convert2vec(smiles_group)
        data_Desc.append(desc)
        data_ECFP2.append(ecfp2)
        data_PubChem.append(pubchem)
        data_MACCS.append(maccs)

    return data_Desc, data_ECFP2, data_PubChem, data_MACCS

In [11]:
# Featurise every training fold. convert2vec_group returns four lists
# (descriptors, ECFP2, PubChem, MACCS), each with one entry per fold.
data_train = convert2vec_group(total_train_data)
(
    data_train_Desc,
    data_train_ECFP2,
    data_train_PubChem,
    data_train_MACCS,
) = data_train

  1%|▌                                                                                | 7/1132 [00:03<07:38,  2.45it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  6%|█████                                                                           | 72/1132 [00:07<01:15, 14.01it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  8%|██████▌                                                                         | 93/1132 [00:10<02:27,  7.02it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 12%|█████████▏                                                                     | 131/1132 [00:15<02:45,  6.06it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 12%|█████████▎                                                                     | 134/1132 [00:19<05:41,  2.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 15%|███████████▌                                                                   | 165/1132 [00:19<04:34,  3.52it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|█████████████████▏                                                             | 246/1132 [00:24<00:49, 17.84it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|████████████████████▏                                                          | 290/1132 [00:28<01:24,  9.98it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|█████████████████████████                                                      | 359/1132 [00:32<00:40, 18.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 36%|████████████████████████████▎                                                  | 405/1132 [00:40<01:35,  7.62it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:05<00:00,  9.02it/s]
  2%|█▉                                                                              | 27/1132 [00:05<03:45,  4.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1132 [00:10<03:20,  5.26it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1132 [00:15<03:30,  4.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 12%|█████████▋                                                                     | 139/1132 [00:32<06:51,  2.41it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 16%|████████████▌                                                                  | 180/1132 [00:35<03:37,  4.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 21%|████████████████▌                                                              | 238/1132 [00:38<01:19, 11.18it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 31%|████████████████████████▊                                                      | 356/1132 [00:46<00:46, 16.58it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:22<00:00,  7.93it/s]
  2%|█▉                                                                              | 28/1132 [00:05<03:22,  5.44it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1132 [00:10<03:44,  4.70it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1132 [00:15<03:37,  4.68it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|███████████▎                                                                   | 162/1132 [00:31<03:04,  5.25it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 15%|████████████                                                                   | 172/1132 [00:34<03:31,  4.54it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 27%|████████████████████▉                                                          | 300/1132 [00:44<01:24,  9.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 35%|███████████████████████████▋                                                   | 397/1132 [00:53<01:17,  9.45it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:25<00:00,  7.80it/s]
  2%|█▉                                                                              | 27/1132 [00:05<03:31,  5.22it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1132 [00:10<03:33,  4.95it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1132 [00:14<03:22,  5.03it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|███████████▎                                                                   | 162/1132 [00:30<03:05,  5.22it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 18%|██████████████▍                                                                | 207/1132 [00:40<03:12,  4.80it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|██████████████████▍                                                            | 265/1132 [00:48<03:26,  4.20it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 49%|██████████████████████████████████████▊                                        | 556/1132 [01:25<01:08,  8.39it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:27<00:00,  7.69it/s]
  2%|█▉                                                                              | 27/1132 [00:05<03:23,  5.42it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1132 [00:11<03:57,  4.46it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1132 [00:17<04:01,  4.22it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 19%|███████████████▎                                                               | 219/1132 [00:43<03:17,  4.62it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|█████████████████▊                                                             | 255/1132 [00:48<02:46,  5.28it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|████████████████████▎                                                          | 291/1132 [00:53<04:51,  2.88it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 54%|██████████████████████████████████████████▋                                    | 611/1132 [01:21<00:49, 10.56it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 59%|██████████████████████████████████████████████▎                                | 664/1132 [01:27<00:51,  9.05it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:22<00:00,  7.92it/s]
  5%|███▌                                                                            | 51/1132 [00:05<01:09, 15.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1132 [00:10<03:30,  5.02it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1132 [00:14<03:27,  4.91it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 11%|████████▊                                                                      | 127/1132 [00:30<08:02,  2.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|████████████████████▏                                                          | 289/1132 [00:47<03:57,  3.55it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|█████████████████████████▍                                                     | 364/1132 [00:52<01:20,  9.59it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 43%|█████████████████████████████████▋                                             | 482/1132 [01:02<00:41, 15.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:30<00:00,  7.51it/s]
  5%|███▌                                                                            | 51/1132 [00:05<01:08, 15.88it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1132 [00:10<03:23,  5.21it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1132 [00:14<03:03,  5.55it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|███████████▎                                                                   | 162/1132 [00:30<03:27,  4.67it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 19%|███████████████▎                                                               | 219/1132 [00:37<02:52,  5.30it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|██████████████████▍                                                            | 265/1132 [00:46<03:39,  3.95it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 27%|█████████████████████▍                                                         | 307/1132 [00:49<01:59,  6.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 43%|█████████████████████████████████▋                                             | 482/1132 [01:02<01:06,  9.84it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:28<00:00,  7.61it/s]
  5%|███▌                                                                            | 51/1132 [00:05<01:13, 14.75it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1132 [00:10<03:43,  4.73it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1132 [00:15<03:24,  4.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 12%|█████████▎                                                                     | 133/1132 [00:31<07:02,  2.37it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 18%|██████████████▌                                                                | 209/1132 [00:39<02:59,  5.15it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 27%|█████████████████████▍                                                         | 307/1132 [00:51<02:08,  6.41it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 35%|███████████████████████████▉                                                   | 400/1132 [00:57<01:20,  9.04it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|██████████████████████████████████████████████████████████████▋                | 899/1132 [02:08<00:41,  5.58it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [02:28<00:00,  7.61it/s]
  5%|███▌                                                                            | 51/1133 [00:05<01:10, 15.37it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  5%|████▎                                                                           | 61/1133 [00:10<03:25,  5.22it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1133 [00:14<02:56,  5.77it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|███████████▎                                                                   | 162/1133 [00:29<03:21,  4.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 19%|██████████████▋                                                                | 210/1133 [00:37<02:46,  5.54it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|█████████████████▉                                                             | 258/1133 [00:45<03:49,  3.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|██████████████████▍                                                            | 265/1133 [00:46<03:24,  4.25it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 50%|███████████████████████████████████████▎                                       | 563/1133 [01:23<04:03,  2.34it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1133/1133 [02:25<00:00,  7.81it/s]
  2%|█▉                                                                              | 27/1133 [00:06<04:34,  4.04it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|█████▎                                                                          | 75/1133 [00:13<04:10,  4.22it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 10%|███████▉                                                                       | 114/1133 [00:16<03:30,  4.84it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 11%|████████▊                                                                      | 127/1133 [00:32<05:53,  2.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 17%|█████████████▊                                                                 | 198/1133 [00:36<02:08,  7.26it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|█████████████████▊                                                             | 256/1133 [00:43<02:07,  6.86it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 38%|█████████████████████████████▋                                                 | 426/1133 [01:01<01:29,  7.93it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 59%|██████████████████████████████████████████████▊                                | 672/1133 [01:39<01:03,  7.25it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 1133/1133 [02:28<00:00,  7.63it/s]


In [12]:
# Inspect the PubChem fingerprint matrix computed for the first training fold.
data_train_PubChem[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
# Number of entries in the first validation fold.
len(total_validation_data[0])

126

In [14]:
# Featurise every validation fold. convert2vec_group returns four lists
# (descriptors, ECFP2, PubChem, MACCS), each with one entry per fold.
data_val = convert2vec_group(total_validation_data)
(
    data_val_Desc,
    data_val_ECFP2,
    data_val_PubChem,
    data_val_MACCS,
) = data_val

 21%|█████████████████▎                                                               | 27/126 [00:05<00:22,  4.39it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 60%|████████████████████████████████████████████████▏                                | 75/126 [00:11<00:11,  4.47it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 90%|████████████████████████████████████████████████████████████████████████▍       | 114/126 [00:15<00:02,  4.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:28<00:00,  4.35it/s]
  6%|████▌                                                                             | 7/126 [00:03<00:51,  2.29it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 57%|██████████████████████████████████████████████▎                                  | 72/126 [00:07<00:03, 15.17it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 74%|███████████████████████████████████████████████████████████▊                     | 93/126 [00:10<00:04,  7.75it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:11<00:00, 10.70it/s]
  2%|█▎                                                                                | 2/126 [00:04<04:48,  2.33s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  5%|███▉                                                                              | 6/126 [00:08<02:36,  1.31s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 29%|███████████████████████▊                                                         | 37/126 [00:08<00:42,  2.09it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 44%|███████████████████████████████████▎                                             | 55/126 [00:11<00:09,  7.18it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:14<00:00,  8.75it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|███▎                                                                              | 5/126 [00:02<00:52,  2.31it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 17%|██████████████▏                                                                  | 22/126 [00:05<00:19,  5.39it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 30%|████████████████████████▍                                                        | 38/126 [00:07<00:11,  7.72it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 38%|██████████████████████████████▊                                                  | 48/126 [00:07<00:09,  8.56it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 73%|███████████████████████████████████████████████████████████▏                     | 92/126 [00:10<00:03, 10.25it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:11<00:00, 11.03it/s]
  2%|█▎                                                                                | 2/126 [00:04<04:42,  2.28s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 15%|████████████▏                                                                    | 19/126 [00:12<03:33,  1.99s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 25%|████████████████████▌                                                            | 32/126 [00:22<01:00,  1.55it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:32<00:00,  3.92it/s]
 30%|████████████████████████▍                                                        | 38/126 [00:04<00:05, 15.14it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|█████████████████████████████████▍                                               | 52/126 [00:07<00:10,  6.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 44%|████████████████████████████████████                                             | 56/126 [00:10<00:13,  5.06it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 63%|███████████████████████████████████████████████████▍                             | 80/126 [00:12<00:06,  7.23it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 85%|███████████████████████████████████████████████████████████████████▉            | 107/126 [00:13<00:01, 11.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:14<00:00,  8.93it/s]
 17%|██████████████▏                                                                  | 22/126 [00:04<00:15,  6.63it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 24%|███████████████████▎                                                             | 30/126 [00:05<00:10,  8.96it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|█████████████████████████▋                                                       | 40/126 [00:07<00:12,  6.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:13<00:00,  9.38it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 18%|██████████████▊                                                                  | 23/126 [00:04<04:21,  2.54s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██████████████████                                                               | 28/126 [00:05<00:12,  7.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 30%|████████████████████████▍                                                        | 38/126 [00:05<00:08, 10.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 36%|████████████████████████████▉                                                    | 45/126 [00:07<00:13,  6.03it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 67%|██████████████████████████████████████████████████████                           | 84/126 [00:09<00:02, 16.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 84%|███████████████████████████████████████████████████████████████████▎            | 106/126 [00:16<00:05,  3.41it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [00:23<00:00,  5.39it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|███████████                                                                      | 17/125 [00:06<02:21,  1.31s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 93%|██████████████████████████████████████████████████████████████████████████▏     | 116/125 [00:14<00:00, 18.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 125/125 [00:14<00:00,  8.53it/s]
  5%|███▉                                                                              | 6/125 [00:03<01:23,  1.43it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 38%|██████████████████████████████▍                                                  | 47/125 [00:07<00:06, 11.71it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 74%|████████████████████████████████████████████████████████████▎                    | 93/125 [00:12<00:03,  9.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 125/125 [00:13<00:00,  9.51it/s]


In [15]:
# Wrap the independent-test SMILES in a one-element list so the test data has
# the list-of-sets layout that is handed to convert2vec_group() afterwards.
one_set_data_test = [list_X_smiles_test]
# Last expression of the cell -> displayed as its output (number of wrapped sets).
len(one_set_data_test)

1

In [16]:

# Featurise the single independent-test set. The result is indexed by feature
# family: 0 = descriptors, 1 = ECFP2, 2 = PubChem, 3 = MACCS fingerprints.
# NOTE(review): each family is presumably a list with one entry per input set
# (only one set here) - confirm against convert2vec_group.
data_test = convert2vec_group(one_set_data_test)
(
    data_test_Desc,
    data_test_ECFP2,
    data_test_PubChem,
    data_test_MACCS,
) = (data_test[family_idx] for family_idx in range(4))

 10%|████████▍                                                                        | 33/315 [00:04<01:35,  2.95it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 17%|█████████████▋                                                                   | 53/315 [00:07<00:33,  7.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 31%|█████████████████████████▏                                                       | 98/315 [00:11<00:20, 10.68it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 35%|███████████████████████████▉                                                    | 110/315 [00:12<00:16, 12.10it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 53%|██████████████████████████████████████████▍                                     | 167/315 [00:28<00:28,  5.21it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 65%|███████████████████████████████████████████████████▊                            | 204/315 [00:36<00:28,  3.83it/s]

  s += (eig.vec[i, eig.max] * eig.vec[j, eig.max]) ** -0.5


 69%|██████████████████████████████████████████████████████▊                         | 216/315 [00:36<00:18,  5.41it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|████████████████████████████████████████████████████████████████████████████████| 315/315 [00:49<00:00,  6.33it/s]


# 1. Data Repairing 
## Find every non-numeric value and replace it with zero
## Find every NaN value and replace it with zero as well

In [17]:
import numbers
import math 
def coerce_to_numeric(value):
    """Return `value` if it is a usable number, otherwise 0.

    Parameters
    ----------
    value : object
        A single cell taken from a descriptor table.

    Returns
    -------
    The unchanged `value` when it is a `numbers.Number` that is not NaN;
    0 for anything else (strings, None, other non-number objects, NaN, and
    numbers that cannot be interpreted as a real value, e.g. complex).

    Notes
    -----
    Infinite values are numbers and are returned unchanged.
    """
    if not isinstance(value, numbers.Number):
        return 0
    try:
        is_nan = math.isnan(value)
    except (TypeError, ValueError):
        # math.isnan cannot convert this number to a float (e.g. a complex
        # value), so it is treated as "not a usable number".
        return 0
    except OverflowError:
        # An integer too large for a float is still a valid, non-NaN number.
        return value
    return 0 if is_nan else value

# Returns a cleaned version of df[col1]
#clean_col = df[col1].apply(coerce_to_numeric)
#for data in descriptors_train.loc[0]:
#    print(coerce_to_numeric(data))

# Input: a table of descriptors; output: a list of rows (lists) in which every non-numeric or NaN entry is replaced by 0
def find_notnumber_n_replace(descriptors):
    """Convert a descriptor table into a list of cleaned rows.

    Parameters
    ----------
    descriptors
        Table of descriptor values supporting ``len()`` and ``.iloc``
        (presumably the pandas DataFrame of one fold - confirm with caller).

    Returns
    -------
    list of list
        One inner list per row, in row order, where each value has been
        passed through ``coerce_to_numeric`` (non-numeric / NaN -> 0).
    """
    # Rows are addressed by position (.iloc) rather than by label (.loc), so
    # the result does not depend on the table having a default 0..n-1 index.
    return [
        [coerce_to_numeric(value) for value in descriptors.iloc[row_pos]]
        for row_pos in range(len(descriptors))
    ]

In [18]:
# REPAIR ALL DATA
# data repairing only used for descriptor type data
def repair_data(total_vec_data):
    """Clean every descriptor table in a list of folds/sets.

    Parameters
    ----------
    total_vec_data
        Iterable of descriptor tables, one per fold (or a one-element list
        for a single set such as the independent test data).

    Returns
    -------
    list
        One cleaned table (list of rows, see ``find_notnumber_n_replace``)
        per input entry, in the same order.
    """
    # Every entry is cleaned the same way regardless of how many there are:
    # a one-element input yields a one-element result, a two-fold input keeps
    # both folds, and an empty input yields an empty list.
    return [find_notnumber_n_replace(data_fold) for data_fold in total_vec_data]

#descriptors = data_vec_train[0][0] # descriptor in fold 0
#list_data = find_notnumber_n_replace(descriptors)

In [19]:
# Sanity check: number of sets held in data_test_Desc (the test SMILES were wrapped as a single set).
len(data_test_Desc)

1

In [20]:
# Replace non-numeric / NaN descriptor entries with 0 in every split
# (train folds, validation folds, independent test set), in that order.
data_train_Desc1, data_val_Desc1, data_test_Desc1 = (
    repair_data(split_desc)
    for split_desc in (data_train_Desc, data_val_Desc, data_test_Desc)
)

# 3. Standardization (Min-Max Scaler)
Only the DESCRIPTORS are scaled.


The FINGERPRINTS are binary (0 or 1), so they do not need to be scaled.

In [21]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import train_test_split

# Only the descriptors are scaled.
# The chi-square test needs non-negative data, hence min-max scaling.

In [22]:
#Normalizer = Normalizer()
# Min-max scaler for the descriptor features; it is fitted in a later cell.
Scaler = MinMaxScaler()

In [23]:
# Stack the cleaned descriptors of fold 0's training and validation parts row-wise;
# this combined array is what the scaler is fitted on.
# NOTE(review): presumably fold 0's train + validation parts together cover the whole
# cross-validation set - confirm. It also means validation rows influence the scaler fit.
filtered_desc_train = np.concatenate((data_train_Desc1[0],data_val_Desc1[0]),axis =0)

In [24]:
# Fit the min-max scaler once on the combined fold-0 descriptors.
# fit() returns the estimator itself, so scaler1 and Scaler are the same object.
scaler1 = Scaler.fit(filtered_desc_train)

# Apply the fitted scaler to every fold. Comprehensions are used instead of a
# top-level for-loop so that no notebook-level names (e.g. data_train /
# data_val) are rebound to the last fold as a side effect of this cell.
scaled_desc_train = [scaler1.transform(fold_desc) for fold_desc in data_train_Desc1]
scaled_desc_val = [scaler1.transform(fold_desc) for fold_desc in data_val_Desc1]

# The independent test set is a single set stored at index 0.
scaled_desc_test = scaler1.transform(data_test_Desc1[0])

In [25]:
# Sanity check: number of rows in the scaled test descriptors (one row per test sample).
len(scaled_desc_test)

315

# 5. Create Graph Data 

In [26]:
# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
# Pytorch and Pytorch Geometric
import torch

import torch.nn as nn
from torch.nn import Linear
import torch.optim as optim
import torch.nn.functional as F # activation function
from torch.utils.data import Dataset, DataLoader # dataset management

In [27]:
# Convert the SMILES strings and labels into graph-structured data.
#
# Training / validation: one list of graphs per cross-validation fold.
data_list_train, data_list_val = [], []
for X_train, y_train, X_val, y_val in zip(
    total_train_data,
    total_train_targets,
    total_validation_data,
    total_validation_targets,
):
    data_graph_train = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(X_train, y_train)
    data_list_train.append(data_graph_train)

    data_graph_val = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(X_val, y_val)
    data_list_val.append(data_graph_val)

# Independent test set: a single list of graphs.
x_smiles, y = list_X_smiles_test, list_y_smiles_test
data_list_test = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(x_smiles, y)

# 6. Save the preprocessed data 

In [28]:
# Save the scaled descriptors in NumPy .npy format: one file per fold for the
# training and validation splits, plus a single file for the test set.
# The number of folds is taken from the data instead of being hard-coded.
for fold_idx, fold_desc in enumerate(scaled_desc_train):
    np.save(f'final_clean_desc_train{fold_idx}.npy', fold_desc)

for fold_idx, fold_desc in enumerate(scaled_desc_val):
    np.save(f'final_clean_desc_val{fold_idx}.npy', fold_desc)

np.save('final_clean_desc_test.npy', scaled_desc_test)

In [29]:
# Save the ECFP2 fingerprints in NumPy .npy format: one file per fold for the
# training and validation splits, plus a single file for the test set.
# The number of folds is taken from the data instead of being hard-coded.
for fold_idx, fold_fp in enumerate(data_train_ECFP2):
    np.save(f'final_clean_fingerp_ECFP2_train{fold_idx}.npy', fold_fp)

for fold_idx, fold_fp in enumerate(data_val_ECFP2):
    np.save(f'final_clean_fingerp_ECFP2_val{fold_idx}.npy', fold_fp)

# NOTE(review): data_test_ECFP2 is saved as-is. If it is a one-element list of
# sets (like data_test_Desc, whose len is 1), the saved array keeps that extra
# leading axis, unlike final_clean_desc_test.npy - confirm the loader expects it.
np.save('final_clean_fingerp_ECFP2_test.npy', data_test_ECFP2)

In [31]:
# Save the PubChem fingerprints in NumPy .npy format: one file per fold for the
# training and validation splits, plus a single file for the test set.
# The number of folds is taken from the data instead of being hard-coded.
for fold_idx, fold_fp in enumerate(data_train_PubChem):
    np.save(f'final_clean_fingerp_PubChem_train{fold_idx}.npy', fold_fp)

for fold_idx, fold_fp in enumerate(data_val_PubChem):
    np.save(f'final_clean_fingerp_PubChem_val{fold_idx}.npy', fold_fp)

# NOTE(review): data_test_PubChem is saved as-is. If it is a one-element list of
# sets (like data_test_Desc, whose len is 1), the saved array keeps that extra
# leading axis, unlike final_clean_desc_test.npy - confirm the loader expects it.
np.save('final_clean_fingerp_PubChem_test.npy', data_test_PubChem)

In [32]:
# Save the MACCS fingerprints in NumPy .npy format: one file per fold for the
# training and validation splits, plus a single file for the test set.
# The number of folds is taken from the data instead of being hard-coded.
for fold_idx, fold_fp in enumerate(data_train_MACCS):
    np.save(f'final_clean_fingerp_MACCS_train{fold_idx}.npy', fold_fp)

for fold_idx, fold_fp in enumerate(data_val_MACCS):
    np.save(f'final_clean_fingerp_MACCS_val{fold_idx}.npy', fold_fp)

# NOTE(review): data_test_MACCS is saved as-is. If it is a one-element list of
# sets (like data_test_Desc, whose len is 1), the saved array keeps that extra
# leading axis, unlike final_clean_desc_test.npy - confirm the loader expects it.
np.save('final_clean_fingerp_MACCS_test.npy', data_test_MACCS)

In [33]:
import os


def _save_graph_list(graphs, directory):
    """Write each item of `graphs` to `<directory>/tensor<idx>.pt` with torch.save.

    The directory is created if needed; an already existing directory is
    reused, so the cell can be re-run without raising FileExistsError.
    Existing files with the same name are overwritten, but files left over
    from an earlier, longer run are not removed.
    """
    os.makedirs(directory, exist_ok=True)
    for idx, tensor in enumerate(graphs):
        torch.save(tensor, f"{directory}/tensor{idx}.pt")


# Training graphs: one directory per fold (data_train_0, data_train_1, ...).
# The number of folds is taken from the data instead of being hard-coded.
for i, fold_graphs in enumerate(data_list_train):
    _save_graph_list(fold_graphs, f"data_train_{i}")

# Validation graphs: one directory per fold (data_val_0, data_val_1, ...).
for i, fold_graphs in enumerate(data_list_val):
    _save_graph_list(fold_graphs, f"data_val_{i}")

# Independent test graphs: a single directory.
_save_graph_list(data_list_test, "data_test")

In [34]:
# Save the target labels in NumPy .npy format: one file per fold for the
# training and validation splits, plus a single file for the test set.
# The number of folds is taken from the data instead of being hard-coded.
for fold_idx, fold_targets in enumerate(total_train_targets):
    np.save(f'total_train_targets{fold_idx}.npy', fold_targets)

for fold_idx, fold_targets in enumerate(total_validation_targets):
    np.save(f'total_validation_targets{fold_idx}.npy', fold_targets)

# The test labels are converted to an array before saving.
total_test_targets = np.array(list_y_smiles_test)
np.save('total_test_targets.npy', total_test_targets)