In [None]:
from math import sqrt
from typing import Tuple, List

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
#import openbabel
from openbabel import pybel
from PyBioMed.PyMolecule.fingerprint import CalculatePubChemFingerprint,CalculateECFP2Fingerprint
from rdkit import Chem
from rdkit.Chem.rdchem import Atom
from sklearn.metrics import f1_score, confusion_matrix

from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as G_Loader 
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold


from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_curve, auc 
from sklearn.metrics import precision_recall_curve

In [None]:
def compute_fingerprint_features(smiles_list: List[str]) -> np.ndarray:
    """
    Build one combined ECFP2 + PubChem fingerprint row per SMILES string.

    Parameters
    ----------
    smiles_list: List[str]
        SMILES strings to featurise.

    Returns
    -------
    np.ndarray
        An int32 array with ``len(smiles_list)`` rows and 1024 + 881
        columns. Row ``i`` is the ECFP2 part followed by the PubChem part
        for ``smiles_list[i]``.

    Notes
    -----
    Only element ``[0]`` of the ``CalculateECFP2Fingerprint`` result is used.
    SMILES strings are not validated here; whatever ``Chem.MolFromSmiles``
    returns is passed straight to the fingerprint functions.
    """
    parsed = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

    # One row per molecule, pre-filled with zeros.
    n_columns = 1024 + 881
    features = np.zeros((len(smiles_list), n_columns), dtype=np.int32)

    for row, molecule in enumerate(parsed):
        ecfp2 = CalculateECFP2Fingerprint(molecule)
        pubchem = CalculatePubChemFingerprint(molecule)
        features[row] = np.concatenate((ecfp2[0], pubchem))

    return features


def compute_descriptor_features(smiles_list: List[str]) -> pd.DataFrame:
    """
    Compute mordred descriptors (3D descriptors ignored) for SMILES strings.

    Parameters
    ----------
    smiles_list: List[str]
        SMILES strings to featurise.

    Returns
    -------
    pd.DataFrame
        The frame produced by ``Calculator.pandas``; one row per input
        SMILES string, in input order.
    """
    calculator = Calculator(descriptors, ignore_3D=True)
    return calculator.pandas([Chem.MolFromSmiles(smiles) for smiles in smiles_list])


In [None]:
%run ./graph_feature.ipynb 
%run ./dataset_processing.ipynb 

In [None]:
# Load every SMILES string together with its label.
pd_smiles, pd_labels = get_dataset('all_dataset_mtbpen5371.csv')

# Hold out 10% of the data as the test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    pd_smiles,
    pd_labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Descriptive aliases for the four pieces of the train/test split.
pd_smiles_train, pd_smiles_test = X_train, X_test
pd_labels_train, pd_labels_test = y_train, y_test

In [None]:
# Plain Python lists of the split (labels first, then SMILES strings).
list_y_smiles, list_y_smiles_test = list(pd_labels_train), list(pd_labels_test)
list_X_smiles, list_X_smiles_test = list(pd_smiles_train), list(pd_smiles_test)

In [None]:
# Shuffle the training molecules once so both labels are spread across the
# folds, then build the k-fold cross-validation split.
# The shuffle is seeded: without random_state every run produced different
# folds, so saved indices and feature files could not be reproduced even
# though the train/test split itself is seeded.
# ========================================================================================
k = 10
X_1, y_1 = shuffle(list_X_smiles, list_y_smiles, random_state=42)
train_data = X_1
train_targets = y_1
(
    all_train_indices,
    all_val_indices,
    total_train_data,
    total_train_targets,
    total_validation_data,
    total_validation_targets,
) = CF_Validation_version_2(k, train_data, train_targets)

In [None]:
# Persist the cross-validation split returned by CF_Validation_version_2
# so that it can be reloaded later.
np.save('train_indices.npy', all_train_indices)
np.save('val_indices.npy', all_val_indices)
# NOTE(review): despite the file name, this stores only the NUMBER of test
# molecules (a single integer), not their indices. Confirm whether the real
# test indices were meant to be saved here before relying on this file.
np.save('test_indices.npy', len(list_X_smiles_test))

In [None]:
# Turn SMILES strings into feature vectors (done after the folds are built).
def convert2vec(input_data_smiles):
    """Featurise SMILES strings.

    Returns a ``(descriptors, fingerprints)`` pair: the descriptor DataFrame
    from ``compute_descriptor_features`` and the fingerprint array from
    ``compute_fingerprint_features``. Fingerprints are computed first.
    """
    fingerprint_matrix = compute_fingerprint_features(input_data_smiles)
    return compute_descriptor_features(input_data_smiles), fingerprint_matrix

# Featurise a collection of folds, one convert2vec call per fold.
def convert2vec_group(total_data):
    """Return a list with one ``convert2vec`` result per fold of ``total_data``.

    Each entry is a (descriptors, fingerprints) pair, so the result has
    "number of folds" entries of two items each.
    """
    return [convert2vec(fold_smiles) for fold_smiles in total_data]
                                                              # fold x numberdata x number features(2 x 1) descriptor and fing                
# Train and validation hold one (descriptors, fingerprints) pair per fold;
# the test set is a single such pair.
data_vec_train = convert2vec_group(total_train_data)
data_vec_val = convert2vec_group(total_validation_data)
data_vec_test = convert2vec(list_X_smiles_test)

In [None]:
import numbers
def coerce_to_numeric(value):
    """Return ``value`` if it is a usable number, otherwise ``0``.

    Anything that is not a ``numbers.Number`` (strings, ``None``, error or
    placeholder objects, ...) becomes ``0``. NaN and +/-infinity are
    ``Number`` instances too, but they are not usable feature values and
    would be rejected by the scikit-learn steps that follow, so they are
    also mapped to ``0``.
    """
    if not isinstance(value, numbers.Number):
        return 0
    # NaN is the only value that differs from itself; infinities are matched
    # by equality so no extra import is needed.
    if value != value or value in (float("inf"), float("-inf")):
        return 0
    return value

# Returns a cleaned version of df[col1]
#clean_col = df[col1].apply(coerce_to_numeric)
#for data in descriptors_train.loc[0]:
#    print(coerce_to_numeric(data))

# Input is a descriptor DataFrame; output is a plain list of row lists.
def find_notnumber_n_replace(descriptors):
    """Clean every cell of ``descriptors`` with ``coerce_to_numeric``.

    Parameters
    ----------
    descriptors : pandas.DataFrame
        Descriptor table, one row per molecule.

    Returns
    -------
    list of list
        ``result[r][c]`` is the cleaned value of row ``r``, column ``c``.

    Rows are read by position (``.iloc``). The previous label lookup
    (``.loc[j]`` with ``j`` from ``range(len(...))``) only worked when the
    frame had the default 0..n-1 index and raised ``KeyError`` (or picked the
    wrong row) otherwise.
    """
    return [
        [coerce_to_numeric(cell) for cell in descriptors.iloc[row_pos]]
        for row_pos in range(len(descriptors))
    ]

In [None]:
# Replace non-numeric descriptor entries in featurised data.
def repair_data(total_vec_data):
    """Clean the descriptor part of featurised data.

    Parameters
    ----------
    total_vec_data
        Either a single ``(descriptors, fingerprints)`` pair, or a sequence
        of such pairs (one per fold).

    Returns
    -------
    list
        A list of ``[cleaned_descriptor_rows, fingerprints]`` entries, one
        per pair. A single input pair therefore yields a one-element list.

    The two input shapes used to be told apart with ``len(...) > 2``, which
    mistook one or two folds for a single pair and failed on empty input.
    A single pair is now recognised by its first element being the
    descriptor DataFrame itself.
    """
    is_single_pair = len(total_vec_data) > 0 and isinstance(total_vec_data[0], pd.DataFrame)
    pairs = [total_vec_data] if is_single_pair else total_vec_data

    total_data_vec = []
    for pair in pairs:
        # Index instead of unpacking so extra trailing items are ignored,
        # exactly as before.
        total_data_vec.append([find_notnumber_n_replace(pair[0]), pair[1]])
    return total_data_vec

#descriptors = data_vec_train[0][0] # descriptor in fold 0
#list_data = find_notnumber_n_replace(descriptors)

# Clean the descriptors of every split (train folds, validation folds, test).
data_vec_train1, data_vec_val1, data_vec_test1 = map(
    repair_data, (data_vec_train, data_vec_val, data_vec_test)
)

In [None]:
def cek_null(descriptors):
    """Report which columns of ``descriptors`` contain null values.

    Parameters
    ----------
    descriptors : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    (int, list)
        The number of columns with at least one null, and the labels of
        those columns in column order.
    """
    # Count nulls per column once; the count used to be recomputed over the
    # whole frame several times for every single column.
    null_counts = descriptors.isnull().sum()
    name_isnull_label = [label for label, n_null in null_counts.items() if n_null > 0]
    return len(name_isnull_label), name_isnull_label

# Null check on the raw descriptor frame of fold 0 only (a single spot check,
# not repeated for the other folds).
count_isnull, name_isnull_label = cek_null(data_vec_train[0][0])

In [None]:
# Number of fold-0 training descriptor columns that contain nulls.
count_isnull

In [None]:
# Labels of the fold-0 training descriptor columns that contain nulls.
name_isnull_label

In [None]:
# Same null check on the raw descriptor frame of validation fold 0.
count_isnull_val, name_isnull_label_val = cek_null(data_vec_val[0][0])

In [None]:
# Number of fold-0 validation descriptor columns that contain nulls.
count_isnull_val

In [None]:
# Labels of the fold-0 validation descriptor columns that contain nulls.
name_isnull_label_val

In [None]:
# Null check on the raw test descriptors; data_vec_test is a single
# (descriptors, fingerprints) pair, so element 0 is the descriptor frame.
count_isnull_test, name_isnull_label_test = cek_null(data_vec_test[0])

In [None]:
# Number of test descriptor columns that contain nulls.
count_isnull_test

In [None]:
# Labels of the test descriptor columns that contain nulls.
name_isnull_label_test

In [None]:
def cek_null_fingerprints(fingerprints):
    """Locate fingerprint entries that are neither 0 nor 1.

    Parameters
    ----------
    fingerprints
        Two-level sequence (rows of bit values), e.g. a 2D array.

    Returns
    -------
    (list, list)
        ``get_idx`` holds the column position and ``get_row`` the row
        position of every offending entry, in row-major scan order.
    """
    offenders = [
        (column, row)
        for row, bits in enumerate(fingerprints)
        for column, bit in enumerate(bits)
        if not (bit == 0 or bit == 1)
    ]
    get_idx = [column for column, _ in offenders]
    get_row = [row for _, row in offenders]
    return get_idx, get_row

# Look for non-binary fingerprint entries in fold 0 of train/validation and in
# the test set (element 1 of each pair is the fingerprint array).
get_idx_train, get_row_train = cek_null_fingerprints(data_vec_train[0][1])
get_idx_val, get_row_val = cek_null_fingerprints(data_vec_val[0][1])
get_idx_test, get_row_test = cek_null_fingerprints(data_vec_test[1])

In [None]:
# Column positions of non-binary entries in the fold-0 training fingerprints.
get_idx_train

In [None]:
# Row positions of non-binary entries in the fold-0 training fingerprints.
get_row_train

In [None]:
# Column positions of non-binary entries in the fold-0 validation fingerprints.
get_idx_val

In [None]:
# Row positions of non-binary entries in the fold-0 validation fingerprints.
# (This cell used to display get_idx_val a second time, so the validation rows
# were never inspected, unlike train and test.)
get_row_val

In [None]:
# Column positions of non-binary entries in the test fingerprints.
get_idx_test

In [None]:
# Row positions of non-binary entries in the test fingerprints.
get_row_test

# 1. Feature selection

In [None]:
# use standardized descriptor 
# i think the normalized data is too small 
# combine first fingerprints and descriptor 
from sklearn.feature_selection import VarianceThreshold,chi2

In [None]:
# Split the cleaned training folds into parallel per-fold lists:
# element 0 of each fold is the descriptor rows, element 1 the fingerprints.
desc_train = [fold[0] for fold in data_vec_train1]
fingerp_train = [fold[1] for fold in data_vec_train1]

In [None]:
# Same split for the cleaned validation folds.
desc_val = [fold[0] for fold in data_vec_val1]
fingerp_val = [fold[1] for fold in data_vec_val1]

In [None]:
# repair_data wraps the single test pair in a one-element list; unpack it.
desc_test, fingerp_test = data_vec_test1[0][0], data_vec_test1[0][1]

In [None]:
# Sanity check: the cleaned descriptors of a fold are a plain Python list.
type(desc_train[0])

In [None]:
# Fit the variance filter on fold 0 only: stack its training and validation
# parts. Descriptors are Python lists (so `+` concatenates rows); fingerprints
# are arrays and are stacked along axis 0.
# NOTE(review): presumably train + validation of one fold covers the whole
# training set — confirm against CF_Validation_version_2.
total_desc       = desc_train[0] + desc_val[0]
total_fingerp    = np.concatenate ((fingerp_train[0], fingerp_val[0]), axis= 0)

In [None]:
len(total_desc) # number of molecules in fold 0 (train + validation rows)

In [None]:
# Number of features per molecule before any selection:
# descriptors first, fingerprint bits second.
print(len(total_desc[0])) 
print(len(total_fingerp[0])) 

In [None]:
# First selection stage: drop low-variance features (the chi2 stage follows later).
# The cutoff .8 * (1 - .8) = 0.16 is the variance of a 0/1 feature that takes
# one value 80% of the time; for the descriptors it acts as a raw variance cutoff.

# Descriptor filter, fitted on the fold-0 rows.
pd_desc = pd.DataFrame(total_desc)
variance_desc = VarianceThreshold(threshold=(.8 * (1 - .8)))
result_desc = variance_desc.fit(pd_desc)

# Fingerprint filter, fitted on the fold-0 rows.
pd_fingerp = pd.DataFrame(total_fingerp)
variance_fingerp = VarianceThreshold(threshold=(.8 * (1 - .8)))
result_fingerp = variance_fingerp.fit(pd_fingerp)

In [None]:
# Apply the fitted variance filters to every training fold.
group_desc_train = [result_desc.transform(fold_desc) for fold_desc in desc_train]
group_fingerp_train = [result_fingerp.transform(fold_fp) for fold_fp in fingerp_train]

In [None]:
# Number of descriptor features that survive the variance filter.
len(group_desc_train[0][0])

In [None]:
# Number of fingerprint bits that survive the variance filter.
len(group_fingerp_train[0][0])

In [None]:
# Names of the descriptor columns kept by the variance filter.
# NOTE(review): pd_desc was built from a plain list, so these are presumably
# positional names rather than descriptor names — confirm.
desc_name = variance_desc.get_feature_names_out()
len(desc_name) 

In [None]:
# Show the kept descriptor column names.
desc_name

In [None]:
# Number of fingerprint bits kept (same expression as displayed a few cells earlier).
len(group_fingerp_train[0][0])

In [None]:
# Names of the fingerprint columns kept by the variance filter.
fingerp_name = variance_fingerp.get_feature_names_out()
len(fingerp_name)

In [None]:
# Show the kept fingerprint column names.
fingerp_name 

In [None]:
# Apply the fitted variance filters to every validation fold.
group_desc_val = [result_desc.transform(fold_desc) for fold_desc in desc_val]
group_fingerp_val = [result_fingerp.transform(fold_fp) for fold_fp in fingerp_val]

In [None]:
# Apply the same fitted variance filters to the test set (no refitting).
group_desc_test = result_desc.transform(desc_test)
group_fingerp_test = result_fingerp.transform(fingerp_test)

# 2. Min-max scaling

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import train_test_split

# scale only descriptor 
# chi square test need positive data 


In [None]:
# Min-max scaling is used for the descriptors (a Normalizer was tried earlier).
Scaler = MinMaxScaler()

In [None]:
# Stack the variance-filtered descriptors of fold 0 (train + validation);
# only descriptors are scaled, fingerprints are left as they are.
filtered_desc_train = np.concatenate((group_desc_train[0],group_desc_val[0]),axis =0)

In [None]:
# Fit the scaler on the fold-0 descriptors; fit() returns the scaler itself,
# so scaler1 and Scaler are the same object.
scaler1       = Scaler.fit(filtered_desc_train)

In [None]:
# Scale the descriptors of every fold with the already-fitted scaler.
scaled_desc_train = [scaler1.transform(fold_desc) for fold_desc in group_desc_train]
scaled_desc_val = [scaler1.transform(fold_desc) for fold_desc in group_desc_val]

In [None]:
# Scale the test descriptors with the scaler fitted on the training data.
scaled_desc_test = scaler1.transform(group_desc_test)

In [None]:
# Inspect the scaled test descriptors.
scaled_desc_test

In [None]:
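# At this point the following feature sets are available.
#
# Fingerprints (variance-filtered, not scaled):
#   group_fingerp_train
#   group_fingerp_val
#   group_fingerp_test
#
# Descriptors (variance-filtered, then min-max scaled):
#   scaled_desc_train
#   scaled_desc_val
#   scaled_desc_test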

# 3. Chi-squared test

In [None]:
# Inputs for the chi2 test, all taken from fold 0 (train + validation stacked
# in the same order for features and labels so rows stay aligned).
np_desc_train = np.concatenate((scaled_desc_train[0],scaled_desc_val[0]),axis= 0)
np_fingerp_train = np.concatenate((group_fingerp_train[0],group_fingerp_val[0]),axis= 0)

np_label =np.concatenate((total_train_targets[0],total_validation_targets[0]),axis=0)

In [None]:
# Row counts of the two feature matrices; they should match each other.
print(len(np_desc_train))
print(len(np_fingerp_train))

In [None]:
# Number of labels; should equal the row counts printed above.
len(np_label)

In [None]:
# chi2 test of every feature against the labels. Each result is indexed below
# with [1] to obtain the p-values; despite the variable names these are chi2
# results, not F-scores.
f_score_desc = chi2(np_desc_train,np_label)
f_score_fingerp = chi2(np_fingerp_train,np_label)

In [None]:
# Element 1 of each chi2 result is taken as the per-feature p-values.
pValue_desc = pd.Series(f_score_desc[1])
pValue_fingerp = pd.Series(f_score_fingerp[1])

In [None]:
# Descriptor p-values, largest first.
pd_sort_desc = pValue_desc.sort_values(ascending=False)
pd_sort_desc

In [None]:
# Fingerprint p-values, largest first.
pd_sort_fingerp = pValue_fingerp.sort_values(ascending=False)
pd_sort_fingerp

In [None]:
def get_bad_features(pValue, thr):
    """Return the positions of features to discard after a chi2 test.

    Parameters
    ----------
    pValue : sequence of float
        Per-feature chi2 p-values (list, array or Series).
    thr : float
        p-value cutoff.

    Returns
    -------
    list of int
        Positions whose p-value is strictly greater than ``thr``.

    A small chi2 p-value means the feature depends on the label, i.e. it is
    informative; a large one means no dependence could be shown. The
    previous comparison (``p < thr``) was inverted: it flagged the
    informative features as bad and kept only those that look independent of
    the label.
    """
    # Work on a positional array so a Series with a non-default index is
    # handled the same way as a list or an array.
    p_values = np.asarray(pValue, dtype=float)
    return np.flatnonzero(p_values > thr).tolist()

# p-value cutoffs: thr1 is used for the descriptors, thr2 for the fingerprints.
thr1 = 0.7
thr2 = 0.5
# Column positions (within the variance-filtered matrices) to be deleted.
list_bad_feature_chi_desc = get_bad_features(pValue_desc,thr1)
list_bad_feature_chi_fingerp = get_bad_features(pValue_fingerp,thr2)

In [None]:
# How many descriptor / fingerprint columns will be deleted.
print(len(list_bad_feature_chi_desc))
print(len(list_bad_feature_chi_fingerp))

In [None]:
# Empty containers for the fingerprint results; the following cell fills them.
final_clean_fingerp_train = []
final_clean_fingerp_val = []
final_clean_fingerp_test = []

# Remove the chi2-flagged descriptor columns from every fold and from the test set.
final_clean_desc_train = [
    np.delete(fold_desc, list_bad_feature_chi_desc, axis=1)
    for fold_desc in scaled_desc_train
]
final_clean_desc_val = [
    np.delete(fold_desc, list_bad_feature_chi_desc, axis=1)
    for fold_desc in scaled_desc_val
]
final_clean_desc_test = np.delete(scaled_desc_test, list_bad_feature_chi_desc, axis=1)

In [None]:
# Remove the chi2-flagged fingerprint columns from every fold and from the test set.
# The lists are rebuilt from scratch here instead of being appended to, so
# running this cell more than once no longer doubles the number of folds.
final_clean_fingerp_train = [
    np.delete(fold_fp, list_bad_feature_chi_fingerp, axis=1)
    for fold_fp in group_fingerp_train
]
final_clean_fingerp_val = [
    np.delete(fold_fp, list_bad_feature_chi_fingerp, axis=1)
    for fold_fp in group_fingerp_val
]

final_clean_fingerp_test = np.delete(group_fingerp_test, list_bad_feature_chi_fingerp, axis=1)

In [None]:
# Final descriptor feature count for train, validation and test; the three
# numbers should be identical.
print(len(final_clean_desc_train[0][0]))
print(len(final_clean_desc_val[0][0]))
print(len(final_clean_desc_test[0]))

In [None]:
# Final fingerprint feature count for train, validation and test; the three
# numbers should be identical.
print(len(final_clean_fingerp_train[0][0]))
print(len(final_clean_fingerp_val[0][0]))
print(len(final_clean_fingerp_test[0]))

In [None]:
# Spot check: final descriptor vector of the second molecule in training fold 0.
final_clean_desc_train[0][1]

In [None]:
# Inspect the final test descriptors.
final_clean_desc_test

In [None]:
# Inspect the final test fingerprints.
final_clean_fingerp_test

In [None]:
# Save the selected descriptor features (variance threshold + chi2) as .npy
# files, one file per fold: final_clean_desc_train<fold>.npy and
# final_clean_desc_val<fold>.npy. Looping over the folds replaces twenty
# copy-pasted calls that were hard-wired to exactly 10 folds.
for fold, fold_desc in enumerate(final_clean_desc_train):
    np.save(f'final_clean_desc_train{fold}.npy', fold_desc)

for fold, fold_desc in enumerate(final_clean_desc_val):
    np.save(f'final_clean_desc_val{fold}.npy', fold_desc)

np.save('final_clean_desc_test.npy', final_clean_desc_test)


In [None]:
# Save the selected fingerprint features as .npy files, one file per fold:
# final_clean_fingerp_train<fold>.npy and final_clean_fingerp_val<fold>.npy.
# The loops follow the actual number of folds instead of assuming 10.
for fold, fold_fp in enumerate(final_clean_fingerp_train):
    np.save(f'final_clean_fingerp_train{fold}.npy', fold_fp)

for fold, fold_fp in enumerate(final_clean_fingerp_val):
    np.save(f'final_clean_fingerp_val{fold}.npy', fold_fp)

np.save('final_clean_fingerp_test.npy', final_clean_fingerp_test)

In [None]:
# Test labels as an array, ready to be saved next to the feature files.
total_test_targets = np.array(list_y_smiles_test)

In [None]:
# Save the labels as .npy files as well, one file per fold:
# total_train_targets<fold>.npy and total_validation_targets<fold>.npy.
# The loops follow the actual number of folds instead of assuming 10.
for fold, fold_targets in enumerate(total_train_targets):
    np.save(f'total_train_targets{fold}.npy', fold_targets)

for fold, fold_targets in enumerate(total_validation_targets):
    np.save(f'total_validation_targets{fold}.npy', fold_targets)

np.save('total_test_targets.npy', total_test_targets)

# 4. Graph data

In [None]:
# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
# Pytorch and Pytorch Geometric
import torch

import torch.nn as nn
from torch.nn import Linear
import torch.optim as optim
import torch.nn.functional as F # activation function
from torch.utils.data import Dataset, DataLoader # dataset management

In [None]:
# Convert every cross-validation fold (training and validation part) into
# graph-structured data.
# The loop variables have fold-specific names on purpose: they used to be
# called X_train / y_train and silently overwrote the train/test split
# variables of the same name defined near the top of the notebook.
#=============================================================================================
data_list_train = []
data_list_val = []
for fold_train_smiles, fold_train_labels, fold_val_smiles, fold_val_labels in zip(
        total_train_data, total_train_targets,
        total_validation_data, total_validation_targets):
    data_list_train.append(
        create_pytorch_geometric_graph_data_list_from_smiles_and_labels(
            fold_train_smiles, fold_train_labels))
    data_list_val.append(
        create_pytorch_geometric_graph_data_list_from_smiles_and_labels(
            fold_val_smiles, fold_val_labels))


# Independent test set as graph-structured data.
#===========================================================================================
x_smiles = list_X_smiles_test
y = list_y_smiles_test

data_list_test = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(x_smiles, y)

In [None]:
# Inspect the graph objects of the second training fold.
data_list_train[1]

In [None]:
import os

In [None]:
# Write the training graphs to disk: one directory per fold (./data_train_<i>)
# and one file per graph (tensor<idx>.pt).
# makedirs(exist_ok=True) lets the cell be re-run; os.mkdir raised
# FileExistsError on the second run. The loop follows the real fold count.
for i, fold_graphs in enumerate(data_list_train):
    path = './data_train_' + str(i)
    os.makedirs(path, exist_ok=True)
    for idx, tensor in enumerate(fold_graphs):
        torch.save(tensor, os.path.join(path, f"tensor{idx}.pt"))

In [None]:
# Write the validation graphs to disk: one directory per fold (./data_val_<i>)
# and one file per graph (tensor<idx>.pt).
# makedirs(exist_ok=True) lets the cell be re-run; os.mkdir raised
# FileExistsError on the second run. The loop follows the real fold count.
for i, fold_graphs in enumerate(data_list_val):
    path = './data_val_' + str(i)
    os.makedirs(path, exist_ok=True)
    for idx, tensor in enumerate(fold_graphs):
        torch.save(tensor, os.path.join(path, f"tensor{idx}.pt"))

In [None]:
# Write the test graphs to ./data_test, one file per graph (tensor<idx>.pt).
# makedirs(exist_ok=True) lets the cell be re-run without FileExistsError.
path = './data_test'
os.makedirs(path, exist_ok=True)
for idx, tensor in enumerate(data_list_test):
    torch.save(tensor, os.path.join(path, f"tensor{idx}.pt"))