#### This notebook creates training and test data. 
#### It creates 6 clusters based on pIC50 values. 
#### From each cluster, about 30 % of the molecules (rounded down) are randomly reserved as test data and the remaining ~70 % as training data.
#### Furthermore, this notebook creates leave-one-out datasets by iterating over the training data and holding out one training sample at a time as the leave-one-out sample.

In [1]:
import pandas as pd
import numpy as np
from itertools import cycle 
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut
import sys
from sklearn.cluster import KMeans

In [2]:
# Load the min-max scaled molecular descriptors (with pIC50 values) into a DataFrame
descriptor_csv_path = 'molecular_descriptors_csv/min_max_scaled_molecular_descriptors_with_pIC50_values.csv'
csv_df = pd.read_csv(descriptor_csv_path)

In [5]:
# Separate data into clusters based on pIC50 values. From each cluster use 30 % for test and rest for training
#
# Both k-means and the random draw are seeded so that "Restart & Run All"
# reproduces exactly the same train/test split (and therefore the same files
# written by later cells).
SEED = 0
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=SEED)
# Only the cluster labels are needed, so fit() is enough (no transform output is used)
kmeans.fit(csv_df[['values']])
rng = np.random.RandomState(SEED)

x = 0.30 # fraction of test molecules in each cluster

# Collect the per-cluster slices and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop anyway.
test_parts = []
train_parts = []

for i in range(n_clusters):

    # Gather all molecule (row) positions in cluster i
    C_i = np.where(kmeans.labels_ == i)[0].tolist()
    n_i = len(C_i) # number of points in cluster i

    # int() truncates, so a cluster contributes floor(x * n_i) test molecules
    test_indices = rng.choice(C_i, int(x * n_i), replace=False)

    # Everything not drawn for the test set goes to training (cluster order kept).
    # A set gives O(1) membership checks; a distinct loop name avoids shadowing `i`.
    held_out = set(test_indices.tolist())
    train_indices = [row for row in C_i if row not in held_out]

    test_parts.append(csv_df.iloc[test_indices])
    train_parts.append(csv_df.iloc[train_indices])

csv_df_test = pd.concat(test_parts)
csv_df_train = pd.concat(train_parts)

# Every molecule must end up in exactly one of the two sets
assert len(csv_df_test) + len(csv_df_train) == len(csv_df)

# NOTE(review): the "- 3" presumably discounts the non-descriptor columns
# (index/name/target) — confirm against the CSV layout.
print(str(len(csv_df_test)) + " test molecules with "+str(csv_df_test.shape[1]-3)+ " descriptors")

print(str(len(csv_df_train)) + " train molecules with "+str(csv_df_train.shape[1]-3)+ " descriptors")


26 test molecules with 1119 descriptors
68 train molecules with 1119 descriptors


In [6]:
# Display the training split for a visual sanity check
csv_df_train

Unnamed: 0.1,Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
32,32,ClC(Cl)=C(Cl)C(=O)OC1=CC=C(C=C1)S(=O)(=O)C1=CC...,0.0,0.838906,0.538644,0.762701,0.537134,0.388889,0.35,0.382979,...,0.632770,0.104611,0.918333,0.847886,0.000000,0.537486,0.678571,0.916500,0.605263,0.700039
55,55,ClC1=CC(OC(=O)C2=CN=CC=C2)=CN=C1,0.0,0.490404,0.025001,0.000000,0.032588,0.388889,0.35,0.042553,...,0.110825,0.477238,0.286733,0.192106,0.324030,0.037849,0.142857,0.296183,0.092105,0.728352
57,57,[O-][N+](=O)C1=CC=CC(=C1)C(=O)OC1=CN=CC(Cl)=C1,0.0,0.587331,0.101344,0.049437,0.093653,0.388889,0.35,0.106383,...,0.214965,0.350415,0.441852,0.495462,0.325708,0.089460,0.250000,0.446858,0.197368,0.730438
73,73,CC1CCN(CC1)S(=O)(=O)C1=CC2=C(C=C1)N(CC1=CC=CC=...,0.0,0.065151,0.295767,0.514917,0.575819,0.388889,0.35,0.617021,...,0.566809,0.692819,0.491627,0.476723,0.372585,0.327599,0.660714,0.549618,0.592105,0.684025
81,81,CC1CCN(CC1)S(=O)(=O)C1=CC=C2N(CC3=CC=C4C=CC=CC...,0.0,0.065151,0.295767,0.514917,0.727821,0.611111,0.60,0.744681,...,0.728123,0.844539,0.492191,0.476977,0.373312,0.506157,0.821429,0.769935,0.763158,0.630247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,62,O=C1NC2=C(C=C(C=C2)S(=O)(=O)N2CCN(CCC3=CC=CC=C...,0.0,0.173017,0.133703,0.545617,0.551734,0.388889,0.35,0.595745,...,0.569249,0.727467,0.586647,0.473426,0.536257,0.368164,0.607143,0.471873,0.578947,0.294842
64,64,O=C1NC2=C(C=C(C=C2)S(=O)(=O)N2CCN(CC2)C2=CC=CC...,0.0,0.133783,0.185306,0.429155,0.415337,0.388889,0.35,0.446809,...,0.496488,0.757511,0.685392,0.473528,0.703664,0.268381,0.589286,0.207399,0.526316,0.252141
70,70,CN1CCN(CC1)S(=O)(=O)C1=CC2=C(C=C1)N(CC1=CC=CC=...,0.0,0.150507,0.162282,0.561438,0.551734,0.388889,0.35,0.595745,...,0.566809,0.692819,0.593512,0.476723,0.545493,0.327599,0.660714,0.346565,0.592105,0.222301
76,76,CN1CCN(CC1)S(=O)(=O)C1=CC=C2N(CC3=CC=C4C=CC=CC...,0.0,0.150507,0.162282,0.561438,0.703736,0.611111,0.60,0.723404,...,0.728123,0.844539,0.594076,0.476977,0.546221,0.506157,0.821429,0.566882,0.763158,0.199031


In [4]:
# Persist the held-out test molecules
csv_df_test.to_csv('data/test_compounds.csv')

# Leave-one-out cross validation over the training molecules:
# every split keeps all training rows except a single held-out one.
loo = LeaveOneOut()

# Running split number, used as the suffix of the output file names
n = 0
for train, test in loo.split(csv_df_train):
    # Rows that stay in the training set for this split
    kept_rows = pd.DataFrame(csv_df_train.iloc[train])
    kept_rows.to_csv('data/training_set_' + str(n) + '.csv')

    # The single row left out for this split
    left_out_row = pd.DataFrame(csv_df_train.iloc[test])
    left_out_row.to_csv('data/loo_set__' + str(n) + '.csv')

    n += 1