#### After running training_and_test_compounds_to_sdf.ipynb, the generated SDF file is processed with PaDEL to compute topological descriptors. The molecular descriptors generated by PaDEL are saved in "molecular_descriptors_csv/molecular_descriptors_with_pIC50_values.csv".
#### This notebook reads those descriptors and applies min-max scaling to them (the `values` column is scaled as well). The resulting file is saved in 'molecular_descriptors_csv/min_max_scaled_molecular_descriptors_with_pIC50_values.csv'.
#### The fitted scaler is also saved (pickled) in 'scaler_data/scaler.dat'.

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle

In [2]:
# Load the molecular descriptor table (descriptors plus the 'values' column) from disk
data_file = "molecular_descriptors_csv/molecular_descriptors_with_pIC50_values.csv"
csv_df = pd.read_csv(filepath_or_buffer=data_file)

In [3]:
# Display the loaded table to inspect its columns and a sample of rows
csv_df

Unnamed: 0.1,Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
0,0,ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=...,0,1.8332,3.360622,38.8217,61.657102,23,25,44,...,61.601358,2.053379,26.207945,2.543160,12.436456,2949.0,42.0,6.330,156.0,-0.477121
1,1,CN1N=C(C=C1C(F)(F)F)C1=CC=C(S1)C1=CC=NC(SCC(=O...,0,2.3025,5.301506,48.2354,62.914895,22,24,48,...,67.019947,2.030907,34.022386,2.543262,15.595702,3852.0,48.0,5.512,176.0,-1.000000
2,2,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,0,3.4539,11.929425,65.7244,70.371067,22,24,51,...,65.276625,2.039895,29.438396,2.543325,12.315692,3324.0,48.0,5.515,168.0,-1.041393
3,3,CC1=NC(=CS1)C1=NC(=CS1)C1=NC(SCC(=O)NC2=CC=C(C...,0,1.7308,2.995669,46.8983,59.957102,22,24,43,...,59.520206,2.052421,29.687873,2.543250,15.458889,2712.0,39.0,4.410,152.0,-1.146128
4,4,ClC1=CC=C(NC(=O)CSC2=NC=CC(=N2)C2=CC(=NO2)C2=C...,0,0.3442,0.118474,27.0317,58.045895,23,25,44,...,59.752710,2.060438,23.645955,5.658412,12.395061,2713.0,40.0,6.084,150.0,-1.176091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,89,IC1=CC=C2N(CC3=CC4=CC=CC=C4S3)C(=O)C(=O)C2=C1,0,1.1237,1.262702,39.2394,47.541930,15,16,32,...,45.712208,2.077828,14.345409,5.129616,3.515549,1034.0,36.0,4.630,124.0,0.022276
90,90,ClC1=C2C(=O)C(=O)N(CC3=CC4=CC=CC=C4S3)C2=CC=C1,0,0.4479,0.200614,30.9088,44.371930,15,16,32,...,45.710723,2.077760,14.357961,5.127368,3.514977,1019.0,37.0,4.092,124.0,-1.049218
91,91,IC1=CC=C2N(C\C=C\C3=CC4=CC=CC=C4S3)C(=O)C(=O)C...,0,1.5915,2.532872,49.5580,52.395516,15,16,36,...,49.706648,2.071110,14.299735,5.122215,3.489699,1455.0,38.0,5.396,132.0,-1.371068
92,92,ClC1=CC=C(NC(=O)C2=CC=C(CN3C(=O)C(=O)C4=CC(I)=...,0,1.2332,1.520782,52.2965,58.237516,17,17,40,...,57.298165,2.046363,22.437376,7.693744,6.565223,2255.0,44.0,4.842,152.0,-1.099335


In [4]:
# Keep the SMILES strings (the 'Name' column) so they can be re-attached after filtering
names = csv_df.loc[:, 'Name']

In [5]:
# Keep only the descriptor block: every column from 'nAcid' through 'Zagreb'
# (label-based .loc slices include both end points)
first_descriptor, last_descriptor = 'nAcid', 'Zagreb'
descriptor_df = csv_df.loc[:, first_descriptor:last_descriptor]

In [6]:
# Display the isolated descriptor columns
descriptor_df

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,1.8332,3.360622,38.8217,61.657102,23,25,44,30,14,...,10.727241,61.601358,2.053379,26.207945,2.543160,12.436456,2949.0,42.0,6.330,156.0
1,0,2.3025,5.301506,48.2354,62.914895,22,24,48,33,15,...,10.604914,67.019947,2.030907,34.022386,2.543262,15.595702,3852.0,48.0,5.512,176.0
2,0,3.4539,11.929425,65.7244,70.371067,22,24,51,32,19,...,10.157118,65.276625,2.039895,29.438396,2.543325,12.315692,3324.0,48.0,5.515,168.0
3,0,1.7308,2.995669,46.8983,59.957102,22,24,43,29,14,...,10.674533,59.520206,2.052421,29.687873,2.543250,15.458889,2712.0,39.0,4.410,152.0
4,0,0.3442,0.118474,27.0317,58.045895,23,25,44,29,15,...,9.592282,59.752710,2.060438,23.645955,5.658412,12.395061,2713.0,40.0,6.084,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,0,1.1237,1.262702,39.2394,47.541930,15,16,32,22,10,...,13.092116,45.712208,2.077828,14.345409,5.129616,3.515549,1034.0,36.0,4.630,124.0
90,0,0.4479,0.200614,30.9088,44.371930,15,16,32,22,10,...,10.219127,45.710723,2.077760,14.357961,5.127368,3.514977,1019.0,37.0,4.092,124.0
91,0,1.5915,2.532872,49.5580,52.395516,15,16,36,24,12,...,12.360093,49.706648,2.071110,14.299735,5.122215,3.489699,1455.0,38.0,5.396,132.0
92,0,1.2332,1.520782,52.2965,58.237516,17,17,40,28,12,...,13.048255,57.298165,2.046363,22.437376,7.693744,6.565223,2255.0,44.0,4.842,152.0


In [7]:
# Remove columns with standard deviation = 0 (constant descriptors carry no information)
descriptor_df = descriptor_df.loc[:, descriptor_df.std() != 0]
num_of_samples = len(descriptor_df)
# Take an explicit copy so that columns added or overwritten later modify an
# independent frame rather than a slice derived from csv_df; this is what
# otherwise triggers pandas' SettingWithCopyWarning in the following cells.
processed_descriptor_df = descriptor_df.copy()
processed_descriptor_df

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,1.8332,3.360622,38.8217,61.657102,23,25,44,30,14,...,10.727241,61.601358,2.053379,26.207945,2.543160,12.436456,2949.0,42.0,6.330,156.0
1,0,2.3025,5.301506,48.2354,62.914895,22,24,48,33,15,...,10.604914,67.019947,2.030907,34.022386,2.543262,15.595702,3852.0,48.0,5.512,176.0
2,0,3.4539,11.929425,65.7244,70.371067,22,24,51,32,19,...,10.157118,65.276625,2.039895,29.438396,2.543325,12.315692,3324.0,48.0,5.515,168.0
3,0,1.7308,2.995669,46.8983,59.957102,22,24,43,29,14,...,10.674533,59.520206,2.052421,29.687873,2.543250,15.458889,2712.0,39.0,4.410,152.0
4,0,0.3442,0.118474,27.0317,58.045895,23,25,44,29,15,...,9.592282,59.752710,2.060438,23.645955,5.658412,12.395061,2713.0,40.0,6.084,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,0,1.1237,1.262702,39.2394,47.541930,15,16,32,22,10,...,13.092116,45.712208,2.077828,14.345409,5.129616,3.515549,1034.0,36.0,4.630,124.0
90,0,0.4479,0.200614,30.9088,44.371930,15,16,32,22,10,...,10.219127,45.710723,2.077760,14.357961,5.127368,3.514977,1019.0,37.0,4.092,124.0
91,0,1.5915,2.532872,49.5580,52.395516,15,16,36,24,12,...,12.360093,49.706648,2.071110,14.299735,5.122215,3.489699,1455.0,38.0,5.396,132.0
92,0,1.2332,1.520782,52.2965,58.237516,17,17,40,28,12,...,13.048255,57.298165,2.046363,22.437376,7.693744,6.565223,2255.0,44.0,4.842,152.0


In [8]:
# Extract the target column as a Series
# (presumably the pIC50 values, going by the input file name — confirm)
values_df = csv_df.loc[:, 'values']

In [9]:
# Display the extracted 'values' column
values_df

0    -0.477121
1    -1.000000
2    -1.041393
3    -1.146128
4    -1.176091
        ...   
89    0.022276
90   -1.049218
91   -1.371068
92   -1.099335
93   -1.243038
Name: values, Length: 94, dtype: float64

In [10]:
# Attach the target column as the last column of the descriptor table.
# assign() returns a new DataFrame, so nothing is written into a frame that
# pandas may consider a slice of another one (the in-place form
# `df['values'] = ...` emits SettingWithCopyWarning in that situation).
processed_descriptor_df = processed_descriptor_df.assign(**{'values': values_df})
processed_descriptor_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
0,0,1.8332,3.360622,38.8217,61.657102,23,25,44,30,14,...,61.601358,2.053379,26.207945,2.543160,12.436456,2949.0,42.0,6.330,156.0,-0.477121
1,0,2.3025,5.301506,48.2354,62.914895,22,24,48,33,15,...,67.019947,2.030907,34.022386,2.543262,15.595702,3852.0,48.0,5.512,176.0,-1.000000
2,0,3.4539,11.929425,65.7244,70.371067,22,24,51,32,19,...,65.276625,2.039895,29.438396,2.543325,12.315692,3324.0,48.0,5.515,168.0,-1.041393
3,0,1.7308,2.995669,46.8983,59.957102,22,24,43,29,14,...,59.520206,2.052421,29.687873,2.543250,15.458889,2712.0,39.0,4.410,152.0,-1.146128
4,0,0.3442,0.118474,27.0317,58.045895,23,25,44,29,15,...,59.752710,2.060438,23.645955,5.658412,12.395061,2713.0,40.0,6.084,150.0,-1.176091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,0,1.1237,1.262702,39.2394,47.541930,15,16,32,22,10,...,45.712208,2.077828,14.345409,5.129616,3.515549,1034.0,36.0,4.630,124.0,0.022276
90,0,0.4479,0.200614,30.9088,44.371930,15,16,32,22,10,...,45.710723,2.077760,14.357961,5.127368,3.514977,1019.0,37.0,4.092,124.0,-1.049218
91,0,1.5915,2.532872,49.5580,52.395516,15,16,36,24,12,...,49.706648,2.071110,14.299735,5.122215,3.489699,1455.0,38.0,5.396,132.0,-1.371068
92,0,1.2332,1.520782,52.2965,58.237516,17,17,40,28,12,...,57.298165,2.046363,22.437376,7.693744,6.565223,2255.0,44.0,4.842,152.0,-1.099335


In [11]:
# Add back the smiles of compounds as the first column.
# DataFrame.insert raises ValueError when the column already exists, so the
# call is guarded to keep this cell safe to re-run.
if "Name" not in processed_descriptor_df.columns:
    processed_descriptor_df.insert(0, "Name", names)

In [12]:
# Min-max scaling of every column except 'Name' (descriptors and 'values').
# Columns are selected by label rather than with `.loc[:, 1:]`: an integer
# slice inside a label-based indexer is rejected by pandas when the column
# labels are strings.
# NOTE(review): 'values' is fitted together with the descriptors, so the saved
# scaler expects that column too — confirm this is intended for inference.
columns_to_scale = [col for col in processed_descriptor_df.columns if col != 'Name']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(processed_descriptor_df[columns_to_scale])
# Replace the columns wholesale so integer-typed descriptors become float
# without relying on in-place dtype coercion.
processed_descriptor_df[columns_to_scale] = scaled_values
processed_descriptor_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
0,ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=...,0.0,0.663671,0.197552,0.342047,0.607035,1.000000,1.00,0.489362,0.653846,...,0.645277,0.744731,0.670546,0.000130,0.671117,0.486961,0.535714,0.805872,0.605263,0.566680
1,CN1N=C(C=C1C(F)(F)F)C1=CC=C(S1)C1=CC=NC(SCC(=O...,0.0,0.732582,0.311700,0.461683,0.629867,0.944444,0.95,0.574468,0.769231,...,0.743987,0.581986,0.919028,0.000137,0.841602,0.650489,0.642857,0.709806,0.736842,0.433320
2,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,0.0,0.901649,0.701501,0.683945,0.765216,0.944444,0.95,0.638298,0.730769,...,0.712229,0.647073,0.773268,0.000141,0.664601,0.554871,0.642857,0.710159,0.684211,0.422763
3,CC1=NC(=CS1)C1=NC(=CS1)C1=NC(SCC(=O)NC2=CC=C(C...,0.0,0.648635,0.176089,0.444690,0.576176,0.944444,0.95,0.468085,0.615385,...,0.607365,0.737794,0.781200,0.000136,0.834219,0.444042,0.482143,0.580388,0.578947,0.396050
4,ClC1=CC=C(NC(=O)CSC2=NC=CC(=N2)C2=CC(=NO2)C2=C...,0.0,0.445032,0.006875,0.192211,0.541483,1.000000,1.00,0.489362,0.615385,...,0.611601,0.795860,0.589081,0.195103,0.668884,0.444223,0.500000,0.776982,0.565789,0.388408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,IC1=CC=C2N(CC3=CC4=CC=CC=C4S3)C(=O)C(=O)C2=C1,0.0,0.559491,0.074169,0.347355,0.350809,0.555556,0.55,0.234043,0.346154,...,0.355826,0.921801,0.293344,0.162008,0.189712,0.140167,0.428571,0.606224,0.394737,0.694051
90,ClC1=C2C(=O)C(=O)N(CC3=CC4=CC=CC=C4S3)C2=CC=C1,0.0,0.460259,0.011706,0.241484,0.293265,0.555556,0.55,0.234043,0.346154,...,0.355799,0.921312,0.293744,0.161867,0.189681,0.137450,0.446429,0.543042,0.394737,0.420767
91,IC1=CC=C2N(C\C=C\C3=CC4=CC=CC=C4S3)C(=O)C(=O)C...,0.0,0.628181,0.148871,0.478491,0.438914,0.555556,0.55,0.319149,0.423077,...,0.428593,0.873151,0.291892,0.161545,0.188317,0.216407,0.464286,0.696183,0.447368,0.338680
92,ClC1=CC=C(NC(=O)C2=CC=C(CN3C(=O)C(=O)C4=CC(I)=...,0.0,0.575569,0.089348,0.513294,0.544961,0.666667,0.60,0.404255,0.576923,...,0.566886,0.693921,0.550651,0.322488,0.354284,0.361282,0.571429,0.631122,0.578947,0.407985


In [13]:
# Save scaler for future use during inference.
# The context manager closes the file even if pickling fails; a bare open()
# call leaves the handle to be closed only when it is garbage-collected.
with open('scaler_data/scaler.dat', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [14]:
# Write the min-max scaled descriptor table to a CSV file.
# NOTE(review): to_csv writes the row index as an unnamed leading column by
# default — confirm downstream readers expect that column.
scaled_csv_path = 'molecular_descriptors_csv/min_max_scaled_molecular_descriptors_with_pIC50_values.csv'
processed_descriptor_df.to_csv(scaled_csv_path)