#### In this notebook the molecular descriptors for blinded compounds are min-max scaled using the same scaler used to scale training molecular descriptors.
#### The scaler that was used to min-max scale the training data is available in scaler_data/scaler.dat

#### Only the molecular descriptors that were used during training are kept.

#### The scaled molecular descriptors for blinded compounds are saved in molecular_descriptors_csv/min_max_scaled_blinded_molecular_descriptors.csv

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle

In [2]:
# Raw (unscaled) molecular descriptors of the blinded compounds.
data_file = "molecular_descriptors_csv/blinded_molecular_descriptors.csv"
csv_df = pd.read_csv(filepath_or_buffer=data_file)

In [3]:
# Show the loaded blinded-compound descriptor table for a visual sanity check.
csv_df

Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,0,3.4539,11.929425,65.7244,70.371067,22,24,51,32,...,10.157118,65.274779,2.039837,29.451814,2.542321,12.311309,3274.0,49.0,5.515,168.0
1,COC1=CC=CC(=C1)C1=C(C#N)C(=O)NC(SCC(=O)NC2=CC=...,0,0.1407,0.019796,59.2688,57.611895,12,12,46,31,...,9.501726,62.514552,2.016598,30.622416,12.731426,14.811368,3088.0,48.0,2.768,156.0
2,CC1=COC2=C1C(=O)C(=O)C1=C3CCCC(C)(C)C3=CC=C21,0,0.7113,0.505948,44.0803,47.848274,11,12,40,22,...,7.35314,45.432426,2.06511,8.247287,8.247287,0.0,891.0,45.0,4.929,130.0
3,CCOC(=O)C(=C\NC1=CC=C(C=C1)S(=O)(=O)C1=CC=C(N\...,0,-1.171,1.371241,76.8836,69.021446,12,12,57,35,...,8.668877,69.284577,1.979559,30.296757,15.70302,11.075357,4752.0,54.0,3.847,168.0
4,CCOC(=O)C(=CNC1=CC=C(C=C1)S(=O)(=O)C1=CC=C(NC=...,0,-0.7172,0.514376,97.0508,83.737376,12,12,73,41,...,8.05723,80.95678,1.974556,36.20173,26.494885,6.187816,7208.0,64.0,7.137,196.0
5,[O-][N+](=O)C1=C(C=CC(Cl)=C1)C1=CC=C(O1)C(=O)O...,0,1.9035,3.623312,23.032,44.064344,17,18,33,25,...,11.453971,50.529176,2.021167,24.716991,13.607393,6.051356,1611.0,37.0,4.328,130.0
6,ClC1=CC=C(C=C1)C(=O)OC1=CN=CC(Cl)=C1,0,1.3975,1.953006,17.5229,32.851551,12,12,24,17,...,11.124391,34.120881,2.007111,13.654925,5.613068,3.002318,562.0,22.0,3.482,84.0
7,CN1CCN(CC1)S(=O)(=O)C1=CC2=C(NC(=O)C2=O)C=C1,0,-1.6732,2.799598,51.3726,42.289895,6,6,36,21,...,8.585509,42.503469,2.023975,23.252982,10.093146,9.645249,900.0,36.0,-0.202,116.0
8,O=C1NC2=C(C=C(C=C2)S(=O)(=O)N2CCCCC2)C1=O,0,-2.6759,7.160441,42.0883,40.523102,6,6,34,20,...,8.649042,40.645753,2.032288,20.036741,10.090869,6.433519,773.0,34.0,1.019,110.0
9,IC1=CC=C2N(CC3COC4=C(O3)C=CC=C4)C(=O)C(=O)C2=C1,0,0.1861,0.034633,42.7942,47.579516,12,12,35,23,...,12.028032,47.789658,2.077811,17.481752,11.422526,3.516587,1182.0,39.0,3.314,128.0


In [4]:
# Keep the compound identifiers aside; they are re-attached after the
# descriptor columns have been selected.
names = csv_df.loc[:, 'Name']

In [5]:
# All descriptor columns, selected by label from 'nAcid' through 'Zagreb'
# (label slices with .loc include both end points).
descriptor_df = csv_df.loc[:, slice('nAcid', 'Zagreb')]

In [6]:
# Show the descriptor-only view (the Name column is excluded by the slice above).
descriptor_df

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,3.4539,11.929425,65.7244,70.371067,22,24,51,32,19,...,10.157118,65.274779,2.039837,29.451814,2.542321,12.311309,3274.0,49.0,5.515,168.0
1,0,0.1407,0.019796,59.2688,57.611895,12,12,46,31,15,...,9.501726,62.514552,2.016598,30.622416,12.731426,14.811368,3088.0,48.0,2.768,156.0
2,0,0.7113,0.505948,44.0803,47.848274,11,12,40,22,18,...,7.35314,45.432426,2.06511,8.247287,8.247287,0.0,891.0,45.0,4.929,130.0
3,0,-1.171,1.371241,76.8836,69.021446,12,12,57,35,22,...,8.668877,69.284577,1.979559,30.296757,15.70302,11.075357,4752.0,54.0,3.847,168.0
4,0,-0.7172,0.514376,97.0508,83.737376,12,12,73,41,32,...,8.05723,80.95678,1.974556,36.20173,26.494885,6.187816,7208.0,64.0,7.137,196.0
5,0,1.9035,3.623312,23.032,44.064344,17,18,33,25,8,...,11.453971,50.529176,2.021167,24.716991,13.607393,6.051356,1611.0,37.0,4.328,130.0
6,0,1.3975,1.953006,17.5229,32.851551,12,12,24,17,7,...,11.124391,34.120881,2.007111,13.654925,5.613068,3.002318,562.0,22.0,3.482,84.0
7,0,-1.6732,2.799598,51.3726,42.289895,6,6,36,21,15,...,8.585509,42.503469,2.023975,23.252982,10.093146,9.645249,900.0,36.0,-0.202,116.0
8,0,-2.6759,7.160441,42.0883,40.523102,6,6,34,20,14,...,8.649042,40.645753,2.032288,20.036741,10.090869,6.433519,773.0,34.0,1.019,110.0
9,0,0.1861,0.034633,42.7942,47.579516,12,12,35,23,12,...,12.028032,47.789658,2.077811,17.481752,11.422526,3.516587,1182.0,39.0,3.314,128.0


In [7]:
# Keep only the descriptor columns that were used in training.
training_df = pd.read_csv('molecular_descriptors_csv/min_max_scaled_molecular_descriptors_with_pIC50_values.csv')

# Skip the first two columns and the last column of the training file.
# NOTE(review): presumably these are the saved index, the compound Name and the
# pIC50 target -- confirm against the training CSV.
# Slicing the column Index directly avoids looking up row label 0 just to count
# columns, which would raise KeyError on an empty training frame.
training_descriptor_columns = training_df.columns[2:-1]

# Explicit copy: later cells add columns to this frame, and it should never
# alias csv_df.
csv_df_columns_from_training = csv_df.loc[:, training_descriptor_columns].copy()
csv_df_columns_from_training

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0,3.4539,11.929425,65.7244,70.371067,22,24,51,32,19,...,10.157118,65.274779,2.039837,29.451814,2.542321,12.311309,3274.0,49.0,5.515,168.0
1,0,0.1407,0.019796,59.2688,57.611895,12,12,46,31,15,...,9.501726,62.514552,2.016598,30.622416,12.731426,14.811368,3088.0,48.0,2.768,156.0
2,0,0.7113,0.505948,44.0803,47.848274,11,12,40,22,18,...,7.35314,45.432426,2.06511,8.247287,8.247287,0.0,891.0,45.0,4.929,130.0
3,0,-1.171,1.371241,76.8836,69.021446,12,12,57,35,22,...,8.668877,69.284577,1.979559,30.296757,15.70302,11.075357,4752.0,54.0,3.847,168.0
4,0,-0.7172,0.514376,97.0508,83.737376,12,12,73,41,32,...,8.05723,80.95678,1.974556,36.20173,26.494885,6.187816,7208.0,64.0,7.137,196.0
5,0,1.9035,3.623312,23.032,44.064344,17,18,33,25,8,...,11.453971,50.529176,2.021167,24.716991,13.607393,6.051356,1611.0,37.0,4.328,130.0
6,0,1.3975,1.953006,17.5229,32.851551,12,12,24,17,7,...,11.124391,34.120881,2.007111,13.654925,5.613068,3.002318,562.0,22.0,3.482,84.0
7,0,-1.6732,2.799598,51.3726,42.289895,6,6,36,21,15,...,8.585509,42.503469,2.023975,23.252982,10.093146,9.645249,900.0,36.0,-0.202,116.0
8,0,-2.6759,7.160441,42.0883,40.523102,6,6,34,20,14,...,8.649042,40.645753,2.032288,20.036741,10.090869,6.433519,773.0,34.0,1.019,110.0
9,0,0.1861,0.034633,42.7942,47.579516,12,12,35,23,12,...,12.028032,47.789658,2.077811,17.481752,11.422526,3.516587,1182.0,39.0,3.314,128.0


In [8]:
# Put the compound names back as the first column.
# The guard keeps this cell re-runnable: DataFrame.insert raises ValueError
# when a column called "Name" already exists in the frame.
if "Name" not in csv_df_columns_from_training.columns:
    csv_df_columns_from_training.insert(0, "Name", names)

In [12]:
# Add a dummy 'values' column filled with 1 so that the scaler works: blinded
# compounds don't have pIC50 values, but the scaler is applied to the column
# range 'nAcid':'values' in a later cell.
# NOTE(review): the scaled dummy column carries no information -- presumably
# downstream consumers should ignore it; confirm.
csv_df_columns_from_training['values'] = 1
csv_df_columns_from_training

Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
0,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,0,3.4539,11.929425,65.7244,70.371067,22,24,51,32,...,65.274779,2.039837,29.451814,2.542321,12.311309,3274.0,49.0,5.515,168.0,1
1,COC1=CC=CC(=C1)C1=C(C#N)C(=O)NC(SCC(=O)NC2=CC=...,0,0.1407,0.019796,59.2688,57.611895,12,12,46,31,...,62.514552,2.016598,30.622416,12.731426,14.811368,3088.0,48.0,2.768,156.0,1
2,CC1=COC2=C1C(=O)C(=O)C1=C3CCCC(C)(C)C3=CC=C21,0,0.7113,0.505948,44.0803,47.848274,11,12,40,22,...,45.432426,2.06511,8.247287,8.247287,0.0,891.0,45.0,4.929,130.0,1
3,CCOC(=O)C(=C\NC1=CC=C(C=C1)S(=O)(=O)C1=CC=C(N\...,0,-1.171,1.371241,76.8836,69.021446,12,12,57,35,...,69.284577,1.979559,30.296757,15.70302,11.075357,4752.0,54.0,3.847,168.0,1
4,CCOC(=O)C(=CNC1=CC=C(C=C1)S(=O)(=O)C1=CC=C(NC=...,0,-0.7172,0.514376,97.0508,83.737376,12,12,73,41,...,80.95678,1.974556,36.20173,26.494885,6.187816,7208.0,64.0,7.137,196.0,1
5,[O-][N+](=O)C1=C(C=CC(Cl)=C1)C1=CC=C(O1)C(=O)O...,0,1.9035,3.623312,23.032,44.064344,17,18,33,25,...,50.529176,2.021167,24.716991,13.607393,6.051356,1611.0,37.0,4.328,130.0,1
6,ClC1=CC=C(C=C1)C(=O)OC1=CN=CC(Cl)=C1,0,1.3975,1.953006,17.5229,32.851551,12,12,24,17,...,34.120881,2.007111,13.654925,5.613068,3.002318,562.0,22.0,3.482,84.0,1
7,CN1CCN(CC1)S(=O)(=O)C1=CC2=C(NC(=O)C2=O)C=C1,0,-1.6732,2.799598,51.3726,42.289895,6,6,36,21,...,42.503469,2.023975,23.252982,10.093146,9.645249,900.0,36.0,-0.202,116.0,1
8,O=C1NC2=C(C=C(C=C2)S(=O)(=O)N2CCCCC2)C1=O,0,-2.6759,7.160441,42.0883,40.523102,6,6,34,20,...,40.645753,2.032288,20.036741,10.090869,6.433519,773.0,34.0,1.019,110.0,1
9,IC1=CC=C2N(CC3COC4=C(O3)C=CC=C4)C(=O)C(=O)C2=C1,0,0.1861,0.034633,42.7942,47.579516,12,12,35,23,...,47.789658,2.077811,17.481752,11.422526,3.516587,1182.0,39.0,3.314,128.0,1


In [13]:
# Use scaler used during training
scaler = pickle.load(open('scaler_data/scaler.dat','rb'))
scaled_values = scaler.transform(csv_df_columns_from_training.loc[:,'nAcid':'values'])
csv_df_columns_from_training.loc[:,1:] = scaled_values
csv_df_columns_from_training

Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
0,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,0.0,0.901649,0.701501,0.683945,0.765216,0.944444,0.95,0.638298,0.730769,...,0.712195,0.646655,0.773694,7.8e-05,0.664364,0.545817,0.660714,0.710159,0.684211,0.943418
1,COC1=CC=CC(=C1)C1=C(C#N)C(=O)NC(SCC(=O)NC2=CC=...,0.0,0.415151,0.001072,0.601903,0.533604,0.388889,0.35,0.531915,0.692308,...,0.661913,0.478353,0.810917,0.637779,0.799276,0.512133,0.642857,0.387551,0.605263,0.943418
2,CC1=COC2=C1C(=O)C(=O)C1=C3CCCC(C)(C)C3=CC=C21,0.0,0.498935,0.029663,0.408877,0.35637,0.333333,0.35,0.404255,0.346154,...,0.350729,0.829696,0.099438,0.357132,0.0,0.11427,0.589286,0.641339,0.434211,0.943418
3,CCOC(=O)C(=C\NC1=CC=C(C=C1)S(=O)(=O)C1=CC=C(N\...,0.0,0.222545,0.080553,0.825764,0.740717,0.388889,0.35,0.765957,0.846154,...,0.785242,0.210101,0.800561,0.823761,0.597667,0.813473,0.75,0.514269,0.684211,0.943418
4,CCOC(=O)C(=CNC1=CC=C(C=C1)S(=O)(=O)C1=CC=C(NC=...,0.0,0.28918,0.030159,1.082063,1.007849,0.388889,0.35,1.106383,1.076923,...,0.997873,0.173862,0.988326,1.499187,0.333918,1.25824,0.928571,0.900646,0.868421,0.943418
5,[O-][N+](=O)C1=C(C=CC(Cl)=C1)C1=CC=C(O1)C(=O)O...,0.0,0.673994,0.213002,0.14138,0.287682,0.666667,0.65,0.255319,0.461538,...,0.443576,0.511441,0.623137,0.692603,0.326554,0.244658,0.446429,0.570757,0.434211,0.943418
6,ClC1=CC=C(C=C1)C(=O)OC1=CN=CC(Cl)=C1,0.0,0.599695,0.114768,0.071367,0.084141,0.388889,0.35,0.06383,0.153846,...,0.144668,0.409639,0.271389,0.192265,0.162016,0.05469,0.178571,0.471403,0.131579,0.943418
7,CN1CCN(CC1)S(=O)(=O)C1=CC2=C(NC(=O)C2=O)C=C1,0.0,0.148804,0.164557,0.501552,0.255471,0.055556,0.05,0.319149,0.307692,...,0.297373,0.531776,0.576585,0.472658,0.520494,0.1159,0.428571,0.038755,0.342105,0.943418
8,O=C1NC2=C(C=C(C=C2)S(=O)(=O)N2CCCCC2)C1=O,0.0,0.001571,0.421027,0.383561,0.223399,0.055556,0.05,0.276596,0.269231,...,0.263531,0.591981,0.474316,0.472516,0.347177,0.092901,0.392857,0.182149,0.302632,0.943418
9,IC1=CC=C2N(CC3COC4=C(O3)C=CC=C4)C(=O)C(=O)C2=C1,0.0,0.421817,0.001944,0.392532,0.351491,0.388889,0.35,0.297872,0.384615,...,0.393671,0.921682,0.393073,0.55586,0.189768,0.166968,0.482143,0.451674,0.421053,0.943418


In [14]:
# Persist the scaled blinded descriptors (the row index is written as the
# first, unnamed CSV column).
csv_df_columns_from_training.to_csv(
    'molecular_descriptors_csv/min_max_scaled_blinded_molecular_descriptors.csv'
)