##### The SDF file for the CAS Antiviral database was downloaded (https://www.cas.org/covid-19-antiviral-compounds-dataset) and processed with PaDEL-Descriptor (separate software) to generate topological molecular descriptors. The resulting molecular descriptors are available in molecular_descriptors_csv/cas_antiviral_molecular_descriptors.csv
##### This notebook does min-max scaling on molecular descriptors and saves the output to molecular_descriptors_csv/min_max_scaled_cas_antiviral_molecular_descriptors.csv
##### The descriptors are transformed with the same scaler that was used to transform the training data.
##### Only the descriptors used for training are transformed and retained.

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import pickle
import numpy as np

In [2]:
# Location of the unscaled descriptor table for the CAS antiviral compounds.
data_file = "molecular_descriptors_csv/cas_antiviral_molecular_descriptors.csv"
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column gets one consistent dtype.
csv_df = pd.read_csv(filepath_or_buffer=data_file, low_memory=False)

In [3]:
# Preview the raw descriptor table; the rendered output is truncated to the
# first and last rows/columns.
csv_df

Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,"1000161-61-4:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.2007,0.040280,88.3474,77.272790,12,12,67,37,...,7.942950,74.688685,2.018613,27.327565,15.263121,6.963909,4181,67,3.897,198
1,"1000161-55-6:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.4234,0.179268,85.2965,76.470790,12,12,66,36,...,7.820950,72.804537,2.022348,24.789587,12.746408,6.943791,3850,65,4.927,192
2,"1000161-54-5:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.0866,0.007500,86.7654,75.939204,12,12,65,37,...,8.156338,74.686960,2.018566,27.338095,15.281006,6.956951,4150,68,4.188,198
3,"1000161-58-9:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.1354,0.018333,88.2081,79.564376,12,12,69,37,...,7.684034,74.921360,2.024902,24.843649,12.746508,6.994843,4182,68,5.285,196
4,"1000161-59-0:3-Quinolinecarboxylic acid, 6-[[3...",1.0,-0.2014,0.040562,89.6770,79.032790,12,12,68,38,...,8.002612,76.777125,2.020451,27.392024,15.297734,6.992146,4486,71,4.546,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48871,"99966-16-2:Thymidine, ÃŽÂ±,ÃŽÂ±,ÃŽÂ±-trifluoro...",1.0,-0.5484,0.300743,60.8114,37.382930,0,0,33,23,...,10.849023,46.177947,2.007737,36.355248,19.273512,6.490259,1147,38,-1.276,130
48872,99909-04-3:Cytidine 5Ã¢â‚¬Â²-(tetrahydrogen tr...,4.0,-3.1977,10.225285,81.4462,51.014895,0,0,44,29,...,11.022277,56.587770,1.951302,55.173682,33.764586,8.930565,2486,42,-5.592,154
48873,"960214-88-4:Butanoic acid, 4-[[[4-[[[2-[1-(4-c...",0.0,-1.7762,3.154886,124.3221,142.525650,33,34,120,70,...,8.094193,144.479651,2.063995,48.525446,36.441460,9.564855,30804,126,6.461,382
48874,"960214-90-8:Butanoic acid, 4-[[2-[[3-[5-(4-met...",0.0,-1.6855,2.840910,125.4069,135.545650,29,31,116,66,...,7.950944,135.584998,2.054318,51.764691,35.771691,12.729726,25961,114,5.691,358


In [4]:
# Keep the compound identifiers aside; they are re-attached to the scaled
# descriptors later. The Series carries csv_df's row index.
names = csv_df.loc[:, 'Name']

In [5]:
# All descriptor columns, from 'nAcid' through 'Zagreb' inclusive
# (label-based slices in .loc include both endpoints).
descriptor_df = csv_df.loc[:, slice('nAcid', 'Zagreb')]

In [6]:
# Preview the descriptor-only view (no 'Name' column).
descriptor_df

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1.0,0.2007,0.040280,88.3474,77.272790,12,12,67,37,30,...,7.942950,74.688685,2.018613,27.327565,15.263121,6.963909,4181,67,3.897,198
1,1.0,0.4234,0.179268,85.2965,76.470790,12,12,66,36,30,...,7.820950,72.804537,2.022348,24.789587,12.746408,6.943791,3850,65,4.927,192
2,1.0,0.0866,0.007500,86.7654,75.939204,12,12,65,37,28,...,8.156338,74.686960,2.018566,27.338095,15.281006,6.956951,4150,68,4.188,198
3,1.0,0.1354,0.018333,88.2081,79.564376,12,12,69,37,32,...,7.684034,74.921360,2.024902,24.843649,12.746508,6.994843,4182,68,5.285,196
4,1.0,-0.2014,0.040562,89.6770,79.032790,12,12,68,38,30,...,8.002612,76.777125,2.020451,27.392024,15.297734,6.992146,4486,71,4.546,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48871,1.0,-0.5484,0.300743,60.8114,37.382930,0,0,33,23,10,...,10.849023,46.177947,2.007737,36.355248,19.273512,6.490259,1147,38,-1.276,130
48872,4.0,-3.1977,10.225285,81.4462,51.014895,0,0,44,29,15,...,11.022277,56.587770,1.951302,55.173682,33.764586,8.930565,2486,42,-5.592,154
48873,0.0,-1.7762,3.154886,124.3221,142.525650,33,34,120,70,50,...,8.094193,144.479651,2.063995,48.525446,36.441460,9.564855,30804,126,6.461,382
48874,0.0,-1.6855,2.840910,125.4069,135.545650,29,31,116,66,50,...,7.950944,135.584998,2.054318,51.764691,35.771691,12.729726,25961,114,5.691,358


In [7]:
# Keep columns used in training.
# The header of the scaled training CSV defines which descriptors are kept.
# The first two columns and the last column are skipped -- presumably the saved
# row index, the compound name and the pIC50 target (TODO confirm against the
# notebook that wrote this file).
training_df = pd.read_csv('molecular_descriptors_csv/min_max_scaled_molecular_descriptors_with_pIC50_values.csv')
training_descriptor_columns = training_df.columns[2:-1]
# .copy() gives an independent frame: later cells add columns to it in place,
# and without an explicit copy pandas may emit SettingWithCopyWarning.
csv_df_columns_from_training = csv_df.loc[:, training_descriptor_columns].copy()
csv_df_columns_from_training

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,1.0,0.2007,0.040280,88.3474,77.272790,12,12,67,37,30,...,7.942950,74.688685,2.018613,27.327565,15.263121,6.963909,4181,67,3.897,198
1,1.0,0.4234,0.179268,85.2965,76.470790,12,12,66,36,30,...,7.820950,72.804537,2.022348,24.789587,12.746408,6.943791,3850,65,4.927,192
2,1.0,0.0866,0.007500,86.7654,75.939204,12,12,65,37,28,...,8.156338,74.686960,2.018566,27.338095,15.281006,6.956951,4150,68,4.188,198
3,1.0,0.1354,0.018333,88.2081,79.564376,12,12,69,37,32,...,7.684034,74.921360,2.024902,24.843649,12.746508,6.994843,4182,68,5.285,196
4,1.0,-0.2014,0.040562,89.6770,79.032790,12,12,68,38,30,...,8.002612,76.777125,2.020451,27.392024,15.297734,6.992146,4486,71,4.546,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48871,1.0,-0.5484,0.300743,60.8114,37.382930,0,0,33,23,10,...,10.849023,46.177947,2.007737,36.355248,19.273512,6.490259,1147,38,-1.276,130
48872,4.0,-3.1977,10.225285,81.4462,51.014895,0,0,44,29,15,...,11.022277,56.587770,1.951302,55.173682,33.764586,8.930565,2486,42,-5.592,154
48873,0.0,-1.7762,3.154886,124.3221,142.525650,33,34,120,70,50,...,8.094193,144.479651,2.063995,48.525446,36.441460,9.564855,30804,126,6.461,382
48874,0.0,-1.6855,2.840910,125.4069,135.545650,29,31,116,66,50,...,7.950944,135.584998,2.054318,51.764691,35.771691,12.729726,25961,114,5.691,358


In [8]:
# Put the compound names back as the first column.
# DataFrame.insert raises ValueError if the column already exists, so guard it
# to keep this cell safe to re-run without re-running the previous one.
if "Name" not in csv_df_columns_from_training.columns:
    csv_df_columns_from_training.insert(0, "Name", names)

In [9]:
# Add a constant placeholder 'values' column so the frame has the column layout
# the scaler expects -- presumably the scaler was fitted on descriptors plus a
# trailing target column (TODO confirm against the training notebook).
csv_df_columns_from_training.loc[:, 'values'] = 1
csv_df_columns_from_training

Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
0,"1000161-61-4:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.2007,0.040280,88.3474,77.272790,12,12,67,37,...,74.688685,2.018613,27.327565,15.263121,6.963909,4181,67,3.897,198,1
1,"1000161-55-6:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.4234,0.179268,85.2965,76.470790,12,12,66,36,...,72.804537,2.022348,24.789587,12.746408,6.943791,3850,65,4.927,192,1
2,"1000161-54-5:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.0866,0.007500,86.7654,75.939204,12,12,65,37,...,74.686960,2.018566,27.338095,15.281006,6.956951,4150,68,4.188,198,1
3,"1000161-58-9:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.1354,0.018333,88.2081,79.564376,12,12,69,37,...,74.921360,2.024902,24.843649,12.746508,6.994843,4182,68,5.285,196,1
4,"1000161-59-0:3-Quinolinecarboxylic acid, 6-[[3...",1.0,-0.2014,0.040562,89.6770,79.032790,12,12,68,38,...,76.777125,2.020451,27.392024,15.297734,6.992146,4486,71,4.546,202,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48871,"99966-16-2:Thymidine, ÃŽÂ±,ÃŽÂ±,ÃŽÂ±-trifluoro...",1.0,-0.5484,0.300743,60.8114,37.382930,0,0,33,23,...,46.177947,2.007737,36.355248,19.273512,6.490259,1147,38,-1.276,130,1
48872,99909-04-3:Cytidine 5Ã¢â‚¬Â²-(tetrahydrogen tr...,4.0,-3.1977,10.225285,81.4462,51.014895,0,0,44,29,...,56.587770,1.951302,55.173682,33.764586,8.930565,2486,42,-5.592,154,1
48873,"960214-88-4:Butanoic acid, 4-[[[4-[[[2-[1-(4-c...",0.0,-1.7762,3.154886,124.3221,142.525650,33,34,120,70,...,144.479651,2.063995,48.525446,36.441460,9.564855,30804,126,6.461,382,1
48874,"960214-90-8:Butanoic acid, 4-[[2-[[3-[5-(4-met...",0.0,-1.6855,2.840910,125.4069,135.545650,29,31,116,66,...,135.584998,2.054318,51.764691,35.771691,12.729726,25961,114,5.691,358,1


In [10]:
# Drop every compound that has a missing value in any non-name column.
new_df = csv_df_columns_from_training.loc[:, 'Name':'values']
# Everything after the leading 'Name' column is checked for NaN.
columns_to_check = new_df.columns[1:].tolist()
# dropna defaults (axis=0, how='any') remove a row if any checked column is NaN.
# The surviving rows keep their original index labels.
new_df = new_df.dropna(subset=columns_to_check)
new_df

Unnamed: 0,Name,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,values
0,"1000161-61-4:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.2007,0.040280,88.3474,77.272790,12,12,67,37,...,74.688685,2.018613,27.327565,15.263121,6.963909,4181,67,3.897,198,1
1,"1000161-55-6:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.4234,0.179268,85.2965,76.470790,12,12,66,36,...,72.804537,2.022348,24.789587,12.746408,6.943791,3850,65,4.927,192,1
2,"1000161-54-5:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.0866,0.007500,86.7654,75.939204,12,12,65,37,...,74.686960,2.018566,27.338095,15.281006,6.956951,4150,68,4.188,198,1
3,"1000161-58-9:3-Quinolinecarboxylic acid, 6-[[3...",1.0,0.1354,0.018333,88.2081,79.564376,12,12,69,37,...,74.921360,2.024902,24.843649,12.746508,6.994843,4182,68,5.285,196,1
4,"1000161-59-0:3-Quinolinecarboxylic acid, 6-[[3...",1.0,-0.2014,0.040562,89.6770,79.032790,12,12,68,38,...,76.777125,2.020451,27.392024,15.297734,6.992146,4486,71,4.546,202,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48871,"99966-16-2:Thymidine, ÃŽÂ±,ÃŽÂ±,ÃŽÂ±-trifluoro...",1.0,-0.5484,0.300743,60.8114,37.382930,0,0,33,23,...,46.177947,2.007737,36.355248,19.273512,6.490259,1147,38,-1.276,130,1
48872,99909-04-3:Cytidine 5Ã¢â‚¬Â²-(tetrahydrogen tr...,4.0,-3.1977,10.225285,81.4462,51.014895,0,0,44,29,...,56.587770,1.951302,55.173682,33.764586,8.930565,2486,42,-5.592,154,1
48873,"960214-88-4:Butanoic acid, 4-[[[4-[[[2-[1-(4-c...",0.0,-1.7762,3.154886,124.3221,142.525650,33,34,120,70,...,144.479651,2.063995,48.525446,36.441460,9.564855,30804,126,6.461,382,1
48874,"960214-90-8:Butanoic acid, 4-[[2-[[3-[5-(4-met...",0.0,-1.6855,2.840910,125.4069,135.545650,29,31,116,66,...,135.584998,2.054318,51.764691,35.771691,12.729726,25961,114,5.691,358,1


In [12]:
# Keep only the numeric columns passed to the scaler ('nAcid' through the
# placeholder 'values' column); names are re-attached after scaling.
new_df = new_df.loc[:,'nAcid':'values']
# Use scaler used during training.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load a scaler.dat that was produced by this project.
with open('scaler_data/scaler.dat','rb') as scaler_file:
    scaler = pickle.load(scaler_file)
scaled_values = scaler.transform(new_df)
df = pd.DataFrame(scaled_values,columns=new_df.columns)
# `df` has a fresh 0..n-1 index, while `names` still carries the original CSV
# index. If the earlier dropna removed any row, inserting `names` directly would
# align on index labels and attach the wrong compound names to the scaled rows.
# Select the names of the surviving rows and insert them positionally instead.
surviving_names = names.loc[new_df.index].to_numpy()
assert len(surviving_names) == len(df), "name/descriptor row count mismatch"
df.insert(0, "Name", surviving_names)

In [13]:
# Write the scaled descriptors (row index included as the first CSV column).
df.to_csv('molecular_descriptors_csv/min_max_scaled_cas_antiviral_molecular_descriptors.csv')