In [1]:
# Colab only: attach Google Drive so the notebook can read and write the shared folders
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
! pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [3]:
import pandas as pd

# load the processed SMILES / pIC50 table from the shared drive and preview the first rows
data = pd.read_csv(
    "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/smile_pIC50_processed.csv"
)
data.head()

Unnamed: 0,canonical_smiles,pIC50
0,C[S+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc...,4.522879
1,c1ccc(-c2n[nH]cc2-c2ccnc3ccccc23)nc1,4.568636
2,Cc1cccc(-c2[nH]ncc2-c2ccnc3ccccc23)n1,7.468521
3,Cc1cccc(-c2n[nH]cc2-c2ccc(O)cc2)n1,7.508638
4,Cc1cccc(-c2[nH]ncc2-c2ccc(F)c(F)c2)n1,6.821023


In [4]:
# remember the columns present before any descriptor is added;
# later cells use this list to tell descriptor columns apart from the original ones
MAIN_COLS = data.columns.tolist()
print("Main Columns", MAIN_COLS)

Main Columns ['canonical_smiles', 'pIC50']


In [5]:
def extract_lipinski_and_other_descriptors(raw):
  """
  Compute every RDKit descriptor listed in Descriptors._descList for one row.

  @param raw
  one row (for example the Series handed over by DataFrame.apply(axis = 1)) whose
  'canonical_smiles' entry holds the SMILES string of the molecule

  @return
  a copy of raw with one extra entry per descriptor name; raw itself is not modified.
  A descriptor whose function raises is stored as NaN. When the SMILES is not a string
  or cannot be parsed, every descriptor is stored as NaN.
  """
  from rdkit import Chem
  from rdkit.Chem import Descriptors
  import numpy as np

  smile = raw['canonical_smiles']

  # MolFromSmiles hands back None for a string it cannot parse; a non-string value
  # (e.g. a NaN cell) is treated the same way instead of being passed to RDKit
  processed_smiles = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None

  # work on a copy: functions given to DataFrame.apply should not mutate the row they receive
  result = raw.copy()

  if processed_smiles is None:
    # report the bad molecule once rather than once per descriptor
    print(f"Could not parse SMILES {smile!r}; all descriptors set to NaN")
    for descriptor_name, _ in Descriptors._descList:
      result[descriptor_name] = np.nan
    return result

  # loop through descriptor functions of rdkit descriptors
  for descriptor_name, descriptor_function in Descriptors._descList:
    try:
      result[descriptor_name] = descriptor_function(processed_smiles)
    except Exception as error:
      # keep going: one failing descriptor must not discard the rest of the row
      print(f"{descriptor_name} failed for {smile!r}: {error}")
      result[descriptor_name] = np.nan

  return result

# compute the descriptors row by row; each row comes back with the descriptor entries added
data = data.apply(
    extract_lipinski_and_other_descriptors,
    axis = "columns",
)



In [6]:
# print the (rows, columns) shape of the frame after the descriptor columns were added,
# then show the frame itself as the cell's output
print(data.shape)
data

(1180, 212)


Unnamed: 0,canonical_smiles,pIC50,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C[S+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc...,4.522879,13.337707,13.337707,0.288773,-1.027544,0.525196,12.111111,377.444,361.316,...,0,0,0,0,0,0,0,0,0,0
1,c1ccc(-c2n[nH]cc2-c2ccnc3ccccc23)nc1,4.568636,4.407917,4.407917,0.853660,0.853660,0.605205,10.857143,272.311,260.215,...,0,0,0,0,0,0,0,0,0,0
2,Cc1cccc(-c2[nH]ncc2-c2ccnc3ccccc23)n1,7.468521,4.595003,4.595003,0.896661,0.896661,0.605781,10.954545,286.338,272.226,...,0,0,0,0,0,0,0,0,0,0
3,Cc1cccc(-c2n[nH]cc2-c2ccc(O)cc2)n1,7.508638,9.341134,9.341134,0.252292,0.252292,0.735062,10.578947,251.289,238.185,...,0,0,0,0,0,0,0,0,0,0
4,Cc1cccc(-c2[nH]ncc2-c2ccc(F)c(F)c2)n1,6.821023,13.348139,13.348139,0.551811,-0.882545,0.771308,10.750000,271.270,260.182,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175,Cc1ncc(-c2cc(Cl)ccc2F)cc1-n1c(=O)n(CC(N)=O)c2c...,6.610834,14.351146,14.351146,0.265592,-0.653149,0.558151,11.137931,411.824,396.704,...,0,0,0,0,0,0,0,0,0,0
1176,Cc1ncc(-c2cc(Cl)ccc2F)cc1-n1c(=O)n(CC(N)=O)c2c...,5.000000,14.359411,14.359411,0.248576,-0.625371,0.559066,11.137931,410.836,394.708,...,0,0,0,0,0,0,0,0,0,0
1177,COc1ccc(OC)c(/C=C/C(=O)Nc2sc3c(c2C#N)CCC3)c1,7.045757,12.262203,12.262203,0.276577,-0.276577,0.829686,12.680000,354.431,336.287,...,0,0,0,0,0,0,0,1,0,0
1178,COc1ccc(OC)c(/C=C/C(=O)Nc2sc3c(c2C#N)CCCC3)c1,7.060481,12.348141,12.348141,0.273176,-0.273176,0.804303,13.115385,368.458,348.298,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# count the nulls in every column and keep the names of the columns that have at least one
missing_values = data.isna().sum()
cols_missing = missing_values.index[missing_values > 0].tolist()
cols_missing

[]

In [8]:
# print the (rows, columns) shape of data again before it is split
print(data.shape)

(1180, 212)


In [9]:
from sklearn.model_selection import train_test_split

# fixed seed: without it every run produces a different split, so the CSVs and the
# scaler saved further down could never be regenerated
SPLIT_SEED = 42

# split the dataset: every column except the target is a feature
X = data.drop(columns = "pIC50")
Y = data["pIC50"]

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size = 0.2, random_state = SPLIT_SEED
)

# later cells assign imputed / scaled columns into these frames; take explicit copies
# so those assignments act on independent frames (and cannot raise SettingWithCopyWarning)
x_train, x_test = x_train.copy(), x_test.copy()

print(x_train.shape)
print(y_train.shape)

(944, 211)
(944,)


In [10]:
# fill missing values
from sklearn.impute import SimpleImputer
import pickle

# cols_missing was computed on the full frame, so it may name the target (which is not a
# column of x_train) or the SMILES string column (which has no median).
# Only descriptor columns are imputed, the same filter the scaling cell applies.
impute_cols = [col for col in cols_missing if col not in MAIN_COLS]

if impute_cols:
  print("Filling Missing Values")
  # the medians are learned from the training rows only and reused for the test rows
  imputer = SimpleImputer(strategy = "median").fit(x_train[impute_cols])
  # transform on train_data
  x_train[impute_cols] = imputer.transform(x_train[impute_cols])
  # transform on test_data
  x_test[impute_cols] = imputer.transform(x_test[impute_cols])

  # save impute model for later use
  with open("imputation_model", "wb") as impute_model:
    pickle.dump(imputer, impute_model)
  print("Model Saved")

In [11]:
from sklearn.preprocessing import StandardScaler

# switch: True standardises the descriptor columns, False leaves them as computed
SCALING = True

# descriptor columns only, i.e. everything in x_train that was not in the original csv
REQUIRED_COLUMNS = [name for name in x_train.columns if name not in MAIN_COLS]
print(REQUIRED_COLUMNS)

if SCALING:
  print("Preprocessing")
  # fit on the training rows only, then apply the same transform to both splits
  stc = StandardScaler()
  stc.fit(x_train[REQUIRED_COLUMNS])
  x_train[REQUIRED_COLUMNS] = stc.transform(x_train[REQUIRED_COLUMNS])
  x_test[REQUIRED_COLUMNS] = stc.transform(x_test[REQUIRED_COLUMNS])

  # keep the fitted scaler on disk for later use
  with open("scaling_model", "wb") as scaling_file:
    pickle.dump(stc, scaling_file)
  print("Model Saved")




['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'S

In [12]:
# saving file to results folder of drive
# stc only exists when the scaling cell ran with SCALING = True; without this guard the
# cell fails with a NameError whenever scaling is switched off
if SCALING:
  with open("/content/drive/Shareddrives/1:1_Aiden_Chavda/Results/scaling_model", "wb") as scaling_file:
    pickle.dump(stc, scaling_file)
  print("Model Saved")
else:
  print("Scaling disabled, no scaling model to save")

Model Saved


In [13]:
selected_columns = ["canonical_smiles", "pIC50"]

def _assemble_split(features, target):
  """
  Join features and target side by side with a fresh 0..n-1 index.

  @return
  (regular, smile): regular is the joined frame without the SMILES column,
  smile holds only the SMILES and pIC50 columns (for chemberta and pubchem)
  """
  combined = pd.concat([features, target], axis = 1).reset_index(drop = True)
  smile = combined[selected_columns]
  regular = combined.drop(columns = selected_columns[0])
  return regular, smile

# build the frames that are written out as the train and test csvs
train_csv, train_csv_smile = _assemble_split(x_train, y_train)
test_csv, test_csv_smile = _assemble_split(x_test, y_test)

In [14]:
# shapes of the four frames about to be saved, test first, then train
for frame in (test_csv, test_csv_smile, train_csv, train_csv_smile):
  print(frame.shape)

(236, 211)
(236, 2)
(944, 211)
(944, 2)


In [15]:
# write each frame to the shared Dataset folder, without the index column
for frame, destination in (
    (test_csv, "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/test_regular.csv"),
    (test_csv_smile, "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/test_smile.csv"),
    (train_csv, "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/train_regular.csv"),
    (train_csv_smile, "/content/drive/Shareddrives/1:1_Aiden_Chavda/Dataset/train_smile.csv"),
):
  frame.to_csv(destination, index = False)