In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor
from pathlib import Path
from warnings import filterwarnings

  from pandas import MultiIndex, Int64Index


In [3]:
# Set path to this notebook
HERE = Path(_dh[-1])
HDAC1and6 = Path(HERE).resolve().parents[1]/'input'
output = HERE/'OUTPUT'

In [4]:
df1 = pd.read_pickle(HDAC1and6/"HDAC1and6_1024B.csv")
df1 = df1.sample(frac=1).reset_index(drop=True)
df1.shape

(1339, 6)

#### By using MACCS fingerprints that are 166 bits 

In [5]:
X1 = np.array(list((df1['fp_MACCS']))).astype(float)
#X.shape
Y1 = df1["SelectivityWindow"].values
#y.shape

In [6]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train1.shape)
print("Shape of test data:", X_test1.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (1071, 167)
Shape of test data: (268, 167)


In [7]:
# Defines and builds the lazyregressor
reg1 = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
model1, predictions1 = reg1.fit(X_train1, X_test1, Y_train1, Y_test1)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:17<00:00,  2.40it/s]


In [8]:
predictions1.sort_values(by="R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,0.02,0.63,0.81,0.16
HistGradientBoostingRegressor,0.02,0.63,0.81,6.81
RandomForestRegressor,-0.0,0.63,0.82,1.23
XGBRegressor,-0.01,0.62,0.82,0.44
BaggingRegressor,-0.04,0.61,0.83,0.16
SVR,-0.07,0.6,0.85,0.35
KNeighborsRegressor,-0.07,0.6,0.85,0.14
NuSVR,-0.08,0.6,0.85,0.28
MLPRegressor,-0.25,0.53,0.92,1.51
GradientBoostingRegressor,-0.25,0.53,0.92,0.48


### USING 1024 BITS FINGERPRINTS

In [9]:
#By using Morgan fingerprints with radius of 3 and 1024 bits
X2 = np.array(list((df1['fp_Morgan3']))).astype(float)
#X.shape
Y2 = df1["SelectivityWindow"].values
#y.shape

In [10]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train2.shape)
print("Shape of test data:", X_test2.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (1071, 1024)
Shape of test data: (268, 1024)


In [11]:
reg2 = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models2, predictions2 = reg2.fit(X_train2, X_test2, Y_train2, Y_test2)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [04:53<00:00,  6.99s/it]


In [12]:
predictions2.sort_values(by="R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.11,0.69,0.74,12.62
XGBRegressor,1.11,0.69,0.74,2.05
LGBMRegressor,1.11,0.69,0.74,1.25
HistGradientBoostingRegressor,1.11,0.69,0.74,31.75
BaggingRegressor,1.12,0.66,0.78,1.11
SVR,1.13,0.64,0.81,2.83
NuSVR,1.13,0.64,0.81,1.79
GradientBoostingRegressor,1.13,0.63,0.81,3.16
TweedieRegressor,1.14,0.62,0.83,0.13
GeneralizedLinearRegressor,1.14,0.62,0.83,0.19


In [13]:
#By using Morgan fingerprints with radius of 3 and 1024 bits and feautures
X3 = np.array(list((df1['fp_MorganF']))).astype(float)
#X.shape
Y3 = df1["SelectivityWindow"].values
#y.shape

In [14]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y3, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train3.shape)
print("Shape of test data:", X_test3.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (1071, 1024)
Shape of test data: (268, 1024)


In [15]:
reg3 = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models3, predictions3 = reg3.fit(X_train3, X_test3, Y_train3, Y_test3)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [05:08<00:00,  7.35s/it]


In [16]:
predictions3.sort_values(by="R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,1.12,0.67,0.77,1.48
SVR,1.12,0.66,0.78,1.84
NuSVR,1.12,0.66,0.78,2.21
LGBMRegressor,1.12,0.66,0.79,1.14
HistGradientBoostingRegressor,1.12,0.66,0.79,27.98
RandomForestRegressor,1.12,0.66,0.79,12.03
TweedieRegressor,1.12,0.66,0.79,0.28
GeneralizedLinearRegressor,1.12,0.66,0.79,0.13
ElasticNetCV,1.13,0.63,0.82,56.69
GradientBoostingRegressor,1.13,0.63,0.82,4.08


In [17]:
#By using MAP4 fingerprints with 1024 bits
X4 = np.array(list((df1['fp_MAP4']))).astype(float)
#X4.shape
Y4 = df1["SelectivityWindow"].values
#y.shape

In [18]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X4, Y4, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train4.shape)
print("Shape of test data:", X_test4.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (1071, 1024)
Shape of test data: (268, 1024)


In [19]:
# Defines and builds the lazyclassifier
reg4 = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models4, predictions4 = reg4.fit(X_train4, X_test4, Y_train4, Y_test4)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [14:31<00:00, 20.74s/it]


In [20]:
predictions4.sort_values(by="R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,1.11,0.7,0.73,11.94
HistGradientBoostingRegressor,1.11,0.69,0.74,28.07
GradientBoostingRegressor,1.11,0.69,0.75,23.51
RandomForestRegressor,1.11,0.68,0.76,69.77
XGBRegressor,1.12,0.66,0.78,6.6
ExtraTreesRegressor,1.12,0.66,0.78,26.7
BaggingRegressor,1.12,0.65,0.79,7.9
KNeighborsRegressor,1.15,0.57,0.88,1.58
AdaBoostRegressor,1.16,0.56,0.89,9.62
SVR,1.19,0.45,0.99,1.87


## USE 2048 BITS FINGERPRINTS

In [21]:
df2 = pd.read_pickle(HDAC1and6/"HDAC1and6_2048B.csv")
df2 = df1.sample(frac=1).reset_index(drop=True)
df2

Unnamed: 0,molecule_chembl_id,fp_MACCS,fp_Morgan3,fp_MorganF,fp_MAP4,SelectivityWindow
0,CHEMBL2047692,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[22987147, 23625102, 34810596, 3307127, 112501...",-2.81
1,CHEMBL4072618,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[13333824, 12708875, 4142580, 33294862, 115873...",0.58
2,CHEMBL4464421,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","[4328289, 10511, 2812614, 5604080, 4030911, 61...",0.50
3,CHEMBL3770531,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[37428386, 1070410, 15051402, 218190, 12374797...",0.04
4,CHEMBL4063718,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1705, 2290011, 11111289, 7125875, 338217, 229...",0.41
...,...,...,...,...,...,...
1334,CHEMBL3775072,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2502218, 16948577, 3942253, 7125875, 1475085,...",0.78
1335,CHEMBL140000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[13333824, 16948577, 1098538, 7467572, 6626964...",0.59
1336,CHEMBL491316,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[13333824, 3035081, 7998790, 3344745, 12018511...",-0.27
1337,CHEMBL2018444,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...","[5976924, 3913535, 8033062, 2817337, 31738899,...",3.25


In [22]:
#2048 bits Morgan fingerprints with radius of 3
X5 = np.array(list((df2['fp_Morgan3']))).astype(float)
#X.shape
Y5 = df2["SelectivityWindow"].values
#y.shape

In [23]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(X5, Y5, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train5.shape)
print("Shape of test data:", X_test5.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (1071, 1024)
Shape of test data: (268, 1024)


In [24]:
reg5 = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models5, predictions5 = reg5.fit(X_train5, X_test5, Y_train5, Y_test5)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [05:34<00:00,  7.97s/it]


In [25]:
predictions5.sort_values(by="R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.1,0.71,0.7,12.55
HistGradientBoostingRegressor,1.1,0.7,0.71,32.73
LGBMRegressor,1.1,0.7,0.71,0.81
XGBRegressor,1.11,0.69,0.73,1.56
BaggingRegressor,1.11,0.68,0.74,1.31
GeneralizedLinearRegressor,1.12,0.67,0.76,0.12
TweedieRegressor,1.12,0.67,0.76,0.25
SVR,1.12,0.67,0.76,2.19
NuSVR,1.12,0.67,0.76,2.97
GradientBoostingRegressor,1.12,0.66,0.76,4.03


In [26]:
#2048 bits Morgan fingerprints with feautures
X6 = np.array(list((df2['fp_MorganF']))).astype(float)
#X.shape
Y6 = df2["SelectivityWindow"].values
#y.shape

In [27]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(X6, Y6, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train6.shape)
print("Shape of test data:", X_test6.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (1071, 1024)
Shape of test data: (268, 1024)


In [28]:
reg6 = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models6, predictions6 = reg6.fit(X_train6, X_test6, Y_train6, Y_test6)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [04:43<00:00,  6.75s/it]


In [29]:
predictions6.sort_values(by="R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,1.1,0.72,0.7,25.71
LGBMRegressor,1.1,0.72,0.7,1.37
RandomForestRegressor,1.1,0.71,0.71,11.74
GradientBoostingRegressor,1.11,0.68,0.74,3.93
XGBRegressor,1.11,0.68,0.74,2.67
SVR,1.12,0.67,0.75,1.77
NuSVR,1.12,0.67,0.75,1.72
GeneralizedLinearRegressor,1.12,0.67,0.75,0.12
TweedieRegressor,1.12,0.67,0.75,0.11
BaggingRegressor,1.12,0.67,0.76,1.08


In [30]:
#2048 bits MAP4 fingerprints
X7 = np.array(list((df2['fp_MAP4']))).astype(float)
#X.shape
Y7 = df2["SelectivityWindow"].values
#y.shape

In [31]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train7, X_test7, Y_train7, Y_test7 = train_test_split(X7, Y7, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train7.shape)
print("Shape of test data:", X_test7.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (1071, 1024)
Shape of test data: (268, 1024)


In [32]:
# Defines and builds the lazyclassifier
reg7 = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models7, predictions7 = reg7.fit(X_train7, X_test7, Y_train7, Y_test7)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [14:38<00:00, 20.92s/it]


In [33]:
predictions7.sort_values(by="R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,1.1,0.73,0.68,10.35
HistGradientBoostingRegressor,1.1,0.72,0.69,29.92
GradientBoostingRegressor,1.1,0.71,0.71,22.64
ExtraTreesRegressor,1.1,0.71,0.71,23.57
RandomForestRegressor,1.1,0.71,0.71,66.88
XGBRegressor,1.11,0.68,0.74,5.71
KNeighborsRegressor,1.11,0.67,0.75,1.53
BaggingRegressor,1.12,0.67,0.75,7.47
AdaBoostRegressor,1.15,0.58,0.85,8.84
SVR,1.16,0.55,0.88,1.79


In [34]:
with pd.ExcelWriter(output/"LazyPredictResults.xlsx") as writer:
   
    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet
    predictions1.to_excel(writer, sheet_name="MACCS", )
    predictions2.to_excel(writer, sheet_name="Morgan 1024 Bits", )
    predictions3.to_excel(writer, sheet_name="MorganF 1024 Bits", )
    predictions4.to_excel(writer, sheet_name="MAP4 1024 Bits", )
    predictions5.to_excel(writer, sheet_name="Morgan 2048 Bits", )
    predictions6.to_excel(writer, sheet_name="MorganF 2048 Bits", )
    predictions7.to_excel(writer, sheet_name="MAP4 2048 Bits", )
    