In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor
from pathlib import Path
from warnings import filterwarnings

In [6]:
# Resolve project paths relative to this notebook.
# `_dh` is IPython's directory history; its last entry is the most recent
# directory of the kernel (normally the folder containing this notebook).
HERE = Path(_dh[-1])
# Input data is read from `input/`, two levels above the resolved notebook folder.
HDAC1 = HERE.resolve().parents[1] / 'input'
# Results are written to `OUTPUT/` next to the notebook. Create it up front so the
# Excel export in the last cell cannot fail on a missing directory after all the
# long-running model fits have finished.
output = HERE / 'OUTPUT'
output.mkdir(parents=True, exist_ok=True)

In [7]:
# Load the table holding the 1024-bit fingerprints and the pChEMBL target.
# NOTE(review): the file carries a .csv extension but is read with read_pickle, so it
# is presumably a pickled DataFrame - confirm. Unpickling can execute arbitrary code,
# so only load this file from a trusted source.
df1 = pd.read_pickle(HDAC1/"HDAC1_1024B.csv")
# Shuffle the rows with a fixed seed. An unseeded shuffle changes the row order on
# every run, so the later train_test_split(..., random_state=42) calls would select
# different compounds each time and the reported metrics would not be reproducible.
df1 = df1.sample(frac=1, random_state=42).reset_index(drop=True)
df1

Unnamed: 0,molecule_chembl_id,fp_MACCS,fp_Morgan3,fp_MorganF,fp_MAP4,pChEMBL_HDAC1
0,CHEMBL1934909,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, ...","[5547132, 19118050, 3082320, 23833697, 235748,...",4.85
1,CHEMBL2431901,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 7588115, 8033062, 3196706, 1293667, ...",4.35
2,CHEMBL2425958,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[13333824, 10415845, 3082320, 20854601, 661505...",5.73
3,CHEMBL3648285,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[11012881, 205153, 7998790, 7585349, 9368293, ...",6.94
4,CHEMBL4299491,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[100053, 10511, 1733887, 352257, 692898, 28201...",6.68
...,...,...,...,...,...,...
4487,CHEMBL1084356,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6957840, 337691, 8033062, 3421909, 492964, 13...",7.70
4488,CHEMBL3286734,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","[4389752, 799448, 1092969, 6018751, 28861, 846...",7.85
4489,CHEMBL113537,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, ...","[21627151, 3821889, 137380, 21312078, 7332227,...",5.01
4490,CHEMBL4453484,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, ...","[4799804, 11204240, 1578989, 6116087, 1838432,...",8.28


#### Using MACCS fingerprints (166 keys; the stored vectors have 167 bits)

In [8]:
# Feature matrix: one row per compound, built from the per-row MACCS fingerprint lists.
X1 = np.array(df1["fp_MACCS"].tolist()).astype(float)
# Regression target: the pChEMBL value of each compound.
Y1 = df1["pChEMBL_HDAC1"].values

In [9]:
# Hold out 20% of the compounds as a test set (80/20 split).
# random_state=42 fixes the shuffling performed by train_test_split itself.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(
    X1,
    Y1,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, features) shape of each partition.
print("Shape of training data:", X_train1.shape)
print("Shape of test data:", X_test1.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (3593, 167)
Shape of test data: (899, 167)


In [10]:
# Benchmark LazyRegressor's model suite on the MACCS split.
# `predictions1` is the per-model metric table that is ranked in the next cell.
reg1 = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
model1, predictions1 = reg1.fit(X_train1, X_test1, Y_train1, Y_test1)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:47<00:00,  1.13s/it]


In [11]:
# Rank the benchmarked models by R-Squared, best first.
predictions1.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,0.57,0.65,0.68,4.28
XGBRegressor,0.54,0.63,0.7,0.78
BaggingRegressor,0.53,0.62,0.71,0.56
HistGradientBoostingRegressor,0.5,0.6,0.73,7.11
LGBMRegressor,0.5,0.6,0.73,0.33
SVR,0.47,0.57,0.75,3.32
NuSVR,0.46,0.56,0.76,2.7
MLPRegressor,0.42,0.53,0.78,6.25
KNeighborsRegressor,0.38,0.5,0.81,0.97
GradientBoostingRegressor,0.29,0.42,0.87,1.45


### USING 1024-BIT FINGERPRINTS

In [12]:
# Morgan fingerprints (radius 3, 1024 bits).
# Feature matrix: one row per compound, built from the per-row fingerprint lists.
X2 = np.array(df1["fp_Morgan3"].tolist()).astype(float)
# Regression target: the pChEMBL value of each compound.
Y2 = df1["pChEMBL_HDAC1"].values

In [13]:
# Hold out 20% of the compounds as a test set (80/20 split).
# random_state=42 fixes the shuffling performed by train_test_split itself.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X2,
    Y2,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, features) shape of each partition.
print("Shape of training data:", X_train2.shape)
print("Shape of test data:", X_test2.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (3593, 1024)
Shape of test data: (899, 1024)


In [14]:
# Benchmark LazyRegressor's model suite on the 1024-bit Morgan split.
# `predictions2` is the per-model metric table that is ranked in the next cell.
reg2 = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models2, predictions2 = reg2.fit(X_train2, X_test2, Y_train2, Y_test2)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [06:41<00:00,  9.55s/it]


In [15]:
# Rank the benchmarked models by R-Squared, best first.
# NOTE(review): the Adjusted R-Squared column shows values above 1 for this run. That
# is consistent with the adjustment 1 - (1 - R2)(n - 1)/(n - p - 1) having a negative
# denominator when the 899 test rows are fewer than the 1024 features, so presumably
# only the plain R-Squared and RMSE columns are meaningful here - confirm.
predictions2.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,3.27,0.68,0.65,37.03
XGBRegressor,3.34,0.67,0.66,5.3
LGBMRegressor,3.37,0.67,0.66,1.29
HistGradientBoostingRegressor,3.37,0.67,0.66,27.92
SVR,3.48,0.65,0.67,19.48
NuSVR,3.51,0.65,0.68,19.69
BaggingRegressor,3.52,0.65,0.68,5.0
KNeighborsRegressor,3.91,0.59,0.73,6.92
BayesianRidge,4.09,0.57,0.75,1.29
PoissonRegressor,4.1,0.57,0.75,0.66


In [16]:
# Feature-based Morgan fingerprints (radius 3, 1024 bits).
# Feature matrix: one row per compound, built from the per-row fingerprint lists.
X3 = np.array(df1["fp_MorganF"].tolist()).astype(float)
# Regression target: the pChEMBL value of each compound.
Y3 = df1["pChEMBL_HDAC1"].values

In [17]:
# Hold out 20% of the compounds as a test set (80/20 split).
# random_state=42 fixes the shuffling performed by train_test_split itself.
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(
    X3,
    Y3,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, features) shape of each partition.
print("Shape of training data:", X_train3.shape)
print("Shape of test data:", X_test3.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (3593, 1024)
Shape of test data: (899, 1024)


In [18]:
# Benchmark LazyRegressor's model suite on the 1024-bit feature-Morgan split.
# `predictions3` is the per-model metric table that is ranked in the next cell.
reg3 = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models3, predictions3 = reg3.fit(X_train3, X_test3, Y_train3, Y_test3)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [07:05<00:00, 10.13s/it]


In [19]:
# Rank the benchmarked models by R-Squared, best first.
# NOTE(review): the Adjusted R-Squared column shows values above 1 for this run. That
# is consistent with the adjustment 1 - (1 - R2)(n - 1)/(n - p - 1) having a negative
# denominator when the 899 test rows are fewer than the 1024 features, so presumably
# only the plain R-Squared and RMSE columns are meaningful here - confirm.
predictions3.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,3.1,0.71,0.62,35.44
LGBMRegressor,3.33,0.67,0.65,1.16
HistGradientBoostingRegressor,3.33,0.67,0.65,24.92
XGBRegressor,3.39,0.66,0.66,5.1
BaggingRegressor,3.42,0.66,0.67,4.95
SVR,3.58,0.64,0.69,19.43
NuSVR,3.62,0.63,0.69,18.93
KNeighborsRegressor,4.15,0.56,0.76,7.95
ElasticNetCV,4.18,0.55,0.76,66.38
LassoCV,4.18,0.55,0.76,66.33


In [20]:
# MAP4 fingerprints (1024 entries per compound).
# NOTE(review): the table preview shows large integers in fp_MAP4 rather than 0/1 bits,
# so these are presumably hash values; treating them as continuous features may not be
# meaningful for every model - confirm.
X4 = np.array(df1["fp_MAP4"].tolist()).astype(float)
# Regression target: the pChEMBL value of each compound.
Y4 = df1["pChEMBL_HDAC1"].values

In [21]:
# Hold out 20% of the compounds as a test set (80/20 split).
# random_state=42 fixes the shuffling performed by train_test_split itself.
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(
    X4,
    Y4,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, features) shape of each partition.
print("Shape of training data:", X_train4.shape)
print("Shape of test data:", X_test4.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (3593, 1024)
Shape of test data: (899, 1024)


In [22]:
# Benchmark LazyRegressor's model suite (regressors, not classifiers) on the
# 1024-entry MAP4 split.
# `predictions4` is the per-model metric table that is ranked in the next cell.
reg4 = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models4, predictions4 = reg4.fit(X_train4, X_test4, Y_train4, Y_test4)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [15:34<00:00, 22.26s/it]


In [23]:
# Rank the benchmarked models by R-Squared, best first.
# NOTE(review): the Adjusted R-Squared column shows values above 1 for this run. That
# is consistent with the adjustment 1 - (1 - R2)(n - 1)/(n - p - 1) having a negative
# denominator when the 899 test rows are fewer than the 1024 features, so presumably
# only the plain R-Squared and RMSE columns are meaningful here - confirm.
predictions4.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,3.43,0.66,0.67,14.54
HistGradientBoostingRegressor,3.44,0.66,0.67,34.18
XGBRegressor,3.73,0.62,0.71,13.8
RandomForestRegressor,3.76,0.61,0.71,192.52
ExtraTreesRegressor,3.93,0.59,0.73,75.53
KNeighborsRegressor,3.99,0.58,0.74,8.03
GradientBoostingRegressor,4.07,0.57,0.75,71.84
BaggingRegressor,4.13,0.56,0.76,20.44
SVR,4.41,0.52,0.79,19.53
NuSVR,4.5,0.51,0.8,17.55


## USING 2048-BIT FINGERPRINTS

In [24]:
# Load the table holding the 2048-bit fingerprints and the pChEMBL target.
# NOTE(review): the file carries a .csv extension but is read with read_pickle, so it
# is presumably a pickled DataFrame - confirm. Unpickling can execute arbitrary code,
# so only load this file from a trusted source.
df2 = pd.read_pickle(HDAC1/"HDAC1_2048B.csv")
# Shuffle the rows with a fixed seed. An unseeded shuffle changes the row order on
# every run, so the later train_test_split(..., random_state=42) calls would select
# different compounds each time and the reported metrics would not be reproducible.
df2 = df2.sample(frac=1, random_state=42).reset_index(drop=True)
df2

Unnamed: 0,molecule_chembl_id,fp_Morgan3,fp_MorganF,fp_MAP4,pChEMBL_HDAC1
0,CHEMBL1095437,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4035923, 6839885, 7998790, 189507, 22962049, ...",7.92
1,CHEMBL3890451,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[9593141, 3346815, 11111289, 7467572, 17276568...",6.14
2,CHEMBL3670680,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[3568556, 3234227, 1578989, 11354240, 6804879,...",6.48
3,CHEMBL517370,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3192076, 3346815, 1578989, 5114342, 15143291,...",6.77
4,CHEMBL4204967,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[9413970, 30699112, 11223021, 1561872, 3454932...",4.24
...,...,...,...,...,...
4487,CHEMBL2047544,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[799147, 2011138, 6390501, 4630274, 193508, 99...",6.42
4488,CHEMBL4646807,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[4007266, 2349897, 4250906, 17455853, 7332227,...",8.00
4489,CHEMBL472233,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[17582129, 10511, 3771845, 1988498, 5554620, 1...",6.96
4490,CHEMBL4576202,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1604544, 2642971, 2475943, 6714383, 6083718, ...",8.06


In [25]:
# Morgan fingerprints (radius 3, 2048 bits).
# Feature matrix: one row per compound, built from the per-row fingerprint lists.
X5 = np.array(df2["fp_Morgan3"].tolist()).astype(float)
# Regression target: the pChEMBL value of each compound.
Y5 = df2["pChEMBL_HDAC1"].values

In [26]:
# Hold out 20% of the compounds as a test set (80/20 split).
# random_state=42 fixes the shuffling performed by train_test_split itself.
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(
    X5,
    Y5,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, features) shape of each partition.
print("Shape of training data:", X_train5.shape)
print("Shape of test data:", X_test5.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (3593, 2048)
Shape of test data: (899, 2048)


In [27]:
# Benchmark LazyRegressor's model suite on the 2048-bit Morgan split.
# `predictions5` is the per-model metric table that is ranked in the next cell.
reg5 = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models5, predictions5 = reg5.fit(X_train5, X_test5, Y_train5, Y_test5)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [29:10<00:00, 41.68s/it]


In [28]:
# Rank the benchmarked models by R-Squared, best first.
# NOTE(review): the Adjusted R-Squared column shows values above 1 for this run. That
# is consistent with the adjustment 1 - (1 - R2)(n - 1)/(n - p - 1) having a negative
# denominator when the 899 test rows are fewer than the 2048 features, so presumably
# only the plain R-Squared and RMSE columns are meaningful here - confirm.
predictions5.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.25,0.68,0.65,69.84
LGBMRegressor,1.25,0.68,0.66,2.19
HistGradientBoostingRegressor,1.25,0.68,0.66,65.86
BaggingRegressor,1.27,0.66,0.68,9.35
XGBRegressor,1.28,0.64,0.7,9.87
BayesianRidge,1.3,0.62,0.71,9.54
PoissonRegressor,1.31,0.61,0.72,1.31
SVR,1.31,0.61,0.72,37.93
NuSVR,1.31,0.6,0.73,38.33
ElasticNetCV,1.31,0.6,0.73,386.47


In [29]:
# Feature-based Morgan fingerprints (2048 bits).
# Feature matrix: one row per compound, built from the per-row fingerprint lists.
X6 = np.array(df2["fp_MorganF"].tolist()).astype(float)
# Regression target: the pChEMBL value of each compound.
Y6 = df2["pChEMBL_HDAC1"].values

In [30]:
# Hold out 20% of the compounds as a test set (80/20 split).
# random_state=42 fixes the shuffling performed by train_test_split itself.
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(
    X6,
    Y6,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, features) shape of each partition.
print("Shape of training data:", X_train6.shape)
print("Shape of test data:", X_test6.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (3593, 2048)
Shape of test data: (899, 2048)


In [31]:
# Benchmark LazyRegressor's model suite on the 2048-bit feature-Morgan split.
# `predictions6` is the per-model metric table that is ranked in the next cell.
reg6 = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models6, predictions6 = reg6.fit(X_train6, X_test6, Y_train6, Y_test6)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [28:06<00:00, 40.15s/it]


In [32]:
# Rank the benchmarked models by R-Squared, best first.
# NOTE(review): the Adjusted R-Squared column shows values above 1 for this run. That
# is consistent with the adjustment 1 - (1 - R2)(n - 1)/(n - p - 1) having a negative
# denominator when the 899 test rows are fewer than the 2048 features, so presumably
# only the plain R-Squared and RMSE columns are meaningful here - confirm.
predictions6.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.25,0.68,0.65,74.73
HistGradientBoostingRegressor,1.25,0.68,0.66,64.52
LGBMRegressor,1.25,0.68,0.66,2.35
BaggingRegressor,1.26,0.66,0.67,8.7
XGBRegressor,1.28,0.65,0.69,9.17
SVR,1.31,0.6,0.73,38.73
NuSVR,1.31,0.6,0.73,36.8
ElasticNetCV,1.32,0.59,0.74,377.81
LassoCV,1.32,0.59,0.74,389.73
BayesianRidge,1.33,0.58,0.75,9.45


In [33]:
# MAP4 fingerprints (2048 entries per compound).
# NOTE(review): the table preview shows large integers in fp_MAP4 rather than 0/1 bits,
# so these are presumably hash values; treating them as continuous features may not be
# meaningful for every model - confirm.
X7 = np.array(df2["fp_MAP4"].tolist()).astype(float)
# Regression target: the pChEMBL value of each compound.
Y7 = df2["pChEMBL_HDAC1"].values

In [34]:
# Hold out 20% of the compounds as a test set (80/20 split).
# random_state=42 fixes the shuffling performed by train_test_split itself.
X_train7, X_test7, Y_train7, Y_test7 = train_test_split(
    X7,
    Y7,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, features) shape of each partition.
print("Shape of training data:", X_train7.shape)
print("Shape of test data:", X_test7.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (3593, 2048)
Shape of test data: (899, 2048)


In [35]:
# Benchmark LazyRegressor's model suite (regressors, not classifiers) on the
# 2048-entry MAP4 split.
# `predictions7` is the per-model metric table that is ranked in the next cell.
reg7 = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models7, predictions7 = reg7.fit(X_train7, X_test7, Y_train7, Y_test7)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [50:06<00:00, 71.58s/it]


In [36]:
# Rank the benchmarked models by R-Squared, best first.
# NOTE(review): the Adjusted R-Squared column shows values above 1 for this run. That
# is consistent with the adjustment 1 - (1 - R2)(n - 1)/(n - p - 1) having a negative
# denominator when the 899 test rows are fewer than the 2048 features, so presumably
# only the plain R-Squared and RMSE columns are meaningful here - confirm.
predictions7.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,1.26,0.66,0.67,33.87
HistGradientBoostingRegressor,1.26,0.66,0.67,96.23
XGBRegressor,1.31,0.61,0.72,29.02
RandomForestRegressor,1.32,0.6,0.74,439.23
GradientBoostingRegressor,1.32,0.59,0.74,146.05
ExtraTreesRegressor,1.33,0.58,0.75,155.72
SVR,1.33,0.57,0.75,39.8
KNeighborsRegressor,1.33,0.57,0.76,14.57
NuSVR,1.34,0.56,0.77,41.06
BaggingRegressor,1.38,0.51,0.81,43.2


In [37]:
# Export every metric table to its own sheet of a single Excel workbook.
# The mapping keeps sheet names and tables side by side; dict order is the sheet order.
sheets = {
    "MACCS": predictions1,
    "Morgan 1024 Bits": predictions2,
    "MorganF 1024 Bits": predictions3,
    "MAP4 1024 Bits": predictions4,
    "Morgan 2048 Bits": predictions5,
    "MorganF 2048 Bits": predictions6,
    "MAP4 2048 Bits": predictions7,
}
with pd.ExcelWriter(output/"LazyPredictResults.xlsx") as writer:
    for sheet_name, table in sheets.items():
        # The index (model names) is written too, matching to_excel's default.
        table.to_excel(writer, sheet_name=sheet_name)
    