In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor
from pathlib import Path
from warnings import filterwarnings

In [5]:
# Directory layout, anchored on the folder this notebook runs from.
# `_dh` is IPython's directory history; its last entry is the current directory.
HERE = Path(_dh[-1])
# Input data sits in an `input` folder two levels above the notebook directory.
HDAC6 = HERE.resolve().parents[1] / 'input'
# Result files are written below the notebook directory.
output = HERE / 'OUTPUT'

In [6]:
# Load the table holding the MACCS and 1024-bit fingerprints plus the pChEMBL target.
# NOTE(review): the file carries a ".csv" suffix but is read with pd.read_pickle,
# so it is presumably a pickled DataFrame -- consider renaming it to ".pkl".
# SECURITY: unpickling can execute arbitrary code; only load files from a trusted source.
df1 = pd.read_pickle(HDAC6/"HDAC6_1024B.csv")
# Shuffle the rows with a fixed seed. Without `random_state` the row order (and
# therefore the seeded train/test split and every reported score) would change
# on each kernel restart.
df1 = df1.sample(frac=1, random_state=42).reset_index(drop=True)
df1

Unnamed: 0,molecule_chembl_id,fp_MACCS,fp_Morgan3,fp_MorganF,fp_MAP4,pChEMBL_HDAC6
0,CHEMBL3693708,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2331191, 2478511, 1098538, 7125875, 729515, 6...",8.22
1,CHEMBL4096377,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6935510, 23887893, 6837934, 21937229, 2730481...",7.38
2,CHEMBL3415450,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[41474484, 59772959, 8033062, 599760, 583588, ...",5.65
3,CHEMBL483892,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, ...","[883566, 10511, 137380, 892555, 4030911, 30740...",8.59
4,CHEMBL1076939,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, ...","[2620858, 10511, 137380, 5988811, 4030911, 149...",8.41
...,...,...,...,...,...,...
2966,CHEMBL3353066,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4291113, 4486351, 8033062, 3637203, 38775486,...",6.89
2967,CHEMBL520279,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[16874629, 29755657, 3771845, 7855021, 2358768...",6.85
2968,CHEMBL2347009,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[13333824, 4369836, 1793697, 533014, 2682206, ...",6.88
2969,CHEMBL1214763,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[100053, 10511, 773848, 943340, 37230, 303606,...",5.49


#### Using MACCS fingerprints (166 keys; the vectors used here have 167 bits)

In [7]:
# Feature matrix: one row of MACCS bits per molecule, cast to float
X1 = np.array([bits for bits in df1['fp_MACCS']], dtype=float)
# Regression target: the pChEMBL_HDAC6 column as a plain array
Y1 = df1.loc[:, "pChEMBL_HDAC6"].values

In [8]:
# Hold out 20 % of the rows as a test set (80/20 split); the split itself is seeded
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(
    X1,
    Y1,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print("Shape of training data:", X_train1.shape)
print("Shape of test data:", X_test1.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 167)
Shape of test data: (595, 167)


In [9]:
# Benchmark LazyPredict's regressors on the MACCS features in a single call.
# `predictions1` is the table that later cells sort and export.
reg1 = LazyRegressor(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
model1, predictions1 = reg1.fit(X_train1, X_test1, Y_train1, Y_test1)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:30<00:00,  1.36it/s]


In [10]:
# Rank the benchmarked models from highest to lowest R-Squared
predictions1.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,0.5,0.64,0.69,0.26
HistGradientBoostingRegressor,0.5,0.64,0.69,7.08
RandomForestRegressor,0.5,0.64,0.69,2.57
BaggingRegressor,0.47,0.62,0.71,0.33
XGBRegressor,0.45,0.6,0.72,0.63
NuSVR,0.43,0.59,0.73,1.22
SVR,0.43,0.59,0.74,1.43
KNeighborsRegressor,0.4,0.57,0.75,0.48
GradientBoostingRegressor,0.36,0.54,0.78,1.06
MLPRegressor,0.34,0.52,0.79,4.02


### USING 1024-BIT FINGERPRINTS

In [11]:
# Morgan fingerprints, radius 3, 1024 bits
# Feature matrix: one row of fingerprint bits per molecule, cast to float
X2 = np.array([bits for bits in df1['fp_Morgan3']], dtype=float)
# Regression target: the pChEMBL_HDAC6 column as a plain array
Y2 = df1.loc[:, "pChEMBL_HDAC6"].values

In [12]:
# Hold out 20 % of the rows as a test set (80/20 split); the split itself is seeded
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X2,
    Y2,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print("Shape of training data:", X_train2.shape)
print("Shape of test data:", X_test2.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 1024)
Shape of test data: (595, 1024)


In [13]:
# Benchmark LazyPredict's regressors on the 1024-bit Morgan (radius 3) features.
# `predictions2` is the table that later cells sort and export.
reg2 = LazyRegressor(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models2, predictions2 = reg2.fit(X_train2, X_test2, Y_train2, Y_test2)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [04:19<00:00,  6.18s/it]


In [14]:
# Rank the benchmarked models from highest to lowest R-Squared.
# NOTE: the test set has fewer rows (595) than there are features (1024), so the
# Adjusted R-Squared column exceeds 1 and is not meaningful here; compare models
# on R-Squared / RMSE instead.
predictions2.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.43,0.69,0.64,20.81
LGBMRegressor,1.46,0.66,0.67,1.17
HistGradientBoostingRegressor,1.46,0.66,0.67,37.11
BaggingRegressor,1.48,0.65,0.68,3.12
XGBRegressor,1.48,0.65,0.68,4.09
SVR,1.51,0.63,0.7,8.52
NuSVR,1.52,0.63,0.7,8.23
GradientBoostingRegressor,1.53,0.62,0.71,10.8
GammaRegressor,1.56,0.6,0.73,0.19
GeneralizedLinearRegressor,1.56,0.59,0.73,0.22


In [15]:
# Feature-based Morgan fingerprints, radius 3, 1024 bits
# Feature matrix: one row of fingerprint bits per molecule, cast to float
X3 = np.array([bits for bits in df1['fp_MorganF']], dtype=float)
# Regression target: the pChEMBL_HDAC6 column as a plain array
Y3 = df1.loc[:, "pChEMBL_HDAC6"].values

In [16]:
# Hold out 20 % of the rows as a test set (80/20 split); the split itself is seeded
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(
    X3,
    Y3,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print("Shape of training data:", X_train3.shape)
print("Shape of test data:", X_test3.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 1024)
Shape of test data: (595, 1024)


In [17]:
# Benchmark LazyPredict's regressors on the 1024-bit feature-based Morgan features.
# `predictions3` is the table that later cells sort and export.
reg3 = LazyRegressor(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models3, predictions3 = reg3.fit(X_train3, X_test3, Y_train3, Y_test3)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [04:22<00:00,  6.26s/it]


In [18]:
# Rank the benchmarked models from highest to lowest R-Squared.
# NOTE: the test set has fewer rows (595) than there are features (1024), so the
# Adjusted R-Squared column exceeds 1 and is not meaningful here; compare models
# on R-Squared / RMSE instead.
predictions3.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.44,0.68,0.65,21.26
XGBRegressor,1.44,0.68,0.65,3.22
HistGradientBoostingRegressor,1.45,0.68,0.65,23.85
LGBMRegressor,1.45,0.68,0.65,1.67
BaggingRegressor,1.48,0.65,0.68,2.65
SVR,1.5,0.63,0.7,8.37
NuSVR,1.51,0.63,0.7,9.1
GradientBoostingRegressor,1.53,0.62,0.71,10.61
LassoLarsCV,1.53,0.62,0.71,7.73
ElasticNetCV,1.53,0.62,0.71,32.37


In [19]:
# MAP4 fingerprints, 1024 entries per molecule
# Feature matrix: one row of MAP4 values per molecule, cast to float
X4 = np.array([values for values in df1['fp_MAP4']], dtype=float)
# Regression target: the pChEMBL_HDAC6 column as a plain array
Y4 = df1.loc[:, "pChEMBL_HDAC6"].values

In [20]:
# Hold out 20 % of the rows as a test set (80/20 split); the split itself is seeded
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(
    X4,
    Y4,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print("Shape of training data:", X_train4.shape)
print("Shape of test data:", X_test4.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 1024)
Shape of test data: (595, 1024)


In [21]:
# Benchmark LazyPredict's regressors (not classifiers) on the 1024-entry MAP4 features.
# `predictions4` is the table that later cells sort and export.
reg4 = LazyRegressor(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models4, predictions4 = reg4.fit(X_train4, X_test4, Y_train4, Y_test4)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [12:20<00:00, 17.63s/it]


In [22]:
# Rank the benchmarked models from highest to lowest R-Squared.
# NOTE: the test set has fewer rows (595) than there are features (1024), so the
# Adjusted R-Squared column exceeds 1 and is not meaningful here; compare models
# on R-Squared / RMSE instead.
predictions4.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,1.47,0.66,0.67,12.45
HistGradientBoostingRegressor,1.47,0.66,0.67,33.16
RandomForestRegressor,1.49,0.65,0.68,106.93
ExtraTreesRegressor,1.5,0.64,0.69,48.4
GradientBoostingRegressor,1.52,0.62,0.71,47.15
XGBRegressor,1.53,0.61,0.71,10.46
BaggingRegressor,1.58,0.58,0.75,11.34
SVR,1.62,0.55,0.77,8.67
NuSVR,1.64,0.54,0.78,7.63
KNeighborsRegressor,1.64,0.53,0.79,4.05


In [23]:
# Save the 1024-entry MAP4 score table as CSV; the relative path resolves
# against the current working directory.
predictions4.to_csv(path_or_buf='LazyPredict_MAP4.csv')

## USING 2048-BIT FINGERPRINTS

In [24]:
# Load the table holding the 2048-bit fingerprints plus the pChEMBL target.
# NOTE(review): the file carries a ".csv" suffix but is read with pd.read_pickle,
# so it is presumably a pickled DataFrame -- consider renaming it to ".pkl".
# SECURITY: unpickling can execute arbitrary code; only load files from a trusted source.
df2 = pd.read_pickle(HDAC6/"HDAC6_2048B.csv")
# Shuffle the rows with a fixed seed. Without `random_state` the row order (and
# therefore the seeded train/test split and every reported score) would change
# on each kernel restart.
df2 = df2.sample(frac=1, random_state=42).reset_index(drop=True)
df2

Unnamed: 0,molecule_chembl_id,fp_Morgan3,fp_MorganF,fp_MAP4,pChEMBL_HDAC6
0,CHEMBL3589347,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1689662, 3821889, 137380, 3789392, 7016357, 1...",5.80
1,CHEMBL113537,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[21627151, 3821889, 137380, 21312078, 7332227,...",5.35
2,CHEMBL2386910,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[776155, 3913535, 2043088, 476717, 3027463, 18...",6.58
3,CHEMBL2018303,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 3913535, 8033062, 2817337, 16526612,...",8.65
4,CHEMBL4105617,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3028919, 91412, 3771845, 22029617, 11860068, ...",4.85
...,...,...,...,...,...
2966,CHEMBL1836042,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4209369, 3827191, 2475943, 3132350, 2869217, ...",6.21
2967,CHEMBL4641253,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[41474484, 7265840, 8033062, 20867242, 4741152...",7.22
2968,CHEMBL3670667,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3192076, 401183, 1578989, 587110, 19178293, 1...",8.18
2969,CHEMBL515285,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[3192076, 10511, 137380, 9227729, 4030911, 163...",4.30


In [25]:
#2048 bits Morgan fingerprints with radius of 3
# Feature matrix: one row of 2048-bit Morgan (radius 3) bits per molecule, cast to float
X5 = np.array([bits for bits in df2['fp_Morgan3']], dtype=float)
# Regression target: the pChEMBL_HDAC6 column as a plain array
Y5 = df2.loc[:, "pChEMBL_HDAC6"].values

In [26]:
# Hold out 20 % of the rows as a test set (80/20 split); the split itself is seeded
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(
    X5,
    Y5,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print("Shape of training data:", X_train5.shape)
print("Shape of test data:", X_test5.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 2048)
Shape of test data: (595, 2048)


In [27]:
# Benchmark LazyPredict's regressors on the 2048-bit Morgan (radius 3) features.
# `predictions5` is the table that later cells sort and export.
reg5 = LazyRegressor(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models5, predictions5 = reg5.fit(X_train5, X_test5, Y_train5, Y_test5)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [30:25<00:00, 43.47s/it]


In [28]:
# Rank the benchmarked models from highest to lowest R-Squared.
# NOTE: the test set has fewer rows (595) than there are features (2048), so the
# Adjusted R-Squared column exceeds 1 and is not meaningful here; compare models
# on R-Squared / RMSE instead.
predictions5.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.11,0.73,0.59,42.62
LGBMRegressor,1.11,0.73,0.59,2.93
HistGradientBoostingRegressor,1.11,0.73,0.59,64.84
BaggingRegressor,1.11,0.72,0.61,5.72
XGBRegressor,1.13,0.68,0.65,6.35
TweedieRegressor,1.14,0.65,0.68,0.44
GeneralizedLinearRegressor,1.14,0.65,0.68,0.46
GammaRegressor,1.14,0.65,0.68,0.44
BayesianRidge,1.15,0.64,0.68,9.36
ElasticNetCV,1.15,0.64,0.69,422.23


In [29]:
# Feature-based Morgan fingerprints, 2048 bits
# Feature matrix: one row of fingerprint bits per molecule, cast to float
X6 = np.array([bits for bits in df2['fp_MorganF']], dtype=float)
# Regression target: the pChEMBL_HDAC6 column as a plain array
Y6 = df2.loc[:, "pChEMBL_HDAC6"].values

In [30]:
# Hold out 20 % of the rows as a test set (80/20 split); the split itself is seeded
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(
    X6,
    Y6,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print("Shape of training data:", X_train6.shape)
print("Shape of test data:", X_test6.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 2048)
Shape of test data: (595, 2048)


In [31]:
# Benchmark LazyPredict's regressors on the 2048-bit feature-based Morgan features.
# `predictions6` is the table that later cells sort and export.
reg6 = LazyRegressor(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models6, predictions6 = reg6.fit(X_train6, X_test6, Y_train6, Y_test6)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [32:36<00:00, 46.57s/it]


In [32]:
# Rank the benchmarked models from highest to lowest R-Squared.
# NOTE: the test set has fewer rows (595) than there are features (2048), so the
# Adjusted R-Squared column exceeds 1 and is not meaningful here; compare models
# on R-Squared / RMSE instead.
predictions6.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,1.11,0.74,0.58,34.92
BaggingRegressor,1.11,0.72,0.61,4.81
LGBMRegressor,1.12,0.72,0.61,2.53
HistGradientBoostingRegressor,1.12,0.72,0.61,68.16
XGBRegressor,1.12,0.71,0.62,6.16
TweedieRegressor,1.14,0.65,0.67,0.49
GeneralizedLinearRegressor,1.14,0.65,0.67,0.44
GammaRegressor,1.14,0.65,0.68,0.42
ElasticNetCV,1.14,0.65,0.68,489.34
BayesianRidge,1.14,0.65,0.68,9.33


In [33]:
# MAP4 fingerprints, 2048 entries per molecule
# Feature matrix: one row of MAP4 values per molecule, cast to float
X7 = np.array([values for values in df2['fp_MAP4']], dtype=float)
# Regression target: the pChEMBL_HDAC6 column as a plain array
Y7 = df2.loc[:, "pChEMBL_HDAC6"].values

In [34]:
# Hold out 20 % of the rows as a test set (80/20 split); the split itself is seeded
X_train7, X_test7, Y_train7, Y_test7 = train_test_split(
    X7,
    Y7,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both partitions
print("Shape of training data:", X_train7.shape)
print("Shape of test data:", X_test7.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 2048)
Shape of test data: (595, 2048)


In [35]:
# Benchmark LazyPredict's regressors (not classifiers) on the 2048-entry MAP4 features.
# `predictions7` is the table that later cells sort and export.
reg7 = LazyRegressor(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models7, predictions7 = reg7.fit(X_train7, X_test7, Y_train7, Y_test7)

100%|███████████████████████████████████████████████████████████████████████████████| 42/42 [1:15:54<00:00, 108.45s/it]


In [36]:
# Rank the benchmarked models from highest to lowest R-Squared.
# NOTE: the test set has fewer rows (595) than there are features (2048), so the
# Adjusted R-Squared column exceeds 1 and is not meaningful here; compare models
# on R-Squared / RMSE instead.
predictions7.sort_values("R-Squared", ascending=False)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,1.12,0.71,0.62,103.44
LGBMRegressor,1.12,0.7,0.62,31.84
RandomForestRegressor,1.13,0.68,0.65,247.06
XGBRegressor,1.13,0.67,0.66,23.12
ExtraTreesRegressor,1.14,0.66,0.67,106.82
GradientBoostingRegressor,1.14,0.65,0.68,98.93
BaggingRegressor,1.15,0.62,0.7,26.69
SVR,1.16,0.61,0.72,17.71
NuSVR,1.17,0.59,0.74,16.55
KNeighborsRegressor,1.19,0.54,0.78,9.41


In [37]:
# Collect every LazyPredict score table in one workbook, one sheet per fingerprint.
# The export directory is created first: nothing else in the notebook creates it,
# and ExcelWriter fails if the parent folder is missing.
output.mkdir(parents=True, exist_ok=True)

# Sheet name -> score table; insertion order is the sheet order in the workbook.
results_by_sheet = {
    "MACCS": predictions1,
    "Morgan 1024 Bits": predictions2,
    "MorganF 1024 Bits": predictions3,
    "MAP4 1024 Bits": predictions4,
    "Morgan 2048 Bits": predictions5,
    "MorganF 2048 Bits": predictions6,
    "MAP4 2048 Bits": predictions7,
}

with pd.ExcelWriter(output/"LazyPredictResults.xlsx") as writer:
    for sheet_name, scores in results_by_sheet.items():
        # Each table is written with its index (the model names) included
        scores.to_excel(writer, sheet_name=sheet_name)