In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyClassifier
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Set path to this notebook
HERE = Path(_dh[-1])
HDAC6 = Path(HERE).resolve().parents[1]/'input'
output = HERE/'OUTPUT'

In [11]:
df1 = pd.read_pickle(HDAC6/"HDAC6_1024B.csv")
df1 = df1.sample(frac=1).reset_index(drop=True)
df1.head(5)

Unnamed: 0,molecule_chembl_id,fp_MACCS,fp_Morgan3,fp_MorganF,fp_MAP4,pChEMBL_HDAC6
0,CHEMBL3339010,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, ...","[1705, 4290185, 111326, 552416, 13986880, 2298...",7.62
1,CHEMBL3655945,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, ...","[35600637, 2478511, 10872982, 16070626, 119321...",7.72
2,CHEMBL4070745,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","[5196932, 7212973, 3844537, 7125875, 28861, 15...",8.21
3,CHEMBL3104851,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[18342286, 2349897, 17428153, 3351916, 5886681...",5.34
4,CHEMBL4465218,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3176507, 16443623, 1495766, 1162638, 13730070...",8.24


In [12]:
# Add column for activity
df1["activity"] = np.zeros(len(df1))

# Mark every molecule as active with an pIC50 of >= 6.6, 0 otherwise
df1.loc[df1[df1.pChEMBL_HDAC6 >= 6.6].index, "activity"] = 1.0

# NBVAL_CHECK_OUTPUT
print("Number of active compounds:", int(df1.activity.sum()))
print("Number of inactive compounds:", len(df1) - int(df1.activity.sum()))

Number of active compounds: 1837
Number of inactive compounds: 1134


#### By using MACCS fingerprints that are 166 bits 

In [13]:
X1 = np.array(list((df1['fp_MACCS']))).astype(float)
#X.shape
Y1 = df1["activity"].values
#y.shape

In [14]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train1.shape)
print("Shape of test data:", X_test1.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 167)
Shape of test data: (595, 167)


In [15]:
# Defines and builds the LazyClassifier
clf1 =  LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
model1,predictions1 = clf1.fit(X_train1, X_test1, Y_train1, Y_test1)

 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [00:12<00:01,  2.37it/s]



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:12<00:00,  2.23it/s]


In [16]:
predictions1.sort_values(by="F1 Score", ascending=False).head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BaggingClassifier,0.84,0.83,0.83,0.84,0.29
XGBClassifier,0.83,0.82,0.82,0.83,0.7
LGBMClassifier,0.83,0.81,0.81,0.83,0.27
ExtraTreesClassifier,0.83,0.81,0.81,0.82,0.56
RandomForestClassifier,0.82,0.81,0.81,0.82,0.59
SVC,0.82,0.8,0.8,0.82,1.12
NuSVC,0.82,0.8,0.8,0.82,1.35
KNeighborsClassifier,0.8,0.79,0.79,0.8,0.6
DecisionTreeClassifier,0.78,0.78,0.78,0.79,0.06
ExtraTreeClassifier,0.78,0.77,0.77,0.78,0.06


### USING 1024 BITS FINGERPRINTS

In [17]:
#By using Morgan fingerprints with radius of 3 and 1024 bits
X2 = np.array(list((df1['fp_Morgan3']))).astype(float)
#X.shape
Y2 = df1["activity"].values
#y.shape

In [18]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train2.shape)
print("Shape of test data:", X_test2.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 1024)
Shape of test data: (595, 1024)


In [19]:
clf2 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models2, predictions2 = clf2.fit(X_train2, X_test2, Y_train2, Y_test2)

 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [00:55<00:09,  3.14s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [01:01<00:00,  2.11s/it]


In [20]:
predictions2.sort_values(by="F1 Score", ascending=False).head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.87,0.85,0.85,0.87,1.21
ExtraTreesClassifier,0.87,0.85,0.85,0.86,1.55
NuSVC,0.84,0.81,0.81,0.84,9.87
SVC,0.84,0.81,0.81,0.84,8.0
BaggingClassifier,0.84,0.82,0.82,0.84,2.99
XGBClassifier,0.83,0.81,0.81,0.83,4.24
LGBMClassifier,0.83,0.82,0.82,0.83,1.04
DecisionTreeClassifier,0.8,0.79,0.79,0.8,0.46
KNeighborsClassifier,0.79,0.76,0.76,0.79,3.53
RidgeClassifierCV,0.78,0.77,0.77,0.78,1.16


In [21]:
#By using Morgan fingerprints with radius of 3 and 1024 bits and feautures
X3 = np.array(list((df1['fp_MorganF']))).astype(float)
#X.shape
Y3 = df1["activity"].values
#y.shape

In [22]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y3, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train3.shape)
print("Shape of test data:", X_test3.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 1024)
Shape of test data: (595, 1024)


In [23]:
clf3 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models3, predictions3 = clf3.fit(X_train3, X_test3, Y_train3, Y_test3)

 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [00:50<00:09,  3.04s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:56<00:00,  1.94s/it]


In [24]:
predictions3.sort_values(by="F1 Score", ascending=False).head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.86,0.84,0.84,0.86,1.68
BaggingClassifier,0.85,0.84,0.84,0.85,2.79
RandomForestClassifier,0.85,0.83,0.83,0.85,1.18
ExtraTreesClassifier,0.85,0.83,0.83,0.85,1.63
XGBClassifier,0.84,0.82,0.82,0.84,4.1
NuSVC,0.83,0.8,0.8,0.82,8.24
SVC,0.83,0.79,0.79,0.82,7.52
AdaBoostClassifier,0.82,0.8,0.8,0.82,2.97
DecisionTreeClassifier,0.79,0.78,0.78,0.79,0.44
ExtraTreeClassifier,0.79,0.77,0.77,0.79,0.18


In [25]:
#By using MAP4 fingerprints with 1024 bits
X4 = np.array(list((df1['fp_MAP4']))).astype(float)
#X.shape
Y4 = df1["activity"].values
#y.shape

In [26]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X4, Y4, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train4.shape)
print("Shape of test data:", X_test4.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 1024)
Shape of test data: (595, 1024)


In [27]:
# Defines and builds the lazyclassifier
clf4 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models4, predictions4 = clf4.fit(X_train4, X_test4, Y_train4, Y_test4)

 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [01:36<00:11,  3.92s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [02:03<00:00,  4.25s/it]


In [28]:
predictions4.sort_values(by="F1 Score", ascending=False).head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.85,0.83,0.83,0.85,15.61
RandomForestClassifier,0.85,0.82,0.82,0.84,6.07
XGBClassifier,0.84,0.82,0.82,0.84,10.95
BaggingClassifier,0.82,0.81,0.81,0.82,24.23
ExtraTreesClassifier,0.82,0.78,0.78,0.81,2.05
KNeighborsClassifier,0.8,0.77,0.77,0.79,4.96
NuSVC,0.79,0.76,0.76,0.79,8.95
AdaBoostClassifier,0.79,0.76,0.76,0.78,11.7
SVC,0.78,0.74,0.74,0.77,9.13
DecisionTreeClassifier,0.77,0.75,0.75,0.77,2.33


## USE 2048 BITS FINGERPRINTS

In [29]:
df2 = pd.read_pickle(HDAC6/"HDAC6_2048B.csv")
df2 = df2.sample(frac=1).reset_index(drop=True)
df2

Unnamed: 0,molecule_chembl_id,fp_Morgan3,fp_MorganF,fp_MAP4,pChEMBL_HDAC6
0,CHEMBL4162826,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5976924, 11099288, 6095547, 7501117, 9204552,...",8.22
1,CHEMBL4217166,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[960441, 7687405, 1457796, 4020886, 3402370, 1...",6.72
2,CHEMBL2312168,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4828773, 16948577, 8033062, 52924930, 583588,...",7.30
3,CHEMBL4537466,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[4681989, 3821889, 137380, 6078715, 11250182, ...",8.01
4,CHEMBL253309,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[145521, 1787454, 2475943, 2937747, 3299882, 2...",6.64
...,...,...,...,...,...
2966,CHEMBL4078721,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11130522, 9724804, 1959173, 1535176, 3310692,...",5.55
2967,CHEMBL4593948,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5155069, 9016152, 1578989, 8623717, 23971717,...",5.44
2968,CHEMBL1812335,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[16425928, 17077509, 11223021, 13354224, 86723...",4.52
2969,CHEMBL3758451,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[30954026, 45888749, 8033062, 7467572, 1920889...",6.62


In [31]:
# Add column for activity
df2["activity"] = np.zeros(len(df2))

# Mark every molecule as active with an pIC50 of >= 6.6, 0 otherwise
df2.loc[df2[df2.pChEMBL_HDAC6 >= 6.6].index, "activity"] = 1.0

# NBVAL_CHECK_OUTPUT
print("Number of active compounds:", int(df2.activity.sum()))
print("Number of inactive compounds:", len(df2) - int(df2.activity.sum()))

Number of active compounds: 1837
Number of inactive compounds: 1134


In [32]:
#2048 bits Morgan fingerprints with radius of 3
X5 = np.array(list((df2['fp_Morgan3']))).astype(float)
#X.shape
Y5 = df2["activity"].values
#y.shape

In [33]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(X5, Y5, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train5.shape)
print("Shape of test data:", X_test5.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 2048)
Shape of test data: (595, 2048)


In [34]:
clf5 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models5, predictions5 = clf5.fit(X_train5, X_test5, Y_train5, Y_test5)

 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [01:55<00:21,  7.13s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [02:04<00:00,  4.29s/it]


In [35]:
predictions5.sort_values(by="F1 Score", ascending=False).head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.86,0.85,0.85,0.86,2.73
RandomForestClassifier,0.85,0.84,0.84,0.85,1.97
XGBClassifier,0.85,0.84,0.84,0.85,6.13
LGBMClassifier,0.85,0.84,0.84,0.85,2.86
SVC,0.84,0.82,0.82,0.84,17.9
NuSVC,0.84,0.82,0.82,0.84,17.77
BaggingClassifier,0.83,0.82,0.82,0.83,6.67
DecisionTreeClassifier,0.82,0.81,0.81,0.82,1.18
BernoulliNB,0.81,0.8,0.8,0.81,0.44
NearestCentroid,0.81,0.79,0.79,0.81,0.37


In [36]:
#2048 bits Morgan fingerprints with feautures
X6 = np.array(list((df2['fp_MorganF']))).astype(float)
#X.shape
Y6 = df2["activity"].values
#y.shape

In [37]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(X6, Y6, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train6.shape)
print("Shape of test data:", X_test6.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 2048)
Shape of test data: (595, 2048)


In [38]:
clf6 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models6, predictions6 = clf6.fit(X_train6, X_test6, Y_train6, Y_test6)

 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [01:51<00:18,  6.17s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [01:58<00:00,  4.09s/it]


In [39]:
predictions6.sort_values(by="F1 Score", ascending=False).head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.85,0.83,0.83,0.84,3.8
BaggingClassifier,0.84,0.83,0.83,0.84,5.25
RandomForestClassifier,0.84,0.82,0.82,0.84,1.95
LGBMClassifier,0.83,0.82,0.82,0.83,1.28
XGBClassifier,0.83,0.81,0.81,0.83,5.92
DecisionTreeClassifier,0.82,0.81,0.81,0.82,1.11
NuSVC,0.82,0.79,0.79,0.82,17.44
SVC,0.82,0.79,0.79,0.81,15.14
ExtraTreeClassifier,0.81,0.79,0.79,0.8,0.34
KNeighborsClassifier,0.79,0.75,0.75,0.78,6.71


In [40]:
#2048 bits MAP4 fingerprints
X7 = np.array(list((df2['fp_MAP4']))).astype(float)
#X.shape
Y7 = df2["activity"].values
#y.shape

In [41]:
# Split the data into training and test set
# Perform data splitting using 80/20 ratio
X_train7, X_test7, Y_train7, Y_test7 = train_test_split(X7, Y7, test_size=0.2, random_state=42)

# Print the shape of training and testing data
print("Shape of training data:", X_train7.shape)
print("Shape of test data:", X_test7.shape)
# NBVAL_CHECK_OUTPUT

Shape of training data: (2376, 2048)
Shape of test data: (595, 2048)


In [42]:
# Defines and builds the lazyclassifier
clf7 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models7, predictions7 = clf7.fit(X_train7, X_test7, Y_train7, Y_test7)

 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [02:45<00:20,  6.93s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [03:30<00:00,  7.26s/it]


In [43]:
predictions7.sort_values(by="F1 Score", ascending=False).head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.86,0.84,0.84,0.86,16.52
LGBMClassifier,0.86,0.84,0.84,0.85,28.59
RandomForestClassifier,0.84,0.82,0.82,0.84,6.8
ExtraTreesClassifier,0.82,0.79,0.79,0.82,2.49
NuSVC,0.8,0.78,0.78,0.8,16.67
BaggingClassifier,0.8,0.79,0.79,0.8,34.46
AdaBoostClassifier,0.79,0.77,0.77,0.79,21.12
SVC,0.79,0.75,0.75,0.78,16.13
KNeighborsClassifier,0.78,0.75,0.75,0.77,7.35
Perceptron,0.77,0.75,0.75,0.77,0.54


In [44]:
with pd.ExcelWriter(output/"LazyPredictResults.xlsx") as writer:
   
    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet
    predictions1.to_excel(writer, sheet_name="MACCS", )
    predictions2.to_excel(writer, sheet_name="Morgan 1024 Bits", )
    predictions3.to_excel(writer, sheet_name="MorganF 1024 Bits", )
    predictions4.to_excel(writer, sheet_name="MAP4 1024 Bits", )
    predictions5.to_excel(writer, sheet_name="Morgan 2048 Bits", )
    predictions6.to_excel(writer, sheet_name="MorganF 2048 Bits", )
    predictions7.to_excel(writer, sheet_name="MAP4 2048 Bits", )
    