## Comparing regressors
In this part, we compare several ML algorithms for building regression models of acetylcholinesterase inhibitors.

In [1]:
# Optional one-time setup: uncomment the line below to install lazypredict.
# NOTE(review): inside a notebook, `%pip install` targets the running kernel's environment;
# consider using it instead of `!pip`, with a pinned version.
#!pip install lazypredict

In [2]:
import lazypredict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lazypredict.Supervised import LazyRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

#### Loading and pre-processing the data

In [3]:
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
csv_path = '/home/alejandrodf1/Documents/computational_drug_discovery_project/data/processed/acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv'
df = pd.read_csv(csv_path)

# Feature matrix: every column except the pIC50 target.
X = df.drop(columns='pIC50')
# Target vector: the pIC50 column.
Y = df['pIC50']

In [4]:
# Remove low-variance features before modelling.
# A binary (0/1) feature has variance p * (1 - p), so this threshold drops features that take
# the same value in more than 80% of the samples (assumes the fingerprint columns are
# binary - TODO confirm).
VARIANCE_THRESHOLD = 0.8 * (1 - 0.8)

print(f'Before feature removing: {X.shape}')  # dimensions before feature removal

# NOTE(review): the selector is fit on the full dataset, i.e. before the train/test split
# performed later in the notebook; fit it on the training split only if strict separation
# of the test data is required.
selection = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
selection.fit(X)

# Select columns with the fitted boolean mask rather than using fit_transform's ndarray,
# so X stays a DataFrame and the names of the surviving features are kept.
X = X.loc[:, selection.get_support()]
print(f'After feature removing: {X.shape}')  # dimensions after feature removal


Before feature removing: (5798, 881)
After feature removing: (5798, 144)


#### Splitting data (80/20)

In [5]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

## Comparing ML algorithms

In [6]:
# Define the lazy regressor; each fit() call trains and scores its whole collection of models.
clf = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)

# Training-set performance: fit on the training split and score on that same split.
models_train, prediction_trains = clf.fit(X_train, X_train, Y_train, Y_train)

# Test-set performance: fit on the training split, score on the held-out split.
# Fitting on X_test/Y_test instead would report in-sample scores of models trained on the
# test data, which says nothing about generalization.
models_test, prediction_test = clf.fit(X_train, X_test, Y_train, Y_test)


 36%|███▌      | 15/42 [01:25<02:44,  6.10s/it]

In [None]:
# Performance table of the training set: the second value returned by the clf.fit call
# that was given the training split as both the fitting and the scoring data.
prediction_trains


In [None]:
# Performance table of the testing set: the second value returned by the clf.fit call
# that produced prediction_test.
prediction_test


#### Data visualization of model performance:
- Bar plot of R-Squared 
- Bar plot of RMSE
- Calculation time

In [None]:
# Bar plot of R-Squared per model, taken from the training-set performance table.
# Apply the theme before creating the axes so the whitegrid style is picked up.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(5, 10))

# The column key must match the table's 'R-Squared' spelling exactly (capital S).
sns.barplot(y=prediction_trains.index, x='R-Squared', data=prediction_trains, ax=ax)

# The axis is limited to [0, 1]; models with a negative R-Squared show no visible bar.
ax.set(xlim=(0, 1), xlabel='R-Squared', ylabel='Model',
       title='R-Squared per model (training set)');

In [None]:
# Bar plot of RMSE per model, taken from the training-set performance table.
# Apply the theme before creating the axes so the whitegrid style is picked up.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(5, 10))
sns.barplot(y=prediction_trains.index, x='RMSE', data=prediction_trains, ax=ax)

# RMSE is expressed in the target's units and has no upper bound of 1, so only anchor the
# axis at 0 and let the upper limit autoscale; a fixed (0, 1) range would clip larger errors.
ax.set_xlim(left=0)
ax.set(xlabel='RMSE', ylabel='Model', title='RMSE per model (training set)');

In [None]:
# Bar plot of the time taken per model, taken from the training-set performance table.
# Apply the theme before creating the axes so the whitegrid style is picked up.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(5, 10))
sns.barplot(y=prediction_trains.index, x='Time Taken', data=prediction_trains, ax=ax)

# Fitting times are not bounded by 1 (the progress output above averages several seconds per
# model), so only anchor the axis at 0 and let the upper limit autoscale.
ax.set_xlim(left=0)
# Units are presumably seconds - TODO confirm against lazypredict.
ax.set(xlabel='Time Taken', ylabel='Model', title='Time taken per model (training set)');