## Import packages

In [None]:
#!pip3 install numpy==1.20.2
#!pip3 install pandas==1.2.4
#!pip3 install pillow==8.1.2
#!pip3 install scipy==1.6.3
#!pip3 install rdkit==2022.3.4
#!pip3 install scikit-learn==0.24.2
#!pip3 install matplotlib==3.4.1
#!pip3 install seaborn==0.11.1
#!pip3 install shap==0.41.0

In [2]:
import multiprocessing
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Show how many logical CPUs are available; the n_jobs=-1 fits further down use all of them.
multiprocessing.cpu_count()

12

## Define functions to prepare data, train/test the ML models and evaluate them

In [4]:
def prepareDataFrame(df, categorical_features):
    """One-hot encode the categorical columns and split features from target.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It must contain the target column ``'ba'`` and every
        column named in ``categorical_features``. The caller's frame is not
        modified; all work happens on new frames.
    categorical_features : iterable of str
        Columns to replace by dummy (one-hot) columns. May be empty.

    Returns
    -------
    X : pandas.DataFrame
        All columns except ``'ba'``, with the categorical columns replaced by
        their dummy columns.
    Y : numpy.ndarray
        One-dimensional array holding the values of the ``'ba'`` column.

    Raises
    ------
    KeyError
        If ``'ba'`` or one of ``categorical_features`` is not a column of ``df``.
    """
    for ft in categorical_features:
        # NOTE(review): every feature shares the prefix 'cat', so two features
        # with a common level would yield identically named dummy columns.
        # Kept as-is so that existing column names do not change.
        cat_ft = pd.get_dummies(df[ft], prefix='cat')
        df = pd.concat([df, cat_ft], axis=1)
        # Use the keyword form: passing the axis positionally (``df.drop(ft, 1)``)
        # was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
        df = df.drop(columns=ft)

    print("Dataframe after dummy variables:")
    print(df.shape)

    # Flatten the single-column target frame into a 1-D array.
    Y = df[['ba']].values.ravel()
    X = df.drop(columns='ba')

    return X, Y

In [7]:
def model_selection_and_tuning(model_type, model_name):
    """Tune one regressor family on one training split and score it with nested CV.

    Hyper-parameters are selected by an exhaustive grid search over a 3-fold
    inner cross-validation (R2 scoring). The whole search is then evaluated
    with a 5-fold outer cross-validation. Both splitters shuffle with fixed
    seeds.

    Parameters
    ----------
    model_type : str
        Key of the training split, e.g. ``'dual_model_pl_split_random'``;
        it selects the CSV file read from ``work/training/dual-model/``.
    model_name : str
        One of ``'RandomForest'``, ``'DecisionTree'``, ``'LassoRegression'``
        or ``'RidgeRegression'``.

    Returns
    -------
    tuple
        ``(score, best_params)``. ``score`` is the mean of the outer-fold
        scores, each rounded to two decimals before averaging.
        ``best_params`` is a dict with the best parameter combination found
        by the grid search fitted on the full training set, keyed by the
        plain regressor parameter names.

    Raises
    ------
    ValueError
        If ``model_type`` or ``model_name`` is not recognised. Both are
        checked before any file is read or any model is fitted.
    """
    training_files = {
        "dual_model_pl_split_random": "work/training/dual-model/train-random.csv",
        "dual_model_pl_split_protein": "work/training/dual-model/train-protein.csv",
        "dual_model_pl_split_pocket": "work/training/dual-model/train-pocket.csv",
        "dual_model_pl_split_ligand_weight": "work/training/dual-model/train-ligand-weight.csv",
        "dual_model_pl_split_ligand_diversity": "work/training/dual-model/train-ligand-diversity.csv",
        "dual_model_pl_split_ligand_tpsa": "work/training/dual-model/train-ligand-tpsa.csv",
        "dual_model_pl_split_ligand_volume": "work/training/dual-model/train-ligand-volume.csv",
    }
    supported_models = ('RandomForest', 'DecisionTree', 'LassoRegression', 'RidgeRegression')

    # Fail fast with a clear message instead of an UnboundLocalError further down.
    if model_type not in training_files:
        raise ValueError(
            "Unknown model_type %r; expected one of %s" % (model_type, sorted(training_files))
        )
    if model_name not in supported_models:
        raise ValueError(
            "Unknown model_name %r; expected one of %s" % (model_name, list(supported_models))
        )

    train = pd.read_csv(training_files[model_type], sep=",", encoding="utf-8")
    print(train.shape)

    # No categorical columns are one-hot encoded for these datasets.
    categorical_features = []
    train_X, train_Y = prepareDataFrame(train, categorical_features)
    train_X = np.array(train_X)

    inner_cv = KFold(n_splits=3, shuffle=True, random_state=456)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=789)

    # Only the linear models are sensitive to feature scale.
    needs_scaling = False

    if model_name == 'RandomForest':
        # NOTE(review): 'auto' is accepted by the pinned scikit-learn 0.24.2 but
        # was deprecated in 1.1 and removed in 1.3; kept so that reported
        # best_params stay comparable with earlier runs.
        param_grid = {
            'max_features': ['auto', 'sqrt', 'log2', round(train_X.shape[1] / 2)],
            'n_estimators': [200, 300, 400, 500],
            'min_samples_leaf': [1, 2, 5, 10],
            'min_samples_split': [2, 5, 10],
        }
        # The default criterion is mean squared error, i.e. what 'mse' selected;
        # relying on the default avoids the spelling removed in scikit-learn 1.2.
        regressor = RandomForestRegressor(n_jobs=-1)

    elif model_name == 'DecisionTree':
        param_grid = {
            'max_features': ['auto', 'sqrt', 'log2', round(train_X.shape[1] / 2)],
            'max_depth': [None, 2, 5, 10],
            'min_samples_leaf': [1, 2, 5, 10],
            'min_samples_split': [2, 5, 10],
        }
        regressor = DecisionTreeRegressor()

    elif model_name == 'LassoRegression':
        # 0.04 (not 0.4): the grid is monotonically increasing and mirrors the
        # Ridge grid; the original 0.4 between 0.02 and 0.06 was a typo.
        param_grid = {'alpha': [0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.2, 0.5, 1.0]}
        regressor = Lasso(max_iter=10000)
        needs_scaling = True

    else:  # 'RidgeRegression' (model_name was validated above)
        param_grid = {'alpha': [0.001, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.2, 0.5, 1.0, 2.0, 4.0, 10.0, 20.0]}
        regressor = Ridge()
        needs_scaling = True

    step_prefix = 'regressor__'
    if needs_scaling:
        # Scale inside the pipeline so the scaler is re-fitted on the training
        # part of every CV fold. Fitting it once on the full matrix leaks
        # validation-fold statistics into training.
        estimator = Pipeline([('scaler', StandardScaler()), ('regressor', regressor)])
        param_grid = {step_prefix + name: values for name, values in param_grid.items()}
    else:
        estimator = regressor

    tuned_model = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, verbose=3, cv=inner_cv, scoring='r2')

    # Fit on the full training set to obtain the reported best parameters.
    tuned_model.fit(train_X, train_Y)

    # Nested CV: cross_val_score clones the search and re-runs it per outer fold.
    tuned_outer_cv = cross_val_score(tuned_model, train_X, train_Y, cv=outer_cv)
    tuned_outer_cv = np.round(tuned_outer_cv, 2)

    # Report plain parameter names regardless of whether a pipeline was used.
    best_params = {
        (name[len(step_prefix):] if name.startswith(step_prefix) else name): value
        for name, value in tuned_model.best_params_.items()
    }

    return tuned_outer_cv.mean(), best_params

## Model selection and tuning

In [8]:
# Start from an empty frame; later cells add a 'dataset' column and one score column per model.
tuning_metrics_df = pd.DataFrame()

In [9]:
# Sanity check: the frame is still empty at this point.
tuning_metrics_df.shape

(0, 0)

In [10]:
# Dataset splits to tune on; each key must be one understood by model_selection_and_tuning.
# NOTE(review): 'dual_model_pl_split_ligand_tpsa' is handled by that function but is
# not listed here -- confirm whether the omission is intentional.
model_types = ['dual_model_pl_split_random',
              'dual_model_pl_split_pocket',
              'dual_model_pl_split_protein',
              'dual_model_pl_split_ligand_diversity',
              'dual_model_pl_split_ligand_weight',
              'dual_model_pl_split_ligand_volume']

# One row per dataset split; the per-model score columns are added by the tuning loop.
tuning_metrics_df['dataset'] = model_types

In [11]:
# Sanity check: one row per dataset split and only the 'dataset' column so far.
tuning_metrics_df.shape

(6, 1)

In [12]:
# Tune every model family on every dataset split and persist the results.
# Each grid search is expensive, so results are written to disk after every
# step; an interrupted run then still leaves usable partial files behind.
tuning_dir = 'work/tuning/'

# Create the output directory up front: otherwise the first to_csv call would
# fail only after a complete grid search has already been paid for.
os.makedirs(tuning_dir, exist_ok=True)

for model_name in ['RandomForest', 'DecisionTree', 'LassoRegression', 'RidgeRegression']:

    tuned_outer_cv_arr = []
    model_type_best_params_df = pd.DataFrame()
    best_params_path = tuning_dir + model_name + '_best_params_tuning.csv'

    for model_type in model_types:

        tuned_outer_cv, tuned_model_best_params = model_selection_and_tuning(model_type, model_name)
        tuned_outer_cv_arr.append(tuned_outer_cv)

        # One row per hyper-parameter, one column per dataset split.
        model_type_best_params_df['param'] = pd.Series(list(tuned_model_best_params.keys()))
        model_type_best_params_df[model_type] = pd.Series(list(tuned_model_best_params.values()))

        # Snapshot after every split; the last snapshot is the final file.
        model_type_best_params_df.to_csv(best_params_path, encoding='utf-8', index=False)

    # Scores are appended in model_types order, matching the 'dataset' column by position.
    tuning_metrics_df[model_name] = pd.Series(tuned_outer_cv_arr)

    # Snapshot after every model family; the last snapshot is the final file.
    tuning_metrics_df.to_csv(tuning_dir + 'tuning_metrics_df.csv', encoding='utf-8', index=False)

(6738, 126)
Dataframe after dummy variables:
(6738, 126)
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
(6969, 126)
Dataframe after dummy variables:
(6969, 126)
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 folds for each of 192 candidates, totalling 576 fits
(6731, 126)
Dataframe after dummy variables:
(6731, 126)
Fitting 3 folds for each of 192 candidates, totalling 576 fits
Fitting 3 

Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
(6731, 126)
Dataframe after dummy variables:
(6731, 126)
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
(6737, 126)
Dataframe after dummy variables:
(6737, 126)
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 c