# ML regression to predict the Efficacy of an active G9a inhibitor, using Dataset 1

### Content   <a name="content"></a>

1. [Data loading and observations](#1)
2. [Regression Machine Learning](#2)
3. [Cros-validation](#3)
4. [Feature importance methods for the Gradient Boost Regressor](#4)
5. [Comparison of the first fifteen features of the most important features for GBR](#5)
6. [Permutation feature importance methods for the Support Vector Regressor](#6)
7. [Comparison of the first fifteen features of the most important features for SVR](#7)
8. [Define the best result ](#8)
9. [Hyper parameter tuning of the new model](#9)
10. [Illustration of the reduced features model results](#10)

### Data loading and observations <a name="1"></a>

In [1]:
# pip install modin[ray] 
# pip install sidetable

In [2]:
import pandas as pd 

# # loading the dataset for the regression ML
df = pd.read_csv('data_Efficacy_no_solub.csv', index_col=[0])
# Avoid some columns to be truncated during df display
pd.set_option('display.max_columns', None)
# Display the data frame
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (27323, 60)


Unnamed: 0,Efficacy,MW,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity
0,163.334,435.5,117.0,22.8,31,2,5,7,1,17.8694,17.8694,20.082314,19.965107,10.8275,12.0275,19.912513,19.826117,0.195432,0.001865,17.534,19.3722,20.172632,20.28235,6.6305,8.0824,19.572353,19.845644,3.5023,5.649,20.488571,20.292375,32.824767,20.010407,20.190597,20.098098,0.41,0.45,0.04,0.02,0.09,0.0,0.0,0.0,63.43,5.79,7.35,7.36,16.08,0.0,0.0,0.0,21.650372,21.485712,2.644446,2.396838,5.006424,3.429315,1.893184,1.430767,20.052
1,109.521,500.6,172.0,20.8,35,3,8,11,1,14.0287,14.0287,20.075508,20.076284,14.256,15.456,19.977489,19.986298,0.195432,0.000699,10.2185,12.2236,20.629553,20.386651,6.0154,7.884,20.149252,20.288979,6.9525,8.3908,20.974558,20.550393,29.040476,20.170102,20.382104,20.119195,0.37,0.44,0.08,0.02,0.1,0.0,0.0,0.0,55.19,5.64,15.98,6.4,16.79,0.0,0.0,0.0,20.984056,20.907654,1.698723,1.550431,1.469759,1.456786,0.865214,0.939601,20.054
2,98.1573,449.4,151.0,22.0,31,4,9,6,1,6.5161,6.5161,19.840823,19.895281,18.75,21.15,20.088285,20.143002,12.507661,0.037176,11.0932,13.3081,20.335645,20.46717,6.4155,8.2934,19.551476,19.309144,2.4547,5.5169,19.860958,19.577412,162.225931,20.211769,21.251976,21.686218,0.42,0.31,0.07,0.02,0.11,0.0,0.0,0.07,50.78,3.14,10.68,7.13,15.58,0.0,0.0,12.68,20.347525,20.30809,1.729125,1.604662,4.519167,2.412242,2.613558,1.503272,20.049
3,146.98,512.5,199.0,20.8,36,5,9,9,1,20.5259,21.5663,19.950236,19.912669,14.1034,14.1034,20.128934,20.200037,0.083006,0.000517,11.56,13.5814,20.346999,20.432999,10.3972,12.0706,19.550438,19.458557,2.8515,5.5493,19.617349,19.727582,83.276709,20.082873,20.2232,20.225223,0.38,0.4,0.1,0.02,0.1,0.0,0.0,0.0,53.9,4.72,18.73,6.26,16.4,0.0,0.0,0.0,21.455387,21.529156,1.111838,1.125164,4.054007,2.447408,3.646221,2.175157,20.052
4,163.041,502.6,137.0,21.8,37,3,6,9,1,12.9638,12.9638,20.575062,20.532346,16.5215,17.7215,20.34646,20.176789,0.195432,0.003433,9.9702,11.1871,20.252543,20.277778,9.2767,10.4937,19.909155,20.111323,4.5327,6.8431,20.168788,20.31839,11.70753,20.00211,20.04038,20.026634,0.4,0.45,0.06,0.0,0.09,0.0,0.0,0.0,64.53,6.02,12.73,0.0,16.72,0.0,0.0,0.0,20.784662,20.731529,1.074757,1.066078,2.199616,1.6348,2.046617,1.533472,20.059


In [3]:
# Check for NaN
df.isnull().values.any()

False

In [4]:
df.describe(include="all")

Unnamed: 0,Efficacy,MW,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity
count,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0,27323.0
mean,130.817048,361.679559,91.025661,23.092483,25.147165,1.282729,5.153314,4.763569,1.0,10.771973,11.346082,20.026201,20.024996,9.286464,9.846514,19.975389,19.992414,6.539488,0.039406,11.322665,13.329239,20.004356,20.005516,5.604325,7.593329,19.998456,19.99914,2.952183,4.764932,20.001213,19.999929,62.95082,20.372044,20.506639,20.485864,0.415877,0.41953,0.065801,0.017378,0.07106,0.001467,0.00454,0.004039,60.492773,5.226612,12.45734,6.271464,11.807595,1.106745,1.704213,0.931106,21.381739,21.32153,2.268995,1.854026,102.199583,31.093978,50.353342,19.970465,20.056554
std,30.314449,80.14949,34.855096,1.318537,5.657259,0.983037,1.849339,2.270493,0.0,3.65343,3.55108,0.313678,0.288993,3.614025,3.493136,0.381484,0.331885,42.573505,0.259385,2.876677,2.81144,0.322899,0.287727,1.862739,1.775259,0.459767,0.331879,1.319908,1.439797,0.568947,0.363984,79.42518,1.359993,7.233649,7.090387,0.044337,0.060747,0.039261,0.020016,0.03614,0.006878,0.012809,0.015565,8.9191,1.388152,7.005158,6.772923,5.574678,4.920027,4.533172,3.344172,0.869477,0.702711,1.062784,0.608144,942.579899,445.365002,524.905055,314.857842,0.016953
min,31.1299,82.1,0.0,11.8,6.0,0.0,0.0,0.0,1.0,0.5834,1.3349,18.442165,18.824699,0.375,0.4125,18.482865,18.785181,0.0,0.0,0.975,3.8999,18.53478,18.885661,0.166,2.2976,17.726484,18.590198,0.0001,0.0003,16.402595,14.612511,1.161843e-07,20.000109,20.0,20.000002,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.152146,20.158514,0.537249,0.752498,0.596586,0.908398,0.210874,0.683794,20.0
25%,109.0725,304.765,67.6,22.3,21.0,1.0,4.0,3.0,1.0,8.0892,8.8328,19.82625,19.835632,6.7358,7.358,19.718901,19.765444,0.0,2e-06,9.26955,11.30575,19.802707,19.814318,4.2263,6.2529,19.708923,19.793464,2.25595,4.03645,19.716906,19.827044,27.13871,20.050375,20.042369,20.041538,0.39,0.38,0.04,0.0,0.05,0.0,0.0,0.0,54.76,4.27,7.65,0.0,8.04,0.0,0.0,0.0,20.818771,20.831248,1.512077,1.390404,2.85622,2.201813,1.433364,1.300203,20.048
50%,129.606,358.9,88.0,23.1,25.0,1.0,5.0,5.0,1.0,10.3936,11.1723,20.013756,20.012651,9.0675,9.6415,19.980873,19.994495,0.082433,0.00019,11.1279,13.1158,19.999997,20.003445,5.5171,7.4675,20.000045,20.00004,2.7934,4.7386,20.000771,20.000487,42.61515,20.125092,20.10596,20.104442,0.42,0.42,0.06,0.02,0.07,0.0,0.0,0.0,60.83,5.15,12.26,6.92,11.37,0.0,0.0,0.0,21.163663,21.160411,1.997055,1.728821,3.913496,2.782357,1.898136,1.564353,20.054
75%,149.6405,416.55,113.0,23.9,29.0,2.0,6.0,6.0,1.0,12.9906,13.6871,20.22281,20.211088,11.74775,12.009,20.228463,20.223174,0.547099,0.003918,13.1583,15.1294,20.20457,20.197513,6.8429,8.75935,20.289855,20.206206,3.73675,5.6369,20.284822,20.171754,71.09673,20.318068,20.265494,20.262508,0.44,0.46,0.09,0.03,0.09,0.0,0.0,0.0,66.65,6.11,16.88,10.15,14.83,0.0,0.0,0.0,21.711988,21.637268,2.764323,2.194057,5.507451,3.569186,2.588613,1.933767,20.061
max,299.718,821.9,340.0,30.9,50.0,13.0,18.0,15.0,1.0,31.938,31.938,21.530227,21.224436,31.358,31.529,21.633274,21.483559,2321.338098,13.151876,26.3806,27.6703,21.572993,21.059648,13.3793,15.479,22.127669,22.122627,9.6569,11.1469,23.854187,25.467601,3526.947,149.800141,779.730455,797.263945,0.6,0.66,0.33,0.23,0.54,0.12,0.25,0.35,90.85,13.19,54.5,53.95,57.5,60.09,46.97,63.94,45.304933,42.516606,38.598795,7.721405,32177.666667,24400.0,24163.0,18692.333333,20.678


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27323 entries, 0 to 164
Data columns (total 60 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Efficacy            27323 non-null  float64
 1   MW                  27323 non-null  float64
 2   TPSA                27323 non-null  float64
 3   XL                  27323 non-null  float64
 4   HAC                 27323 non-null  int64  
 5   HBDC                27323 non-null  int64  
 6   HBAC                27323 non-null  int64  
 7   RBC                 27323 non-null  int64  
 8   CBUC                27323 non-null  int64  
 9   MMX6                27323 non-null  float64
 10  MMX                 27323 non-null  float64
 11  SX6                 27323 non-null  float64
 12  SX                  27323 non-null  float64
 13  MMY6                27323 non-null  float64
 14  MMY                 27323 non-null  float64
 15  SY6                 27323 non-null  float64
 16  SY         

[<a href="#content">Back to top</a>]

### Regression Machine Learning <a name="2"></a>

In [6]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Separate the training columns from the target column 'Efficacy'
X = df.drop(['Efficacy'], axis=1) 
y = df['Efficacy'] 

# Split the data set into train and test parts 
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=5) 
# # Standardise the data points
sc = StandardScaler()
X_train = sc.fit_transform(X_train_unscaled)
X_test = sc.transform(X_test_unscaled)

# Print the shape of each part
print("Shapes:")
print("X_train: ", X_train.shape)
print("X_test:  ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test:  ", y_test.shape)

Shapes:
X_train:  (21858, 59)
X_test:   (5465, 59)
y_train:  (21858,)
y_test:   (5465,)


In [7]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate the algorithms that will be used, placing them in a dictionary 
regs = {"SVR":SVR(kernel='linear'),
        "DecisionTree":DecisionTreeRegressor(), 
        "RandomForest":RandomForestRegressor(), 
        "GradientBoost":GradientBoostingRegressor(),}

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

# Create statistics with the results of training with different algorithms
def model_fit(regs):
    fitted_model={}
    model_result = pd.DataFrame()
    for model_name, model in regs.items():
        model.fit(X_train,y_train)
        fitted_model.update({model_name:model})
        model_dict = {}
        model_dict['1.Algorithm'] = model_name
        model_dict['2.RMSE_Train'] = round(np.sqrt (mean_squared_error(y_train, model.predict(X_train))),2)
        model_dict['3.RMSE_Test'] = round( np.sqrt (mean_squared_error(y_test, model.predict(X_test))),2)
        model_dict['4.MAE_Train'] = round(mean_absolute_error(y_train, model.predict(X_train)),2)
        model_dict['5.MAE_Test'] = round(mean_absolute_error(y_test, model.predict(X_test)),2)
        model_result = model_result._append(model_dict,ignore_index=True)
    return fitted_model, model_result

fitted_model, model_result = model_fit(regs)
model_result.sort_values(by=['5.MAE_Test'],ascending=True)

Unnamed: 0,1.Algorithm,2.RMSE_Train,3.RMSE_Test,4.MAE_Train,5.MAE_Test
2,RandomForest,10.97,29.68,8.63,23.4
3,GradientBoost,28.83,29.91,22.85,23.61
0,SVR,29.83,30.19,23.53,23.87
1,DecisionTree,0.0,42.89,0.0,34.0


[<a href="#content">Back to top</a>]

### Cross-validation <a name="3"></a>

In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Create statistics with the results of cross-validation
def model_CV(regs):
    fitted_model={}
    model_cv_result = pd.DataFrame()
    for model_name, model in regs.items():
        fitted_model.update({model_name:model})
        scores = cross_val_score(model, X, y, cv=5,
                        scoring=('neg_mean_absolute_error'))
        scores = -scores
        model_dict = {}
        model_dict['1.Algorithm'] = model_name
        model_dict['2.CV_MAE'] = round(np.mean(scores), 2)
        model_dict['3.Sta Dev MAE'] = round(np.std(scores), 2)
        model_dict['4.List of MAE'] = np.round(scores, 2)
        model_cv_result = model_cv_result._append(model_dict,ignore_index=True)
    return fitted_model, model_cv_result

fitted_model, model_cv_result = model_CV(regs)
model_cv_result.sort_values(by=['2.CV_MAE'],ascending= True)

Unnamed: 0,1.Algorithm,2.CV_MAE,3.Sta Dev MAE,4.List of MAE
2,RandomForest,23.68,1.0,"[25.46, 23.84, 22.46, 23.4, 23.21]"
3,GradientBoost,23.7,1.07,"[25.68, 23.79, 22.51, 23.37, 23.14]"
0,SVR,23.86,0.94,"[25.5, 24.16, 22.7, 23.56, 23.38]"
1,DecisionTree,33.83,0.66,"[34.36, 34.69, 32.78, 33.71, 33.59]"


[<a href="#content">Back to top</a>]