# ML regression to predict the Efficacy of an active G9a inhibitor, using Dataset 2

### Content   <a name="content"></a>

1. [Data loading and observations](#1)
2. [Regression Machine Learning](#2)
3. [Cros-validation](#3)
4. [Feature importance methods for the Gradient Boost Regressor](#4)
5. [Comparison of the first fifteen features of the most important features for GBR](#5)
6. [Permutation feature importance methods for the Support Vector Regressor](#6)
7. [Comparison of the first fifteen features of the most important features for SVR](#7)
8. [Define the best result ](#8)
9. [Hyper parameter tuning of the new model](#9)
10. [Illustration of the reduced features model results](#10)

### Data loading and observations <a name="1"></a>

In [1]:
# pip install modin[ray] 
# pip install sidetable

In [2]:
import pandas as pd 

# # loading the dataset for the regression ML
df = pd.read_csv('data_Efficacy_with_solub_and_no_imputation.csv', index_col=[0])
# Avoid some columns to be truncated during df display
pd.set_option('display.max_columns', None)
# Display the data frame
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (3910, 61)


Unnamed: 0,Efficacy,MW,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity,Solubility_at_pH_7_4
0,72.9655,278.29,125.0,22.2,19,1,6,2,1,6.6342,8.9727,20.089578,19.643204,10.0201,10.0201,19.512813,19.724724,10.703674,0.3062773,10.4533,13.4223,20.485431,20.053612,2.9936,5.1742,20.513676,20.355558,1.165,3.0878,17.575617,18.545417,154.967504,22.341853,20.810366,20.061954,0.38,0.34,0.1,0.03,0.14,0.0,0.0,0.0,47.48,3.62,17.25,11.52,20.13,0.0,0.0,0.0,20.662089,20.89547,3.491883,2.594082,8.97279,4.346881,2.569614,1.675691,20.065,0.1
1,147.317,358.4,143.0,23.4,24,3,7,5,1,7.0834,7.2974,19.929298,20.0284,12.5835,12.5835,20.114455,20.268933,6e-06,2.241061e-07,11.7024,13.8931,20.065879,20.130764,6.9136,8.9136,19.069242,19.53302,2.4349,4.3407,19.78975,19.952041,69.723704,20.040203,20.078547,20.012436,0.42,0.37,0.05,0.05,0.11,0.0,0.0,0.0,53.62,3.94,8.93,17.89,15.63,0.0,0.0,0.0,20.562912,20.579918,1.692664,1.558641,4.806111,3.200659,2.839377,2.053494,20.054,13.7
2,174.885,293.28,95.1,23.1,22,2,6,3,1,6.4686,6.4686,20.643863,20.698222,11.2534,11.8613,20.31901,20.36589,0.003303,5.384583e-06,10.1926,12.9043,20.323032,20.32477,4.9932,7.6809,20.171935,19.992688,0.698,1.1042,19.684257,19.74633,8.764451,21.172396,20.104192,20.209574,0.48,0.33,0.09,0.0,0.09,0.0,0.0,0.0,65.53,3.78,16.37,0.0,14.33,0.0,0.0,0.0,20.574813,20.545353,2.041296,1.680051,14.602579,11.68656,7.153582,6.956077,20.05,24.7
3,189.224,324.4,70.9,23.8,24,2,4,6,1,15.3302,15.3302,19.936493,19.962066,7.2036,7.2036,19.776341,19.850233,0.0,3.739519e-09,12.6857,14.2391,19.675916,19.69456,7.6102,8.9715,19.346611,19.397476,2.7585,4.488,20.219574,20.105079,13.375666,20.014993,20.059015,20.040756,0.43,0.45,0.07,0.0,0.05,0.0,0.0,0.0,70.35,6.21,14.8,0.0,8.64,0.0,0.0,0.0,22.12813,22.12813,1.666934,1.587148,4.598767,3.172705,2.758818,1.998997,20.059,1.0
4,166.868,410.5,124.0,24.4,29,3,7,7,1,16.8859,16.8859,20.085001,19.999575,9.623,11.9205,20.105757,20.021858,9.625909,0.05822748,13.1863,15.5571,19.995282,20.086889,7.4717,9.5477,21.094927,20.563269,3.4022,5.2265,19.421458,19.497382,80.618934,20.802356,20.111462,20.517832,0.41,0.43,0.06,0.02,0.08,0.0,0.0,0.0,61.45,5.4,11.69,7.81,13.65,0.0,0.0,0.0,21.754744,21.416543,1.764833,1.629408,3.875816,2.976581,2.196138,1.826787,20.054,9.7


In [3]:
# Check for NaN
df.isnull().values.any()

False

In [4]:
df.describe(include="all")

Unnamed: 0,Efficacy,MW,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity,Solubility_at_pH_7_4
count,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0,3910.0
mean,132.946193,299.792608,79.740605,22.74179,20.890281,1.320972,4.269821,3.653197,1.0,9.59317,10.283733,20.025411,20.023448,8.156124,8.761234,19.947922,19.975281,8.839811,0.05250568,10.216495,12.302156,20.009058,20.016798,4.623222,6.682458,20.001535,20.000059,2.295846,4.036754,19.993442,19.992171,66.39251,20.484385,20.860596,20.760633,0.414731,0.416297,0.059148,0.018338,0.08033,0.001184,0.005192,0.004148,60.253862,5.195637,11.098957,6.569281,13.241939,0.877284,1.908578,0.854535,21.405104,21.341317,2.466931,1.933868,156.840848,60.109337,69.194167,33.528364,20.055876,24.391445
std,29.298979,46.451019,31.028478,1.138304,3.468011,0.910441,1.545286,1.738444,0.0,2.914391,2.756242,0.303383,0.268621,2.8851,2.709377,0.365838,0.298546,52.01384,0.2906268,2.447613,2.373278,0.324318,0.259735,1.4727,1.406965,0.442283,0.292805,1.10447,1.323772,0.584236,0.408079,84.79359,1.675376,11.40286,11.319885,0.045315,0.067075,0.042052,0.021937,0.04077,0.006493,0.014833,0.01669,9.475151,1.503883,7.415527,7.322516,6.119205,4.659358,5.127845,3.194685,0.929411,0.69567,1.111839,0.600992,979.422684,571.645953,468.12314,339.487127,0.015479,17.2318
min,62.0004,97.07,3.2,16.3,7.0,0.0,1.0,0.0,1.0,1.9635,2.6052,18.787935,19.122281,0.75,1.95,18.691621,18.928861,0.0,2.2600949999999998e-38,2.1454,4.5893,18.53478,19.082458,0.3947,3.0249,17.909794,18.886292,0.0003,0.0005,16.89794,14.612511,3.807599e-07,20.000129,20.0,20.000007,0.17,0.14,0.0,0.0,0.0,0.0,0.0,0.0,20.83,0.89,0.0,0.0,0.0,0.0,0.0,0.0,20.158514,20.158514,0.576067,0.798337,1.002917,0.908398,0.260297,0.683794,20.011,0.1
25%,112.5415,271.29,58.9,22.0,19.0,1.0,3.0,2.0,1.0,7.540725,8.485,19.833757,19.85588,5.9884,6.739075,19.717026,19.775284,1.5798e-18,3.813652e-06,8.524675,10.6033,19.81516,19.853884,3.483325,5.5954,19.73787,19.822481,1.585225,3.311075,19.733393,19.82604,28.30925,20.061518,20.040245,20.042456,0.38,0.37,0.03,0.0,0.05,0.0,0.0,0.0,53.98,4.12,5.4025,0.0,8.91,0.0,0.0,0.0,20.822175,20.842512,1.668023,1.477675,3.207464,2.371279,1.468679,1.321189,20.046,6.3
50%,131.5865,303.4,78.4,22.8,21.0,1.0,4.0,3.0,1.0,9.1373,10.1128,20.004489,20.004154,7.9037,8.7071,19.956295,19.975617,0.08338993,0.0003778434,10.08945,12.18165,20.003206,20.018748,4.53375,6.56305,20.000197,20.001907,2.41755,4.2659,19.998384,19.99791,46.23445,20.162119,20.110584,20.112234,0.42,0.42,0.06,0.02,0.07,0.0,0.0,0.0,60.59,5.08,10.65,8.47,12.72,0.0,0.0,0.0,21.154956,21.153548,2.244387,1.843721,4.372811,2.975742,1.978404,1.602092,20.053,25.8
75%,151.59975,329.5,98.475,23.5,23.0,2.0,5.0,5.0,1.0,11.4544,12.032875,20.199033,20.181619,9.91575,10.808025,20.184689,20.186429,1.047315,0.008012407,11.75215,13.800825,20.199994,20.189645,5.590175,7.575375,20.268822,20.181078,2.899975,4.83905,20.258312,20.151851,77.03677,20.413366,20.300765,20.291505,0.44,0.46,0.08,0.03,0.1,0.0,0.0,0.0,66.8775,6.14,15.585,11.04,16.86,0.0,0.0,0.0,21.735105,21.68676,3.032039,2.28489,6.767283,3.991879,2.929672,2.052029,20.062,39.775
max,289.972,575.7,206.0,26.0,42.0,7.0,12.0,12.0,1.0,22.8673,22.8673,21.221301,21.064085,18.6015,18.6015,21.471536,21.017549,1563.001,10.43078,18.3838,21.153,21.360924,20.889557,11.3365,12.8395,21.920991,21.247962,6.9126,8.5461,23.854187,23.894075,2783.715,91.047919,608.567017,628.859044,0.56,0.61,0.25,0.2,0.33,0.1,0.21,0.25,86.05,10.74,39.11,53.95,51.83,47.14,46.97,36.94,45.304933,26.399333,14.72266,4.970768,16189.25,16913.6,9390.0,9837.2,20.173,68.8


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3910 entries, 0 to 3909
Data columns (total 61 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Efficacy              3910 non-null   float64
 1   MW                    3910 non-null   float64
 2   TPSA                  3910 non-null   float64
 3   XL                    3910 non-null   float64
 4   HAC                   3910 non-null   int64  
 5   HBDC                  3910 non-null   int64  
 6   HBAC                  3910 non-null   int64  
 7   RBC                   3910 non-null   int64  
 8   CBUC                  3910 non-null   int64  
 9   MMX6                  3910 non-null   float64
 10  MMX                   3910 non-null   float64
 11  SX6                   3910 non-null   float64
 12  SX                    3910 non-null   float64
 13  MMY6                  3910 non-null   float64
 14  MMY                   3910 non-null   float64
 15  SY6                   3910

[<a href="#content">Back to top</a>]

### Regression Machine Learning <a name="2"></a>

In [6]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Separate the training columns from the target column 'Fit_HillSlope'
X = df.drop(['Efficacy'], axis=1) 
y = df['Efficacy'] 

# Split the data set into train and test parts 
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=5) 
# # Standardise the data points
sc = StandardScaler()
X_train = sc.fit_transform(X_train_unscaled)
X_test = sc.transform(X_test_unscaled)

# Print the shape of each part
print("Shapes:")
print("X_train: ", X_train.shape)
print("X_test:  ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test:  ", y_test.shape)

Shapes:
X_train:  (3128, 60)
X_test:   (782, 60)
y_train:  (3128,)
y_test:   (782,)


In [7]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate the algorithms that will be used, placing them in a dictionary 
regs = {"SVR":SVR(kernel='linear'),
        "DecisionTree":DecisionTreeRegressor(), 
        "RandomForest":RandomForestRegressor(), 
        "GradientBoost":GradientBoostingRegressor(),}

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

# Create statistics with the results of training with different algorithms
def model_fit(regs):
    fitted_model={}
    model_result = pd.DataFrame()
    for model_name, model in regs.items():
        model.fit(X_train,y_train)
        fitted_model.update({model_name:model})
        model_dict = {}
        model_dict['1.Algorithm'] = model_name
        model_dict['2.RMSE_Train'] = round(np.sqrt (mean_squared_error(y_train, model.predict(X_train))),2)
        model_dict['3.RMSE_Test'] = round( np.sqrt (mean_squared_error(y_test, model.predict(X_test))),2)
        model_dict['4.MAE_Train'] = round(mean_absolute_error(y_train, model.predict(X_train)),2)
        model_dict['5.MAE_Test'] = round(mean_absolute_error(y_test, model.predict(X_test)),2)
        model_result = model_result._append(model_dict,ignore_index=True)
    return fitted_model, model_result

fitted_model, model_result = model_fit(regs)
model_result.sort_values(by=['5.MAE_Test'],ascending=True)

Unnamed: 0,1.Algorithm,2.RMSE_Train,3.RMSE_Test,4.MAE_Train,5.MAE_Test
2,RandomForest,10.77,29.21,8.37,22.66
3,GradientBoost,24.61,29.43,19.52,23.03
0,SVR,28.56,29.64,22.22,23.16
1,DecisionTree,0.0,40.21,0.0,31.41


[<a href="#content">Back to top</a>]

### Cross-validation <a name="3"></a>

In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Create statistics with the results of cross-validation
def model_CV(regs):
    fitted_model={}
    model_cv_result = pd.DataFrame()
    for model_name, model in regs.items():
        fitted_model.update({model_name:model})
        scores = cross_val_score(model, X, y, cv=5,
                        scoring=('neg_mean_absolute_error'))
        scores = -scores
        model_dict = {}
        model_dict['1.Algorithm'] = model_name
        model_dict['2.CV_MAE'] = round(np.mean(scores), 2)
        model_dict['3.Sta Dev MAE'] = round(np.std(scores), 2)
        model_dict['4.List of MAE'] = np.round(scores, 2)
        model_cv_result = model_cv_result._append(model_dict,ignore_index=True)
    return fitted_model, model_cv_result

fitted_model, model_cv_result = model_CV(regs)
model_cv_result.sort_values(by=['2.CV_MAE'],ascending= True)

Unnamed: 0,1.Algorithm,2.CV_MAE,3.Sta Dev MAE,4.List of MAE
2,RandomForest,22.89,0.69,"[23.34, 22.88, 21.98, 22.35, 23.92]"
3,GradientBoost,22.93,0.7,"[23.26, 22.92, 22.04, 22.41, 24.05]"
0,SVR,23.18,0.81,"[23.74, 22.84, 22.14, 22.76, 24.44]"
1,DecisionTree,32.98,0.97,"[33.81, 33.13, 31.1, 33.53, 33.33]"


[<a href="#content">Back to top</a>]