# ML regression to predict the Efficacy of an active G9a inhibitor

### Content   <a name="content"></a>

1. [Load data](#1)
2. [Regression Machine Learning](#2)
3. [Cross-validation](#3)
4. [Calculate the relative error of the Gradient Boosting Regressor model](#4)
5. [Feature importance of the Gradient Boosting Regressor model](#5)
6. [Comparison of the first six features from the feature importance results](#6)
7. [Hyperparameter tuning of the model with the reduced features](#7)
8. [Relative error of the reduced data model](#8)

## Load data<a name="1"></a>

In [1]:
# pip install modin[ray] 
# pip install sidetable

In [2]:
import pandas as pd 

# Show every column of a wide frame instead of eliding the middle ones
pd.set_option('display.max_columns', None)

# Read the regression dataset, using the first CSV column as the row index
df = pd.read_csv('data_reg_basic.csv', index_col=[0])

# Report the frame's dimensions, then preview its first rows
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (26790, 64)


Unnamed: 0,CID,SID,Efficacy,SMILES,MW,MF,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity
0,135915053,56319965,163.334,CCCN1C(=CC(=C1C)C2=CSC(=N2)NC(=O)CCC3=NC4=CC=C...,435.5,C23H25N5O2S,117.0,2.8,31,2,5,7,1,17.8694,17.8694,0.082314,-0.034893,10.8275,12.0275,-0.087487,-0.173883,0.195432,0.001865,17.534,19.3722,0.172632,0.28235,6.6305,8.0824,-0.427647,-0.154356,3.5023,5.649,0.488571,0.292375,32.824767,0.010407,0.190597,0.098098,0.41,0.45,0.04,0.02,0.09,0.0,0.0,0.0,63.43,5.79,7.35,7.36,16.08,0.0,0.0,0.0,1.650372,1.485712,2.644446,2.396838,5.006424,3.429315,1.893184,1.430767,0.052
1,135915052,56318992,109.521,CCCC1=CC(=O)NC(=N1)SCC(=O)N(CCOC)C2=C(N(C(=O)N...,500.6,C23H28N6O5S,172.0,0.8,35,3,8,11,1,14.0287,14.0287,0.075508,0.076284,14.256,15.456,-0.022511,-0.013702,0.195432,0.000699,10.2185,12.2236,0.629553,0.386651,6.0154,7.884,0.149252,0.288979,6.9525,8.3908,0.974558,0.550393,29.040476,0.170102,0.382104,0.119195,0.37,0.44,0.08,0.02,0.1,0.0,0.0,0.0,55.19,5.64,15.98,6.4,16.79,0.0,0.0,0.0,0.984056,0.907654,1.698723,1.550431,1.469759,1.456786,0.865214,0.939601,0.054
2,135900543,51086642,98.1573,C1=CC(=CC=C1NC(=O)CSC2=NC(=C(C(=O)N2)NC(=O)C3=...,449.4,C19H14F3N5O3S,151.0,2.0,31,4,9,6,1,6.5161,6.5161,-0.159177,-0.104719,18.75,21.15,0.088285,0.143002,12.507661,0.037176,11.0932,13.3081,0.335645,0.46717,6.4155,8.2934,-0.448524,-0.690856,2.4547,5.5169,-0.139042,-0.422588,162.225931,0.211769,1.251976,1.686218,0.42,0.31,0.07,0.02,0.11,0.0,0.0,0.07,50.78,3.14,10.68,7.13,15.58,0.0,0.0,12.68,0.347525,0.30809,1.729125,1.604662,4.519167,2.412242,2.613558,1.503272,0.049
3,135900540,51086515,146.98,CC(=O)NC1=CC=C(C=C1)NC(=O)CSC2=NC(=C(C(=O)N2)N...,512.5,C23H24N6O6S,199.0,0.8,36,5,9,9,1,20.5259,21.5663,-0.049764,-0.087331,14.1034,14.1034,0.128934,0.200037,0.083006,0.000517,11.56,13.5814,0.346999,0.432999,10.3972,12.0706,-0.449562,-0.541443,2.8515,5.5493,-0.382651,-0.272418,83.276709,0.082873,0.2232,0.225223,0.38,0.4,0.1,0.02,0.1,0.0,0.0,0.0,53.9,4.72,18.73,6.26,16.4,0.0,0.0,0.0,1.455387,1.529156,1.111838,1.125164,4.054007,2.447408,3.646221,2.175157,0.052
4,135897134,56318894,163.041,CC(C)CN(C1=C(N(C(=O)NC1=O)CC2=CC=CC=C2)N)C(=O)...,502.6,C27H30N6O4,137.0,1.8,37,3,6,9,1,12.9638,12.9638,0.575062,0.532346,16.5215,17.7215,0.34646,0.176789,0.195432,0.003433,9.9702,11.1871,0.252543,0.277778,9.2767,10.4937,-0.090845,0.111323,4.5327,6.8431,0.168788,0.31839,11.70753,0.00211,0.04038,0.026634,0.4,0.45,0.06,0.0,0.09,0.0,0.0,0.0,64.53,6.02,12.73,0.0,16.72,0.0,0.0,0.0,0.784662,0.731529,1.074757,1.066078,2.199616,1.6348,2.046617,1.533472,0.059


In [3]:
# Check for NaN: evaluates to True if any cell of the frame is missing
df.isnull().values.any()

False

In [4]:
# Summary statistics for every column; include="all" adds the
# unique/top/freq rows for the non-numeric columns next to the numeric stats
df.describe(include="all")

Unnamed: 0,CID,SID,Efficacy,SMILES,MW,MF,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity
count,26790.0,26790.0,26790.0,26790,26790.0,26790,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0,26790.0
unique,,,,26790,,15265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,,,,CCCN1C(=CC(=C1C)C2=CSC(=N2)NC(=O)CCC3=NC4=CC=C...,,C18H17N3O2S,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,,,,1,,25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,10918740.0,26021930.0,130.902124,,362.219342,,91.129613,3.103576,25.184248,1.273647,5.159164,4.772004,1.0,10.820822,11.393794,0.026397,0.025054,9.34694,9.905757,-0.024719,-0.007829,6.587725,0.039534,11.341407,13.344028,0.004201479,0.005084,5.613876,7.600966,-0.002127,-0.001531,2.957842,4.770969,0.000684,-4.5e-05,62.27197,0.366787,0.4965853,0.47995,0.416009,0.419357,0.065514,0.017486,0.071291,0.001466,0.004536,0.004057,60.488541,5.221697,12.400523,6.311274,11.842106,1.106977,1.703862,0.922828,1.375404,1.317216,2.269236,1.854651,96.800158,29.25981,46.495549,17.401598,0.056346
std,26155650.0,16719740.0,30.207705,,79.512256,,34.758588,1.293311,5.61241,0.972059,1.843106,2.258578,0.0,3.614898,3.513774,0.313437,0.289196,3.577346,3.455948,0.381519,0.331921,42.901301,0.260756,2.856502,2.796471,0.3232232,0.288005,1.856924,1.771663,0.460464,0.331995,1.317354,1.435069,0.569092,0.36288,77.12382,1.350007,7.203959,7.110876,0.04387,0.06056,0.038843,0.019971,0.035989,0.006874,0.012777,0.015631,8.847334,1.381787,6.928816,6.758371,5.544879,4.920089,4.525238,3.299506,0.859841,0.699258,1.062891,0.608571,899.440646,428.96554,479.208717,284.085023,0.015052
min,237.0,842134.0,43.5771,,82.1,,0.0,-8.2,6.0,0.0,0.0,0.0,1.0,0.7145,1.3349,-1.557835,-1.175301,0.4125,0.4125,-1.517135,-1.214819,0.0,0.0,0.975,3.8999,-1.46522,-1.114339,0.166,2.2976,-2.273516,-1.409802,0.0001,0.0003,-3.597405,-5.387489,1.161843e-07,0.000109,2.000274e-07,2e-06,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152146,0.158514,0.537249,0.752498,0.596586,0.908398,0.210874,0.683794,0.0
25%,1480231.0,14745060.0,109.23475,,305.4,,67.9,2.3,21.0,1.0,4.0,3.0,1.0,8.2643,8.8381,-0.17375,-0.164448,6.7578,7.446975,-0.281499,-0.234983,0.0,2e-06,9.288925,11.3179,-0.1976979,-0.186527,4.241625,6.26685,-0.292486,-0.206794,2.2619,4.041175,-0.284096,-0.173399,27.12392,0.050191,0.04215651,0.041396,0.39,0.38,0.04,0.0,0.05,0.0,0.0,0.0,54.8,4.27,7.61,0.0,8.09,0.0,0.0,0.0,0.818009,0.830296,1.511822,1.390169,2.852956,2.202303,1.43392,1.300653,0.048
50%,3237418.0,24785750.0,129.762,,359.4,,88.2,3.2,25.0,1.0,5.0,5.0,1.0,10.3961,11.17855,0.014301,0.01272,9.1331,9.7201,-0.019787,-0.00567,0.082433,0.000188,11.1376,13.12355,-9.395224e-07,0.003506,5.5254,7.47655,2e-05,-1.7e-05,2.79705,4.74145,0.000678,0.000391,42.52487,0.124258,0.1055183,0.104203,0.42,0.42,0.06,0.02,0.07,0.0,0.0,0.0,60.83,5.15,12.24,6.98,11.37,0.0,0.0,0.0,1.16026,1.157257,1.997489,1.729193,3.913761,2.784937,1.897556,1.564289,0.054
75%,6899892.0,26730660.0,149.6905,,416.9,,112.0,3.9,29.0,2.0,6.0,6.0,1.0,12.994,13.718525,0.223271,0.211232,11.801275,12.0117,0.228826,0.223174,0.539235,0.003868,13.1608,15.130975,0.2051459,0.197531,6.844825,8.76045,0.289904,0.205278,3.740375,5.638575,0.283911,0.171741,70.77959,0.314035,0.2630848,0.261348,0.44,0.46,0.09,0.03,0.09,0.0,0.0,0.0,66.65,6.1,16.82,10.17,14.85,0.0,0.0,0.0,1.704289,1.631291,2.766201,2.196369,5.507507,3.571511,2.584801,1.932936,0.061


In [5]:
# List each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26790 entries, 0 to 26789
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CID                 26790 non-null  int64  
 1   SID                 26790 non-null  int64  
 2   Efficacy            26790 non-null  float64
 3   SMILES              26790 non-null  object 
 4   MW                  26790 non-null  float64
 5   MF                  26790 non-null  object 
 6   TPSA                26790 non-null  float64
 7   XL                  26790 non-null  float64
 8   HAC                 26790 non-null  int64  
 9   HBDC                26790 non-null  int64  
 10  HBAC                26790 non-null  int64  
 11  RBC                 26790 non-null  int64  
 12  CBUC                26790 non-null  int64  
 13  MMX6                26790 non-null  float64
 14  MMX                 26790 non-null  float64
 15  SX6                 26790 non-null  float64
 16  SX       

[<a href="#content">Back to top</a>]

## Regression Machine Learning <a name="2"></a>

In [6]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Separate the training columns from the target column 'Efficacy'.
# The identifier columns (CID, SID) and the string columns (SMILES, MF)
# are dropped together with the target, leaving only numeric features.
X = df.drop(columns=['CID', 'SID', 'SMILES','MF', 'Efficacy'])
y = df['Efficacy']

# Split the data set into train (80%) and test (20%) parts;
# random_state is fixed so the same split is produced on every run
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=5)
# Standardise the data points. The scaler is fitted on the training part
# only and then applied unchanged to the test part, so no test-set
# statistics leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train_unscaled)
X_test = sc.transform(X_test_unscaled)

# Print the shape of each part
print("Shapes:")
print("X_train: ", X_train.shape)
print("X_test:  ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test:  ", y_test.shape)

Shapes:
X_train:  (21432, 59)
X_test:   (5358, 59)
y_train:  (21432,)
y_test:   (5358,)


In [7]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate the algorithms that will be used, placing them in a dictionary.
# The tree-based estimators are seeded (same value as the train/test split)
# so that re-running the notebook reproduces the same comparison table.
regs = {"SVR":SVR(kernel='linear'),
        "DecisionTree":DecisionTreeRegressor(random_state=5),
        "RandomForest":RandomForestRegressor(random_state=5),
        "GradientBoost":GradientBoostingRegressor(random_state=5),}

In [8]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Create statistics with the results of training with different algorithms
def model_fit(regs):
    """Fit every regressor and tabulate its train/test error metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` objects.

    Parameters
    ----------
    regs : dict
        Maps an algorithm name to an estimator. Each estimator is fitted
        in place on ``X_train`` / ``y_train``.

    Returns
    -------
    fitted_model : dict
        The same names mapped to the (now fitted) estimator objects.
    model_result : pandas.DataFrame
        One row per algorithm with RMSE, MAE and R2 for the train and test
        parts, each rounded to two decimals.
    """
    columns = ['Algorithm', 'RMSE_Train', 'RMSE_Test',
               'MAE_Train', 'MAE_Test', 'R2_Train', 'R2_Test']
    fitted_model = {}
    records = []
    for model_name, model in regs.items():
        model.fit(X_train, y_train)
        fitted_model[model_name] = model
        # Predict once per split and reuse the result for all three metrics
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        records.append({
            'Algorithm': model_name,
            'RMSE_Train': round(root_mean_squared_error(y_train, pred_train), 2),
            'RMSE_Test': round(root_mean_squared_error(y_test, pred_test), 2),
            'MAE_Train': round(mean_absolute_error(y_train, pred_train), 2),
            'MAE_Test': round(mean_absolute_error(y_test, pred_test), 2),
            'R2_Train': round(r2_score(y_train, pred_train), 2),
            'R2_Test': round(r2_score(y_test, pred_test), 2),
        })
    # Build the frame in one go rather than appending row by row through the
    # private DataFrame._append; passing the columns keeps the layout stable
    # even when `regs` is empty.
    model_result = pd.DataFrame(records, columns=columns)
    return fitted_model, model_result

# Fit every regressor in `regs`, then rank the algorithms by their
# test-set MAE (lowest error first)
fitted_model, model_result = model_fit(regs)
model_result.sort_values(by=['MAE_Test'],ascending=True)

Unnamed: 0,Algorithm,RMSE_Train,RMSE_Test,MAE_Train,MAE_Test,R2_Train,R2_Test
2,RandomForest,11.03,29.08,8.63,23.19,0.87,0.05
3,GradientBoost,28.89,29.29,22.84,23.37,0.09,0.04
0,SVR,29.88,29.5,23.53,23.55,0.03,0.03
1,DecisionTree,0.15,42.26,0.0,33.34,1.0,-1.0


[<a href="#content">Back to top</a>]

## Cross-validation <a name="3"></a>

In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Create statistics with the results of cross-validation
def model_CV(regs):
    """Run 5-fold cross-validation for every regressor and tabulate the MAE.

    Reads the notebook-level ``X_train`` and ``y_train`` objects.

    Parameters
    ----------
    regs : dict
        Maps an algorithm name to an estimator.

    Returns
    -------
    fitted_model : dict
        The same names mapped to the estimator objects exactly as they were
        passed in; this function never calls ``fit`` on them itself.
    model_cv_result : pandas.DataFrame
        One row per algorithm with the mean MAE over the folds, its standard
        deviation, and the per-fold MAE values (all rounded to two decimals).
    """
    columns = ['Algorithm', 'CV_MAE', 'Sta Dev MAE', 'List of MAE']
    fitted_model = {}
    records = []
    for model_name, model in regs.items():
        fitted_model[model_name] = model
        neg_mae = cross_val_score(model, X_train, y_train, cv=5,
                                  scoring='neg_mean_absolute_error')
        # The scorer returns the negated MAE; flip the sign to get the MAE
        scores = -neg_mae
        records.append({
            'Algorithm': model_name,
            'CV_MAE': round(np.mean(scores), 2),
            'Sta Dev MAE': round(np.std(scores), 2),
            'List of MAE': np.round(scores, 2),
        })
    # Build the frame in one go rather than appending row by row through the
    # private DataFrame._append; passing the columns keeps the layout stable
    # even when `regs` is empty.
    model_cv_result = pd.DataFrame(records, columns=columns)
    return fitted_model, model_cv_result

# Cross-validate every regressor in `regs` and rank the algorithms by mean
# MAE (lowest first).
# NOTE: this rebinds `fitted_model`, replacing the dict returned by model_fit.
fitted_model, model_cv_result = model_CV(regs)
model_cv_result.sort_values(by=['CV_MAE'],ascending= True)

Unnamed: 0,Algorithm,CV_MAE,Sta Dev MAE,List of MAE
2,RandomForest,23.31,0.26,"[23.17, 23.27, 23.71, 22.96, 23.46]"
3,GradientBoost,23.45,0.25,"[23.26, 23.37, 23.84, 23.15, 23.66]"
0,SVR,23.61,0.21,"[23.4, 23.63, 23.92, 23.35, 23.75]"
1,DecisionTree,33.45,0.28,"[33.14, 33.21, 33.83, 33.34, 33.76]"


[<a href="#content">Back to top</a>]

## Calculate the relative error of the Gradient Boosting Regressor model  <a name="4"></a>

In [11]:
import sklearn.metrics as metrics
from sklearn.metrics import r2_score

# Instantiate and train a model (seeded so the reported metrics are repeatable)
model = GradientBoostingRegressor(random_state=5).fit(X_train, y_train)

# Predict on the held-out test part
pred = model.predict(X_test)

# Evaluate; every metric is reported with two decimals
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, pred),2))
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, pred),2))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, pred)), 2))
# R2 of this model on the test part (a perfect model would score 1.0)
print("R2 score (1.0 = perfect model):", round(r2_score(y_test, pred), 2))

Mean Absolute Error (MAE): 23.37
Mean Squared Error (MSE): 858.31
Root Mean Squared Error (RMSE): 29
R2 score for perfect model is: 0.04


In [12]:
# Wrap the held-out targets in a one-column frame; going through tolist()
# means the rows are simply numbered from 0 in test-set order
data_verify = pd.DataFrame({"Real Values": y_test.tolist()})

# Same for the model's predictions on the test part
data_predicted = pd.DataFrame({"Predicted Values": pred.tolist()})

# Put real and predicted values side by side, row for row
final_output = pd.concat([data_verify, data_predicted], axis="columns")

# Absolute error per row, and that error as a fraction of the real value
final_output["Difference"] = (
    final_output["Real Values"] - final_output["Predicted Values"]
).abs()
final_output["Relative proportion Difference/Real Value"] = (
    final_output["Difference"].div(final_output["Real Values"])
)

# Display the resulting data frame
final_output

Unnamed: 0,Real Values,Predicted Values,Difference,Relative proportion Difference/Real Value
0,93.9131,129.071853,35.158753,0.374375
1,184.9150,133.314664,51.600336,0.279049
2,134.7190,117.482561,17.236439,0.127944
3,146.6760,135.256592,11.419408,0.077855
4,162.1990,133.000295,29.198705,0.180018
...,...,...,...,...
5353,168.7410,147.710479,21.030521,0.124632
5354,124.2480,134.156776,9.908776,0.079750
5355,148.7570,130.981006,17.775994,0.119497
5356,126.7020,129.386354,2.684354,0.021186


In [13]:
# Mean of the relative error: the average over the test rows of
# |real - predicted| / real, expressed as a fraction
# (multiply by 100 for a percentage)
df_reg_rel_mean = final_output["Relative proportion Difference/Real Value"].mean()
print("Relative error: ", df_reg_rel_mean)

Relative error:  0.19218762226144664


[<a href="#content">Back to top</a>]