# ML regression to predict the Efficacy of an active G9a inhibitor. Dataset 3 (extended dataset with new features)

### Content   <a name="content"></a>

1. [Load data](#1)
2. [Regression Machine Learning](#2)
3. [Cross-validation](#3)
4. [Calculate the relative error of the Gradient Boosting Regressor model](#4)
5. [Feature importance of the Gradient Boosting Regressor model](#5)
6. [Comparison of the first six features from the feature importance results](#6)
7. [Hyperparameter tuning of the model with the reduced features](#7)
8. [Relative error of the reduced data model](#8)

## Load data<a name="1"></a>

In [1]:
# Optional packages, left commented out so a normal run installs nothing.
# NOTE(review): neither package appears to be imported in the cells that follow - confirm they are still needed.
# If they are, uncomment and prefer the `%pip` magic so the install targets the kernel's environment:
# pip install modin[ray] 
# pip install sidetable

In [2]:
import pandas as pd 

# Show every column when a frame is displayed instead of truncating wide frames
pd.set_option('display.max_columns', None)

# Load the extended regression dataset; the first CSV column becomes the index
df = pd.read_csv('data_reg_extended.csv', index_col=[0])

# Report the dimensions, then preview the first rows
print('Shape of df: ', df.shape)
df.head()

Shape of df:  (3890, 72)


Unnamed: 0,Efficacy,MW,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity,Solubility_at_pH_7_4,Similarity_1,Similarity_2,Similarity_3,C_rel**MMY6_3D,C_rel**MMZ6_3D,mean_C_rel**MM6_3D,mean_C_rel**C_rel_3D,mean_C**SX6_S,mean_H_rel**MM_3D,mean_H_rel**allAtoms_rel_3D,mean_H**SX6_S
0,72.9655,278.29,125.0,2.2,19,1,6,2,1,6.6342,8.9727,0.089578,-0.356796,10.0201,10.0201,-0.487187,-0.275276,10.703674,0.3062773,10.4533,13.4223,0.485431,0.053612,2.9936,5.1742,0.513676,0.355558,1.165,3.0878,-2.424383,-1.454583,154.967504,2.341853,0.810366,0.061954,0.38,0.34,0.1,0.03,0.14,0.0,0.0,0.0,47.48,3.62,17.25,11.52,20.13,0.0,0.0,0.0,0.662089,0.89547,3.491883,2.594082,8.97279,4.346881,2.569614,1.675691,0.065,0.1,0.261,0.265,0.269,0.055213,0.323928,4e-05,0.039159,6.513746,5.144812e-07,0.078038,1.071405
1,147.317,358.4,143.0,3.4,24,3,7,5,1,7.0834,7.2974,-0.070702,0.0284,12.5835,12.5835,0.114455,0.268933,6e-06,2.241061e-07,11.7024,13.8931,0.065879,0.130764,6.9136,8.9136,-0.930758,-0.46698,2.4349,4.3407,-0.21025,-0.047959,69.723704,0.040203,0.078547,0.012436,0.42,0.37,0.05,0.05,0.11,0.0,0.0,0.0,53.62,3.94,8.93,17.89,15.63,0.0,0.0,0.0,0.562912,0.579918,1.692664,1.558641,4.806111,3.200659,2.839377,2.053494,0.054,13.7,0.382,0.356,0.328,0.002485,0.120962,3.9e-05,0.110308,1.299951,1.002267e-06,0.127872,1.196381
2,174.885,293.28,95.1,3.1,22,2,6,3,1,6.4686,6.4686,0.643863,0.698222,11.2534,11.8613,0.31901,0.36589,0.003303,5.384583e-06,10.1926,12.9043,0.323032,0.32477,4.9932,7.6809,0.171935,-0.007312,0.698,1.1042,-0.315743,-0.25367,8.764451,1.172396,0.104192,0.209574,0.48,0.33,0.09,0.0,0.09,0.0,0.0,0.0,65.53,3.78,16.37,0.0,14.33,0.0,0.0,0.0,0.574813,0.545353,2.041296,1.680051,14.602579,11.68656,7.153582,6.956077,0.05,24.7,0.358,0.338,0.325,0.025608,0.59911,0.000564,0.076263,3.861615,6.120113e-07,0.051906,1.540111
3,189.224,324.4,70.9,3.8,24,2,4,6,1,15.3302,15.3302,-0.063507,-0.037934,7.2036,7.2036,-0.223659,-0.149767,0.0,3.739519e-09,12.6857,14.2391,-0.324084,-0.30544,7.6102,8.9715,-0.653389,-0.602524,2.7585,4.488,0.219574,0.105079,13.375666,0.014993,0.059015,0.040756,0.43,0.45,0.07,0.0,0.05,0.0,0.0,0.0,70.35,6.21,14.8,0.0,8.64,0.0,0.0,0.0,2.12813,2.12813,1.666934,1.587148,4.598767,3.172705,2.758818,1.998997,0.059,1.0,0.263,0.252,0.25,0.001624,0.097482,2.2e-05,0.120999,0.25196,1.153608e-05,0.187875,0.572478
4,166.868,410.5,124.0,4.4,29,3,7,7,1,16.8859,16.8859,0.085001,-0.000425,9.623,11.9205,0.105757,0.021858,9.625909,0.05822748,13.1863,15.5571,-0.004718,0.086889,7.4717,9.5477,1.094927,0.563269,3.4022,5.2265,-0.578542,-0.502618,80.618934,0.802356,0.111462,0.517832,0.41,0.43,0.06,0.02,0.08,0.0,0.0,0.0,61.45,5.4,11.69,7.81,13.65,0.0,0.0,0.0,1.754744,1.416543,1.764833,1.629408,3.875816,2.976581,2.196138,1.826787,0.054,9.7,0.404,0.372,0.339,0.001279,0.048152,8e-06,0.12667,0.980758,1.985327e-06,0.182632,1.15781


In [3]:
# Missing-value check: evaluates to True if at least one cell of df is NaN
df.isna().to_numpy().any()

False

In [4]:
# Summary statistics for every column; include="all" also reports non-numeric columns if any exist
df.describe(include="all")

Unnamed: 0,Efficacy,MW,TPSA,XL,HAC,HBDC,HBAC,RBC,CBUC,MMX6,MMX,SX6,SX,MMY6,MMY,SY6,SY,Volume_1,Volume_2,MMX6_3D,MMX_3D,SX6_3D,SX_3D,MMY6_3D,MMY_3D,SY6_3D,SY_3D,MMZ6_3D,MMZ_3D,SZ6_3D,SZ_3D,Volume_1_3D,XY_3D_volume,XZ_3D_volume,YZ_3D_volume,C_relative,H_relative,O_relative,S_relative,N_relative,Br_relative,Cl_relative,F_relative,C,H,O,S,N,Br,Cl,F,C_rel_2D,allAtoms_rel_2D,C_rel_XY_3D,allAtoms_rel_XY_3D,C_rel_XZ_3D,allAtoms_rel_XZ_3D,C_rel_YZ_3D,allAtoms_rel_YZ_3D,Similarity,Solubility_at_pH_7_4,Similarity_1,Similarity_2,Similarity_3,C_rel**MMY6_3D,C_rel**MMZ6_3D,mean_C_rel**MM6_3D,mean_C_rel**C_rel_3D,mean_C**SX6_S,mean_H_rel**MM_3D,mean_H_rel**allAtoms_rel_3D,mean_H**SX6_S
count,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0,3890.0
mean,132.911167,299.861143,79.775416,2.744679,20.894859,1.320566,4.268123,3.651414,1.0,9.596785,10.286804,0.025353,0.023352,8.158382,8.763759,-0.052072,-0.024514,8.851506,0.05244091,10.221921,12.306994,0.008828,0.01704,4.622067,6.681055,0.003001,0.000904,2.297309,4.037785,-0.00676,-0.008338,66.32588,0.484506,0.8630829,0.762899,0.414787,0.416226,0.059111,0.018355,0.080378,0.00119,0.00518,0.004141,60.254805,5.19419,11.088846,6.577746,13.248085,0.881794,1.905689,0.84892,1.405983,1.341871,2.4688,1.934973,157.000569,59.782415,69.179706,33.285931,0.055878,24.361632,0.285496,0.277494,0.268793,0.034125,0.212593,0.0009184778,0.1290639,2.720137,0.0002116497,0.182918,1.128723
std,29.294427,46.148193,31.013529,1.136054,3.446987,0.910392,1.540684,1.73341,0.0,2.911908,2.751989,0.303721,0.26867,2.887211,2.710905,0.365724,0.298293,52.12942,0.2911455,2.44256,2.368003,0.324426,0.259901,1.468863,1.403279,0.442323,0.292514,1.103894,1.322431,0.585349,0.407594,84.87196,1.678814,11.43204,11.348781,0.045202,0.067071,0.042085,0.021932,0.04079,0.006509,0.014802,0.016681,9.460849,1.503441,7.417339,7.322493,6.120085,4.670897,5.122398,3.182549,0.93072,0.69647,1.11241,0.600815,981.269775,571.847622,468.845781,339.45173,0.01549,17.227432,0.07615,0.068609,0.061669,0.040117,0.236257,0.003474711,0.06770598,8.513598,0.0007383254,0.085113,0.512968
min,62.0004,97.07,3.2,-3.7,7.0,0.0,1.0,0.0,1.0,1.9635,2.6052,-1.212065,-0.877719,0.75,1.95,-1.308379,-1.071139,0.0,2.2600949999999998e-38,2.1454,4.5893,-1.46522,-0.917542,0.3947,3.0249,-2.090206,-1.113708,0.0003,0.0005,-3.10206,-5.387489,3.807599e-07,0.000129,2.000274e-07,7e-06,0.17,0.14,0.0,0.0,0.0,0.0,0.0,0.0,20.83,0.89,0.0,0.0,0.0,0.0,0.0,0.0,0.158514,0.158514,0.576067,0.798337,1.002917,0.908398,0.260297,0.683794,0.011,0.1,0.04,0.042,0.05,3.8e-05,0.000857,6.968734e-09,6.417663e-12,0.002349,5.782439e-10,0.003183,0.131177
25%,112.4935,271.3025,58.9,2.0,19.0,1.0,3.0,2.0,1.0,7.5411,8.485,-0.166488,-0.143731,5.98745,6.739075,-0.282742,-0.223614,1.5798e-18,3.77369e-06,8.5309,10.60855,-0.185455,-0.146116,3.4837,5.5945,-0.260382,-0.176874,1.589575,3.3129,-0.26812,-0.17396,28.26394,0.061344,0.04021581,0.042548,0.38,0.37,0.03,0.0,0.05,0.0,0.0,0.0,53.9825,4.12,5.4,0.0,8.91,0.0,0.0,0.0,0.822175,0.843109,1.669144,1.480358,3.208697,2.371797,1.467018,1.320636,0.046,6.3,0.232,0.231,0.228,0.007255,0.071979,2.72444e-05,0.079934,0.4692,3.693498e-06,0.122392,0.795213
50%,131.5865,303.4,78.5,2.8,21.0,1.0,4.0,3.5,1.0,9.1466,10.1148,0.004392,0.00404,7.91875,8.7071,-0.043705,-0.024694,0.08336591,0.0003773034,10.0992,12.18475,0.002707,0.018748,4.53375,6.56185,0.000414,0.00296,2.41765,4.26615,-0.001453,-0.002105,46.00758,0.161662,0.1104697,0.112316,0.42,0.42,0.06,0.02,0.07,0.0,0.0,0.0,60.6,5.08,10.645,8.52,12.72,0.0,0.0,0.0,1.155311,1.152417,2.247356,1.844886,4.373008,2.976804,1.978111,1.599595,0.053,25.8,0.285,0.278,0.27,0.018931,0.123886,0.0001510436,0.1217025,1.010592,2.291465e-05,0.179361,1.028124
75%,151.59975,329.5,98.5,3.5,23.0,2.0,5.0,5.0,1.0,11.4544,12.032875,0.198907,0.181786,9.9166,10.808025,0.18526,0.186429,1.047186,0.008009357,11.753475,13.8047,0.199701,0.189755,5.5917,7.576425,0.269311,0.181224,2.90105,4.8394,0.258958,0.151851,76.99563,0.412277,0.2994205,0.291505,0.44,0.46,0.08,0.03,0.1,0.0,0.0,0.0,66.8775,6.14,15.585,11.04,16.86,0.0,0.0,0.0,1.735105,1.687518,3.032711,2.285653,6.758766,3.994999,2.921469,2.052029,0.062,39.7,0.339,0.325,0.312,0.046881,0.248411,0.0006244798,0.1686141,2.254865,0.0001261013,0.241317,1.34038
max,289.972,575.7,206.0,6.0,42.0,7.0,11.0,11.0,1.0,22.8673,22.8673,1.221301,1.064085,18.6015,18.6015,1.471536,1.017549,1563.001,10.43078,18.3838,21.153,1.360924,0.889557,11.3365,12.8395,1.920991,1.247962,6.9126,8.5461,3.854187,3.894075,2783.715,71.047919,588.567,608.859044,0.56,0.61,0.25,0.2,0.33,0.1,0.21,0.25,86.05,10.74,39.11,53.95,51.83,47.14,46.97,36.94,25.304933,6.399333,14.72266,4.970768,16189.25,16913.6,9390.0,9837.2,0.173,68.8,0.549,0.606,0.551,0.559853,0.999754,0.1268708,0.4365464,235.64077,0.01632728,0.488617,4.852181


In [5]:
# Column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3890 entries, 0 to 3889
Data columns (total 72 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Efficacy                     3890 non-null   float64
 1   MW                           3890 non-null   float64
 2   TPSA                         3890 non-null   float64
 3   XL                           3890 non-null   float64
 4   HAC                          3890 non-null   int64  
 5   HBDC                         3890 non-null   int64  
 6   HBAC                         3890 non-null   int64  
 7   RBC                          3890 non-null   int64  
 8   CBUC                         3890 non-null   int64  
 9   MMX6                         3890 non-null   float64
 10  MMX                          3890 non-null   float64
 11  SX6                          3890 non-null   float64
 12  SX                           3890 non-null   float64
 13  MMY6                   

[<a href="#content">Back to top</a>]

## Regression Machine Learning <a name="2"></a>

In [6]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# The regression target is 'Efficacy'; every other column is used as a feature
y = df['Efficacy']
X = df.drop(columns=['Efficacy'])

# Hold out 20 % of the rows as the test part (fixed seed so the split is repeatable)
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=5)

# Standardise the features: the scaler is fitted on the training part only and
# then applied unchanged to the test part
sc = StandardScaler()
X_train = sc.fit_transform(X_train_unscaled)
X_test = sc.transform(X_test_unscaled)

# Print the shape of each part
print("Shapes:")
for label, part in (("X_train: ", X_train), ("X_test:  ", X_test),
                    ("y_train: ", y_train), ("y_test:  ", y_test)):
    print(label, part.shape)

Shapes:
X_train:  (3112, 71)
X_test:   (778, 71)
y_train:  (3112,)
y_test:   (778,)


In [7]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate the algorithms that will be used, placing them in a dictionary
# keyed by a short display name.
# The tree-based estimators draw random numbers while fitting, so they are given
# a fixed seed (the same value used for the train/test split); without it the
# result tables change on every run.
SEED = 5
regs = {"SVR": SVR(kernel='linear'),
        "DecisionTree": DecisionTreeRegressor(random_state=SEED),
        "RandomForest": RandomForestRegressor(random_state=SEED),
        "GradientBoost": GradientBoostingRegressor(random_state=SEED),}

In [8]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Create statistics with the results of training with different algorithms
def model_fit(regs):
    """Fit every regressor and tabulate its train/test error metrics.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    regs : dict
        Maps an algorithm name to an estimator instance. Each estimator is
        fitted in place on the training data.

    Returns
    -------
    fitted_model : dict
        Name -> estimator mapping holding the (now fitted) estimators.
    model_result : pandas.DataFrame
        One row per algorithm with RMSE, MAE and R2 for the training and test
        parts, each rounded to two decimals.
    """
    fitted_model = {}
    records = []
    for model_name, model in regs.items():
        model.fit(X_train, y_train)
        fitted_model[model_name] = model

        # Predict once per split and reuse the result for all three metrics
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

        records.append({
            'Algorithm': model_name,
            'RMSE_Train': round(root_mean_squared_error(y_train, pred_train), 2),
            'RMSE_Test': round(root_mean_squared_error(y_test, pred_test), 2),
            'MAE_Train': round(mean_absolute_error(y_train, pred_train), 2),
            'MAE_Test': round(mean_absolute_error(y_test, pred_test), 2),
            'R2_Train': round(r2_score(y_train, pred_train), 2),
            'R2_Test': round(r2_score(y_test, pred_test), 2),
        })

    # Build the frame in one step instead of growing it row by row through the
    # private DataFrame._append, which is not part of the public pandas API
    model_result = pd.DataFrame(records)
    return fitted_model, model_result

fitted_model, model_result = model_fit(regs)
model_result.sort_values(by=['MAE_Test'],ascending=True)

Unnamed: 0,Algorithm,RMSE_Train,RMSE_Test,MAE_Train,MAE_Test,R2_Train,R2_Test
2,RandomForest,10.69,28.99,8.37,22.47,0.87,0.04
3,GradientBoost,24.54,29.29,19.46,22.8,0.3,0.02
0,SVR,28.43,29.89,22.18,23.32,0.05,-0.03
1,DecisionTree,0.0,40.39,0.0,32.37,1.0,-0.87


[<a href="#content">Back to top</a>]

## Cross-validation <a name="3"></a>

In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Create statistics with the results of cross-validation
def model_CV(regs, cv=5):
    """Cross-validate every regressor on the training data and tabulate the MAE.

    Uses the notebook-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    regs : dict
        Maps an algorithm name to an estimator instance.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    fitted_model : dict
        Name -> estimator mapping containing the estimators exactly as they
        were passed in. ``cross_val_score`` fits clones, so this function does
        not itself fit (or refit) the objects in ``regs``.
    model_cv_result : pandas.DataFrame
        One row per algorithm with the mean MAE, its standard deviation and the
        per-fold MAE values, rounded to two decimals.
    """
    fitted_model = {}
    records = []
    for model_name, model in regs.items():
        fitted_model[model_name] = model
        # The scorer returns negated MAE (higher is better); flip the sign back
        scores = -cross_val_score(model, X_train, y_train, cv=cv,
                                  scoring='neg_mean_absolute_error')
        records.append({
            'Algorithm': model_name,
            'CV_MAE': round(np.mean(scores), 2),
            'Sta Dev MAE': round(np.std(scores), 2),
            'List of MAE': np.round(scores, 2),
        })

    # Build the frame in one step instead of growing it row by row through the
    # private DataFrame._append, which is not part of the public pandas API
    model_cv_result = pd.DataFrame(records)
    return fitted_model, model_cv_result

fitted_model, model_cv_result = model_CV(regs)
model_cv_result.sort_values(by=['CV_MAE'],ascending= True)

Unnamed: 0,Algorithm,CV_MAE,Sta Dev MAE,List of MAE
2,RandomForest,22.55,0.24,"[22.85, 22.15, 22.74, 22.49, 22.5]"
3,GradientBoost,22.61,0.34,"[22.92, 22.05, 22.96, 22.4, 22.75]"
0,SVR,22.74,0.34,"[23.33, 22.46, 22.9, 22.56, 22.46]"
1,DecisionTree,32.14,0.61,"[33.03, 31.64, 31.31, 32.46, 32.27]"


[<a href="#content">Back to top</a>]

## Calculate the relative error of the Gradient Boosting Regressor model  <a name="4"></a>

In [11]:
import sklearn.metrics as metrics
from sklearn.metrics import r2_score

# Instantiate and train a model
# NOTE(review): this section's heading names the Gradient Boosting Regressor, but
# the estimator fitted here is an SVR with default settings - confirm which model
# is intended before relying on the numbers below.
model = SVR().fit(X_train, y_train)

# Predict 
pred = model.predict(X_test)

# Evaluate (MSE is computed once and reused for the RMSE; every metric is
# reported with two decimals)
test_mae = metrics.mean_absolute_error(y_test, pred)
test_mse = metrics.mean_squared_error(y_test, pred)
print('Mean Absolute Error (MAE):', round(test_mae, 2))
print('Mean Squared Error (MSE):', round(test_mse, 2))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(test_mse), 2))
# This is the R2 of the fitted model on the test part, not of a "perfect" model
print("R2 score:", round(r2_score(y_test, pred), 2))

Mean Absolute Error (MAE): 22.92
Mean Squared Error (MSE): 864.81
Root Mean Squared Error (RMSE): 29
R2 score for perfect model is: 0.01


In [12]:
# One-column frames holding the true test targets and the model predictions;
# both get a fresh 0..n-1 index, so rows line up by position
data_verify = pd.DataFrame({"Real Values": y_test.tolist()})
data_predicted = pd.DataFrame({"Predicted Values": pred.tolist()})

# Put the two columns side by side
final_output = data_verify.join(data_predicted)

# Absolute error per sample, and that error as a fraction of the true value
abs_error = (final_output["Real Values"] - final_output["Predicted Values"]).abs()
final_output["Difference"] = abs_error
final_output["Relative proportion Difference/Real Value"] = abs_error / final_output["Real Values"]

# Display the resulting data frame
final_output

Unnamed: 0,Real Values,Predicted Values,Difference,Relative proportion Difference/Real Value
0,173.407,135.141082,38.265918,0.220671
1,162.565,129.411074,33.153926,0.203943
2,124.315,138.100866,13.785866,0.110895
3,152.531,129.379056,23.151944,0.151785
4,125.828,133.674898,7.846898,0.062362
...,...,...,...,...
773,147.772,130.352981,17.419019,0.117878
774,134.144,132.477435,1.666565,0.012424
775,123.336,129.008660,5.672660,0.045994
776,139.113,138.340934,0.772066,0.005550


In [13]:
# Average of the per-sample relative errors (absolute error divided by the real
# value); this equals the mean absolute percentage error, expressed as a
# fraction, as long as the real values are positive
relative_errors = final_output["Relative proportion Difference/Real Value"]
df_reg_rel_mean = relative_errors.mean()
print("Relative error: ", df_reg_rel_mean)

Relative error:  0.18045225184839414


[<a href="#content">Back to top</a>]