This notebook benchmarks the performance of three regression models (Random Forest, Gradient Boosted Trees and Light Gradient Boosting Machine) trained on 4 resampled training sets (RO, GN, SMOTE, ADASYN) and on the original training set (with 500 features selected by Information Gain). Every model is evaluated on the same held-out test set. Lastly, each model's MSE is also reported across 3 intervals of Topt (8-29°C, 30-36°C, 37-99°C).


# **Import libraries and data**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
import lightgbm
import glob

In [None]:
# Mount Google Drive (Colab runtime) so the CSV files under /content/drive can be read.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [74]:
# Read the 4 resampled training sets (RO, ADASYN, GN, SMOTE) and the original training set.
# The benchmark sections below select a set by its position in train_set_list, so the
# load order is pinned explicitly instead of relying on glob.glob(), whose result order
# is not guaranteed. A missing file raises FileNotFoundError from pd.read_csv.
path = '/content/drive/My Drive/data/resampled_train_sets/'
train_file_names = [
    'data_train_ig_ro.csv',      # [0] Random Oversampling
    'data_train_ig_adasyn.csv',  # [1] ADASYN
    'data_train_ig_gn.csv',      # [2] Gaussian Noise
    'data_train_ig_smote.csv',   # [3] SMOTE
    'data_train.csv',            # [4] original training set
]
all_files = [path + file_name for file_name in train_file_names]
train_set_list = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in all_files]

In [None]:
# Show the CSV paths in load order; train_set_list holds the frames in this same order.
all_files

['/content/drive/My Drive/data/resampled_train_sets/data_train_ig_ro.csv',
 '/content/drive/My Drive/data/resampled_train_sets/data_train_ig_adasyn.csv',
 '/content/drive/My Drive/data/resampled_train_sets/data_train_ig_gn.csv',
 '/content/drive/My Drive/data/resampled_train_sets/data_train_ig_smote.csv',
 '/content/drive/My Drive/data/resampled_train_sets/data_train.csv']

In [None]:
# Preview the first loaded training set (index 0 of train_set_list).
train_set_list[0]

Unnamed: 0,seq_TEMP,A,C,E,I,K,L,N,Q,R,...,Normalized van der Waals Volume-T2332,Polarity-T1331,Polarizability-T2332,Charge-T1221,Charge-T1331,Solvent accessibility-T1221,Hydrophobicity_CASG920101-G1D100,Hydrophobicity_FASG890101-G1D100,Normalized van der Waals Volume-G1D100,Charge-G2D100
0,52,0.080292,0.004866,0.026764,0.046229,0.017032,0.038929,0.065693,0.043796,0.034063,...,0.136585,0.204878,0.158537,0.085366,0.007317,0.231707,100.000000,99.756691,100.000000,100.000000
1,53,0.062147,0.011299,0.079096,0.080979,0.058380,0.084746,0.035782,0.035782,0.060264,...,0.211321,0.281132,0.252830,0.184906,0.043396,0.320755,100.000000,100.000000,99.435028,99.623352
2,53,0.093750,0.004808,0.072115,0.057692,0.026442,0.100962,0.028846,0.031250,0.074519,...,0.187952,0.257831,0.219277,0.151807,0.031325,0.293976,99.759615,99.038462,99.759615,100.000000
3,53,0.144495,0.013761,0.059633,0.018349,0.011468,0.146789,0.016055,0.006881,0.121560,...,0.140230,0.200000,0.177011,0.209195,0.025287,0.298851,100.000000,100.000000,100.000000,100.000000
4,54,0.080645,0.010081,0.092742,0.046371,0.040323,0.086694,0.034274,0.034274,0.062500,...,0.187879,0.274747,0.222222,0.167677,0.024242,0.307071,100.000000,100.000000,99.798387,99.798387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2579,50,0.037520,0.011419,0.073409,0.053834,0.070147,0.091354,0.071778,0.027732,0.030995,...,0.245098,0.303922,0.284314,0.142157,0.029412,0.302288,100.000000,100.000000,99.184339,99.673736
2580,50,0.081081,0.010811,0.061261,0.054054,0.021622,0.120721,0.028829,0.039640,0.061261,...,0.185921,0.211191,0.222022,0.137184,0.021661,0.270758,100.000000,100.000000,100.000000,99.819820
2581,51,0.100427,0.012821,0.057692,0.059829,0.089744,0.091880,0.040598,0.008547,0.025641,...,0.164882,0.222698,0.188437,0.167024,0.036403,0.284797,100.000000,100.000000,99.786325,99.786325
2582,51,0.126147,0.004587,0.068807,0.064220,0.052752,0.087156,0.038991,0.027523,0.032110,...,0.135632,0.236782,0.163218,0.119540,0.032184,0.287356,99.770642,99.770642,99.541284,100.000000


# **Benchmark of performances of the applied models**

In [76]:
def load_data(train_df=None):
    """Split a training frame into features/target and load the test set with its Topt bins.

    Parameters
    ----------
    train_df : pandas.DataFrame, optional
        Training data with a 'seq_TEMP' target column; every other column is
        used as a feature. When omitted, the notebook-level global ``df`` is
        used, so existing ``load_data()`` calls keep working unchanged.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test,
        X_test4_29, y_test4_29, X_test30_36, y_test30_36, X_test37_99, y_test37_99)``.
        ``X_test`` and ``y_test`` are returned exactly as read from the CSV
        files; the binned pairs are the test rows with ``seq_TEMP < 30``,
        ``30 <= seq_TEMP < 37`` and ``seq_TEMP >= 37`` respectively.
    """
    if train_df is None:
        # Backward-compatible fallback: use the global set by the calling cell.
        train_df = df
    y_train = train_df['seq_TEMP']
    X_train = train_df.drop('seq_TEMP', axis=1)

    y_test = pd.read_csv("/content/drive/My Drive/data/y_test_0602.csv")
    X_test = pd.read_csv("/content/drive/My Drive/data/X_test_ig_0523.csv")

    # Join target and features so each bin can be cut out with one row mask;
    # the old index is discarded rather than being added and dropped as a column.
    data_test = pd.concat([y_test, X_test], axis=1).reset_index(drop=True)

    temperature = data_test['seq_TEMP']
    bin_masks = (
        temperature < 30,
        (temperature >= 30) & (temperature < 37),
        temperature >= 37,
    )
    binned = []
    for mask in bin_masks:
        subset = data_test[mask]
        binned.append(subset.drop(['seq_TEMP'], axis=1))
        binned.append(subset['seq_TEMP'])

    return (X_train, y_train, X_test, y_test, *binned)

def evaluate_model(X_train, y_train, X_test, y_test, X_test4_29, y_test4_29, X_test30_36, y_test30_36, X_test37_99, y_test37_99, model):
    """Fit ``model`` on the training data and score it on the full test set and on each bin.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``model.fit``.
    X_test, y_test
        Full test features and target; ``y_test`` must expose ``.values``
        (it is flattened to 1-D before scoring).
    X_test4_29, y_test4_29, X_test30_36, y_test30_36, X_test37_99, y_test37_99
        Feature/target pairs for the three test bins.
    model
        Estimator (or pipeline) providing ``fit`` and ``predict``. It is
        fitted in place.

    Returns
    -------
    tuple
        ``(r2, mae, mse, rmse, mse4_29, mse30_36, mse37_99)``.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_true = y_test.values.ravel()
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # RMSE is the square root of the MSE computed above; no need to rescore.
    rmse = mse ** 0.5

    # Evaluate MSE across bins
    bins = (
        (X_test4_29, y_test4_29),
        (X_test30_36, y_test30_36),
        (X_test37_99, y_test37_99),
    )
    bin_mses = [mean_squared_error(y_bin, model.predict(X_bin)) for X_bin, y_bin in bins]

    return (r2, mae, mse, rmse, *bin_mses)

def get_models():
    """Build the benchmarked regressors and their short display names.

    Returns
    -------
    tuple of (list, list)
        ``(models, names)`` in matching order: Random Forest ('RF'),
        Gradient Boosting ('GBR') and LightGBM ('LGBM'), each created
        with ``random_state=0``.
    """
    labelled_models = [
        ('RF', RandomForestRegressor(random_state=0)),
        ('GBR', GradientBoostingRegressor(random_state=0)),
        ('LGBM', LGBMRegressor(random_state=0)),
    ]
    models = [estimator for _, estimator in labelled_models]
    names = [label for label, _ in labelled_models]
    return models, names

## **Random Oversampling**

In [77]:
# Training set at index 0; load_data() picks it up through the global `df`.
df = train_set_list[0]

(X_train, y_train, X_test, y_test,
 X_test4_29, y_test4_29,
 X_test30_36, y_test30_36,
 X_test37_99, y_test37_99) = load_data()

models, names = get_models()
results = []

# Fit and score every model; each printed tuple is
# (r2, mae, mse, rmse, mse4_29, mse30_36, mse37_99).
for name, estimator in zip(names, models):
    pipeline = Pipeline(steps=[('m', estimator)])
    scores = evaluate_model(
        X_train, y_train, X_test, y_test,
        X_test4_29, y_test4_29,
        X_test30_36, y_test30_36,
        X_test37_99, y_test37_99,
        pipeline,
    )
    results.append(scores)
    print(name, scores)

RF (0.6957037968150084, 7.762737252663622, 120.7530192315766, 10.988767866852799, 83.34684328841809, 47.96057313692038, 211.04079663608564)
GBR (0.6915556373947391, 7.876116192583182, 122.39912184149617, 11.06341366132064, 79.32895624523498, 46.68698976349359, 220.06450532378884)
LGBM (0.7249149947898914, 7.201409096463164, 109.16122047129416, 10.448024716246328, 73.5429376652877, 42.157992314329796, 193.3392430390032)


## **ADASYN**

In [78]:
# Training set at index 1; load_data() picks it up through the global `df`.
df = train_set_list[1]

(X_train, y_train, X_test, y_test,
 X_test4_29, y_test4_29,
 X_test30_36, y_test30_36,
 X_test37_99, y_test37_99) = load_data()

models, names = get_models()
results = []

# Fit and score every model; each printed tuple is
# (r2, mae, mse, rmse, mse4_29, mse30_36, mse37_99).
for name, estimator in zip(names, models):
    pipeline = Pipeline(steps=[('m', estimator)])
    scores = evaluate_model(
        X_train, y_train, X_test, y_test,
        X_test4_29, y_test4_29,
        X_test30_36, y_test30_36,
        X_test37_99, y_test37_99,
        pipeline,
    )
    results.append(scores)
    print(name, scores)

RF (-0.1290884101184786, 12.305099885844749, 448.05302555254315, 21.167263062392905, 25.522673750000003, 8.753836132983377, 1170.4641811926604)
GBR (-0.017957066137819666, 11.948245875389912, 403.9530822194699, 20.098584084941653, 28.271829174510565, 10.108024767569955, 1048.7928811217503)
LGBM (-0.08651243856975865, 12.104497800919995, 431.15772072318856, 20.764337714533266, 25.34625156666148, 10.037930074605868, 1124.3650914446428)


## **Gaussian Noise**

In [79]:
# Training set at index 2; load_data() picks it up through the global `df`.
df = train_set_list[2]

(X_train, y_train, X_test, y_test,
 X_test4_29, y_test4_29,
 X_test30_36, y_test30_36,
 X_test37_99, y_test37_99) = load_data()

models, names = get_models()
results = []

# Fit and score every model; each printed tuple is
# (r2, mae, mse, rmse, mse4_29, mse30_36, mse37_99).
for name, estimator in zip(names, models):
    pipeline = Pipeline(steps=[('m', estimator)])
    scores = evaluate_model(
        X_train, y_train, X_test, y_test,
        X_test4_29, y_test4_29,
        X_test30_36, y_test30_36,
        X_test37_99, y_test37_99,
        pipeline,
    )
    results.append(scores)
    print(name, scores)

RF (0.6941422938793327, 7.781124238964992, 121.37266611526509, 11.016926346094227, 85.64015575037664, 51.11388842093176, 208.1825134914203)
GBR (0.6758690621768707, 8.162366673198596, 128.62398202422173, 11.34125134296131, 86.6336336816733, 55.31041242152445, 223.45211486867709)
LGBM (0.7127893578502582, 7.4160221647413005, 113.97300338294785, 10.675813944751372, 75.89224110092668, 47.09020744722868, 200.27898210120142)


## **SMOTE**

In [80]:
# Training set at index 3; load_data() picks it up through the global `df`.
df = train_set_list[3]

(X_train, y_train, X_test, y_test,
 X_test4_29, y_test4_29,
 X_test30_36, y_test30_36,
 X_test37_99, y_test37_99) = load_data()

models, names = get_models()
results = []

# Fit and score every model; each printed tuple is
# (r2, mae, mse, rmse, mse4_29, mse30_36, mse37_99).
for name, estimator in zip(names, models):
    pipeline = Pipeline(steps=[('m', estimator)])
    scores = evaluate_model(
        X_train, y_train, X_test, y_test,
        X_test4_29, y_test4_29,
        X_test30_36, y_test30_36,
        X_test37_99, y_test37_99,
        pipeline,
    )
    results.append(scores)
    print(name, scores)

RF (0.7011044345660825, 7.705865296803653, 118.60989911574073, 10.890817192283633, 82.85025763700564, 50.38615511909449, 203.86349303431876)
GBR (0.6880754572999039, 7.985513201902159, 123.78015206639677, 11.125652882702964, 80.5359093434398, 43.777894762269476, 224.93496845330998)
LGBM (0.6991913800625438, 7.596470711626616, 119.36905123410098, 10.925614455677126, 78.266862972565, 45.507562236435554, 213.82153974345914)


## **Original**

In [81]:
# Training set at index 4; load_data() picks it up through the global `df`.
df = train_set_list[4]

(X_train, y_train, X_test, y_test,
 X_test4_29, y_test4_29,
 X_test30_36, y_test30_36,
 X_test37_99, y_test37_99) = load_data()

models, names = get_models()
results = []

# Fit and score every model; each printed tuple is
# (r2, mae, mse, rmse, mse4_29, mse30_36, mse37_99).
for name, estimator in zip(names, models):
    pipeline = Pipeline(steps=[('m', estimator)])
    scores = evaluate_model(
        X_train, y_train, X_test, y_test,
        X_test4_29, y_test4_29,
        X_test30_36, y_test30_36,
        X_test37_99, y_test37_99,
        pipeline,
    )
    results.append(scores)
    print(name, scores)

RF (0.6999753694234909, 7.772682648401827, 119.05794290810503, 10.911367600264645, 82.46396095338983, 45.64270228838582, 209.09676796636086)
GBR (0.7003098813109021, 7.732855622482914, 118.92519948261707, 10.905283099608972, 81.7515846137292, 41.667413751086706, 212.47166419861293)
LGBM (0.7178503825894529, 7.366559016559, 111.9646509576896, 10.581335027192438, 73.59830116482375, 48.92652600539216, 195.54188926588216)


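Random oversampling, Gaussian noise and SMOTE produce results very similar to the original data set (i.e. without synthesized data), with no large improvement in R2 or MSE. ADASYN is the exception: its R2 is negative for all three models and its overall MSE is roughly 3-4 times higher, driven by the 37-99°C bin (MSE above 1000), even though its MSE in the two lower bins is the lowest of all techniques.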