In [None]:
# Hyperparameter tuning for Random Forest to find the best parameters for the MC_DB dataset

In [20]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read dataset
dataset = pd.read_csv('LipidAdjusted_sigOnly.csv')
dataset = dataset.select_dtypes([np.number]) # select only the numerical columns to act as x and y values in the model

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,C:M Ratio Master,C:M Ratio Lipid Normalized,C:M Ratio Non-Lipid Normalized,# Total Atoms,# Atoms w/o Salt,FilterItLogS,PEOE_VSA6,EState_VSA9,SMR,...,SIC5,AATS0d,ATS3are,GATS1m,GATS1Z,ATS0s,ATSC3i,ATSC4are,BCUTdv-1h,ETA_epsilon_3
0,0,0.001276,0.004057,0.001276,22.0,22.0,-4.689533,17.288447,15.308119,78.7144,...,0.899242,2.813953,564.999,0.765047,0.769887,305.895833,-30.901072,-3.803187,7.014086,0.433333
1,1,0.012264,0.039,0.012264,20.0,20.0,-3.609705,0.0,4.736863,75.7646,...,0.895175,3.277778,501.272,0.867692,0.88302,215.243913,-3.016147,-2.116486,6.017822,0.437931
2,2,0.019018,0.060477,0.019018,20.0,20.0,-3.562566,31.395199,4.736863,82.3238,...,0.822785,2.702128,714.804,1.060515,1.054556,129.645833,-22.574335,0.55408,6.007457,0.437931
3,3,0.01904,0.060546,0.01904,54.0,54.0,-9.071417,74.511145,9.473726,212.5451,...,0.891501,3.0,1580.0348,0.812148,0.807016,390.423525,-25.848412,-2.334104,6.029305,0.44026
4,4,0.026178,0.083246,0.026178,24.0,24.0,-5.802049,18.199101,9.998755,90.914,...,0.896777,3.266667,654.713,0.783113,0.782584,203.951389,-13.861326,-1.324305,7.004057,0.441176


In [4]:
# drop values that have 0 as the ratio
dataset = dataset[dataset['C:M Ratio Lipid Normalized'] != 0]

In [5]:
dataset['C:M Ratio Lipid Normalized'] = dataset['C:M Ratio Lipid Normalized'].replace(0, 0.01) # replace 0 with 0.01
dataset['logCM'] = np.log10(dataset['C:M Ratio Lipid Normalized'])

In [6]:
dataset.shape

(264, 461)

In [7]:
X = dataset.loc[:, 'FilterItLogS':'mZagreb2'].values # select the x values 
y = dataset.loc[:, 'logCM'].values # select the y values for the model 

In [9]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [16]:
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [25]:
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state = 42)
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.5min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [26]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [34]:
regressor = RandomForestRegressor(n_estimators= 1600,
                                 min_samples_split= 2,
                                 min_samples_leaf= 4,
                                 max_features= 'sqrt',
                                 max_depth= 10,
                                 bootstrap= True)