In [1]:
from model_funcs import gsearch, rsearch
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned dataset from the working directory and preview its first rows.
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,Density,YP,Temperature,ROP,Pipe rotation,Flow rate,Inclination,Eccentricity,Concentration
0,8.314,20,80,30,0,100,90,0.881,0.35
1,8.314,20,80,30,0,200,90,0.881,0.14
2,8.314,20,80,30,80,100,90,0.881,0.06
3,8.314,20,80,30,80,200,90,0.881,0.02
4,8.314,20,180,30,0,100,90,0.881,0.11


In [3]:
# Split the loaded frame into train/test features and targets.
# NOTE(review): data_split is not imported in any visible cell (only gsearch and rsearch
# are imported from model_funcs), so this raises NameError on a fresh kernel —
# presumably it belongs in the model_funcs import; confirm where it is defined.
X_train, X_test, y_train, y_test = data_split(data)

## Parameter Tuning

In [4]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

### Random Forest

In [7]:
# Random-search space for the forest: log-spaced integer grids for tree count and
# depth, plus both bootstrap settings.
forest_rparams = dict(
    n_estimators=np.logspace(1, 4, 50).astype(int),
    max_depth=np.logspace(1, 3, 20).astype(int),
    bootstrap=[True, False],
)

In [8]:
# Randomized search (iter=200) over forest_rparams on the training split.
# 'squared_error' replaces the old 'mse' alias, which scikit-learn deprecated in 1.0
# and removed in 1.2; it is the same criterion the grid-search cells below already use.
forest_rsearch = rsearch(
    iter=200,
    est=RandomForestRegressor(criterion='squared_error', random_state=42),
    params=forest_rparams,
    x=X_train,
    y=y_train,
)

  warn(


In [19]:
# Ten best random-search candidates by mean CV score, restricted to columns 4-7.
(
    pd.DataFrame(forest_rsearch.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:8]
)

Unnamed: 0,param_n_estimators,param_max_depth,param_bootstrap,params
197,71,12,True,"{'n_estimators': 71, 'max_depth': 12, 'bootstr..."
199,47,26,True,"{'n_estimators': 47, 'max_depth': 26, 'bootstr..."
104,35,615,True,"{'n_estimators': 35, 'max_depth': 615, 'bootst..."
83,26,183,True,"{'n_estimators': 26, 'max_depth': 183, 'bootst..."
191,62,10,True,"{'n_estimators': 62, 'max_depth': 10, 'bootstr..."
189,62,33,True,"{'n_estimators': 62, 'max_depth': 33, 'bootstr..."
14,910,12,True,"{'n_estimators': 910, 'max_depth': 12, 'bootst..."
24,517,483,True,"{'n_estimators': 517, 'max_depth': 483, 'boots..."
153,686,10,True,"{'n_estimators': 686, 'max_depth': 10, 'bootst..."
176,109,42,True,"{'n_estimators': 109, 'max_depth': 42, 'bootst..."


In [13]:
# Show the best forest selected by the random search.
forest_rsearch.best_estimator_

RandomForestRegressor(criterion='mse', max_depth=12, n_estimators=71)

In [17]:
# First grid for the forest: depth in steps of 5, tree count in steps of 2.
forest_gparams = dict(
    max_depth=np.arange(20, 200, 5),
    n_estimators=np.arange(5, 70, 2),
)

In [21]:
# Grid search over forest_gparams with a seeded, bootstrapped forest.
forest_gsearch = gsearch(
    est=RandomForestRegressor(criterion='squared_error', bootstrap=True, random_state=42),
    params=forest_gparams,
    x=X_train,
    y=y_train,
)

In [25]:
# Ten best candidates of the first forest grid by mean CV score (columns 4-7 only).
(
    pd.DataFrame(forest_gsearch.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:8]
)

Unnamed: 0,param_max_depth,param_n_estimators,params,split0_test_score
1152,190,65,"{'max_depth': 190, 'n_estimators': 65}",-0.041979
30,20,65,"{'max_depth': 20, 'n_estimators': 65}",-0.041979
921,155,65,"{'max_depth': 155, 'n_estimators': 65}",-0.041979
888,150,65,"{'max_depth': 150, 'n_estimators': 65}",-0.041979
855,145,65,"{'max_depth': 145, 'n_estimators': 65}",-0.041979
822,140,65,"{'max_depth': 140, 'n_estimators': 65}",-0.041979
63,25,65,"{'max_depth': 25, 'n_estimators': 65}",-0.041979
1185,195,65,"{'max_depth': 195, 'n_estimators': 65}",-0.041979
789,135,65,"{'max_depth': 135, 'n_estimators': 65}",-0.041979
756,130,65,"{'max_depth': 130, 'n_estimators': 65}",-0.041979


In [26]:
# Second, unit-step grid for the forest: depth 150-199 and 35-69 trees.
forest_gparams2 = dict(
    max_depth=np.arange(150, 200),
    n_estimators=np.arange(35, 70),
)

In [27]:
# Grid search over forest_gparams2 with the same seeded, bootstrapped forest settings.
forest_gsearch2 = gsearch(
    est=RandomForestRegressor(criterion='squared_error', random_state=42, bootstrap=True),
    params=forest_gparams2,
    x=X_train,
    y=y_train,
)

In [141]:
# Ten best candidates of the second forest grid by mean CV score (columns 4-7 only).
(
    pd.DataFrame(forest_gsearch2.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:8]
)

Unnamed: 0,param_max_depth,param_n_estimators,params,split0_test_score
24,150,59,"{'max_depth': 150, 'n_estimators': 59}",-0.039787
70,152,35,"{'max_depth': 152, 'n_estimators': 35}",-0.038164
681,169,51,"{'max_depth': 169, 'n_estimators': 51}",-0.036818
1564,194,59,"{'max_depth': 194, 'n_estimators': 59}",-0.037362
1689,198,44,"{'max_depth': 198, 'n_estimators': 44}",-0.038176
1186,183,66,"{'max_depth': 183, 'n_estimators': 66}",-0.039807
1664,197,54,"{'max_depth': 197, 'n_estimators': 54}",-0.043121
1590,195,50,"{'max_depth': 195, 'n_estimators': 50}",-0.037107
1379,189,49,"{'max_depth': 189, 'n_estimators': 49}",-0.036948
215,156,40,"{'max_depth': 156, 'n_estimators': 40}",-0.043675


In [32]:
# Third forest grid: same depth range, tree count narrowed to 49-67.
forest_gparams3 = dict(
    max_depth=np.arange(150, 200),
    n_estimators=np.arange(49, 68),
)

In [37]:
# Final forest grid search, over forest_gparams3.
forest_gsearch3 = gsearch(
    est=RandomForestRegressor(criterion='squared_error', random_state=42, bootstrap=True),
    params=forest_gparams3,
    x=X_train,
    y=y_train,
)

In [135]:
# Ten best candidates of the third forest grid by mean CV score (columns 4-7 only).
(
    pd.DataFrame(forest_gsearch3.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:8]
)

Unnamed: 0,param_max_depth,param_n_estimators,params,split0_test_score
492,175,66,"{'max_depth': 175, 'n_estimators': 66}",-0.041804
302,165,66,"{'max_depth': 165, 'n_estimators': 66}",-0.041804
549,178,66,"{'max_depth': 178, 'n_estimators': 66}",-0.041804
815,192,66,"{'max_depth': 192, 'n_estimators': 66}",-0.041804
625,182,66,"{'max_depth': 182, 'n_estimators': 66}",-0.041804
321,166,66,"{'max_depth': 166, 'n_estimators': 66}",-0.041804
207,160,66,"{'max_depth': 160, 'n_estimators': 66}",-0.041804
929,198,66,"{'max_depth': 198, 'n_estimators': 66}",-0.041804
416,171,66,"{'max_depth': 171, 'n_estimators': 66}",-0.041804
36,151,66,"{'max_depth': 151, 'n_estimators': 66}",-0.041804


In [136]:
# Show the forest selected by the third grid search; these values are the ones
# copied into the random_forest dict in the "Saving the Results" section.
forest_gsearch3.best_estimator_

RandomForestRegressor(max_depth=150, n_estimators=66, random_state=42)

### Gradient Boosting

In [39]:
# Random-search space for gradient boosting.
# 'absolute_error' replaces the old 'lad' alias (deprecated in scikit-learn 1.0,
# removed in 1.2); the list already used the 1.0+ name 'squared_error', so mixing in
# 'lad' made every candidate that sampled it invalid on current releases.
gb_rparams = {
    'loss': ['squared_error', 'absolute_error', 'huber'],
    'learning_rate': np.logspace(-3, 0, 10),
    'n_estimators': np.arange(50, 200, 5),
    'subsample': [.6, .7, .8, .9, 1],
    'max_depth': np.arange(2, 11),
}

In [None]:
# Seeded booster; n_iter_no_change together with validation_fraction switches on
# scikit-learn's early stopping against a 15% hold-out of the training data.
gb_reg = GradientBoostingRegressor(
    random_state=42,
    n_iter_no_change=5,
    validation_fraction=.15,
)
# Randomized search (iter=4050) over gb_rparams.
gb_rsearch = rsearch(
    iter=4050,
    est=gb_reg,
    params=gb_rparams,
    x=X_train,
    y=y_train,
)

In [53]:
# Ten best gradient-boosting random-search candidates by mean CV score (columns 4-9).
(
    pd.DataFrame(gb_rsearch.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:10]
)

Unnamed: 0,param_subsample,param_n_estimators,param_max_depth,param_loss,param_learning_rate,params
1632,0.9,155,2,ls,0.464159,"{'subsample': 0.9, 'n_estimators': 155, 'max_d..."
2402,0.9,130,2,ls,0.464159,"{'subsample': 0.9, 'n_estimators': 130, 'max_d..."
3021,0.9,185,2,ls,0.464159,"{'subsample': 0.9, 'n_estimators': 185, 'max_d..."
605,0.9,60,2,ls,0.464159,"{'subsample': 0.9, 'n_estimators': 60, 'max_de..."
3433,0.9,150,2,ls,0.464159,"{'subsample': 0.9, 'n_estimators': 150, 'max_d..."
182,1.0,180,2,huber,0.464159,"{'subsample': 1, 'n_estimators': 180, 'max_dep..."
1865,1.0,75,2,huber,0.464159,"{'subsample': 1, 'n_estimators': 75, 'max_dept..."
3864,0.9,155,2,ls,0.215443,"{'subsample': 0.9, 'n_estimators': 155, 'max_d..."
3892,0.9,85,2,ls,0.215443,"{'subsample': 0.9, 'n_estimators': 85, 'max_de..."
1319,0.9,160,2,ls,0.215443,"{'subsample': 0.9, 'n_estimators': 160, 'max_d..."


In [86]:
# Show the best booster selected by the random search.
gb_rsearch.best_estimator_

GradientBoostingRegressor(learning_rate=0.46415888336127775, loss='ls',
                          max_depth=2, n_estimators=60, n_iter_no_change=5,
                          random_state=42, subsample=0.9,
                          validation_fraction=0.15)

In [54]:
# First gradient-boosting grid.
gb_gparams = dict(
    learning_rate=np.logspace(-1.5, 0, 10),
    n_estimators=np.arange(60, 160, 5),
    subsample=[.75, .8, .85, .9, .95],
    max_depth=np.arange(2, 9),
)

In [55]:
# Booster for the grid searches: same early-stopping setup as gb_reg, with the loss
# fixed to squared error.
gb_reg2 = GradientBoostingRegressor(
    random_state=42,
    n_iter_no_change=5,
    validation_fraction=.15,
    loss='squared_error',
)
gb_gsearch = gsearch(
    est=gb_reg2,
    params=gb_gparams,
    x=X_train,
    y=y_train,
)

In [94]:
# Ten best candidates of the first gradient-boosting grid by mean CV score (columns 4-8).
(
    pd.DataFrame(gb_gsearch.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:9]
)

Unnamed: 0,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params
4903,0.464159,2,60,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4923,0.464159,2,80,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4948,0.464159,2,105,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4913,0.464159,2,70,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4933,0.464159,2,90,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4983,0.464159,2,140,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4973,0.464159,2,130,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4938,0.464159,2,95,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4968,0.464159,2,125,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."
4963,0.464159,2,120,0.9,"{'learning_rate': 0.46415888336127775, 'max_de..."


In [61]:
# Second gradient-boosting grid: higher learning rates, finer subsample steps,
# depth limited to 2-4.
gb_gparams2 = dict(
    learning_rate=np.logspace(-1, 0, 10),
    n_estimators=np.arange(60, 160, 5),
    subsample=[.84, .86, .88, .9, .92, .94, .96, .98],
    max_depth=np.arange(2, 5),
)

In [62]:
# Grid search over gb_gparams2, reusing the gb_reg2 booster.
gb_gsearch2 = gsearch(
    est=gb_reg2,
    params=gb_gparams2,
    x=X_train,
    y=y_train,
)

In [93]:
# Ten best candidates of the second gradient-boosting grid by mean CV score (columns 4-8).
(
    pd.DataFrame(gb_gsearch2.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:9]
)

Unnamed: 0,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params
2485,0.359381,2,110,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2549,0.359381,2,150,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2469,0.359381,2,100,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2493,0.359381,2,115,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2509,0.359381,2,125,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2429,0.359381,2,75,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2557,0.359381,2,155,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2437,0.359381,2,80,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2461,0.359381,2,95,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."
2517,0.359381,2,130,0.94,"{'learning_rate': 0.35938136638046275, 'max_de..."


In [75]:
# Third gradient-boosting grid: 20 log-spaced learning rates, tree count in steps
# of 2, subsample in steps of 0.01 from 0.9, depth 2 or 3.
gb_gparams3 = dict(
    learning_rate=np.logspace(-0.8, -0.2, 20),
    n_estimators=np.arange(60, 160, 2),
    subsample=np.arange(0.9, 1, 0.01),
    max_depth=[2, 3],
)

In [76]:
# Grid search over gb_gparams3, reusing the gb_reg2 booster.
gb_gsearch3 = gsearch(
    est=gb_reg2,
    params=gb_gparams3,
    x=X_train,
    y=y_train,
)

In [100]:
# Ten best candidates of the third gradient-boosting grid by mean CV score (columns 4-8).
(
    pd.DataFrame(gb_gsearch3.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:9]
)

Unnamed: 0,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params
11360,0.35267,2,132,0.9,"{'learning_rate': 0.352669921417466, 'max_dept..."
11130,0.35267,2,86,0.9,"{'learning_rate': 0.352669921417466, 'max_dept..."
11140,0.35267,2,88,0.9,"{'learning_rate': 0.352669921417466, 'max_dept..."
11141,0.35267,2,88,0.91,"{'learning_rate': 0.352669921417466, 'max_dept..."
11150,0.35267,2,90,0.9,"{'learning_rate': 0.352669921417466, 'max_dept..."
11151,0.35267,2,90,0.91,"{'learning_rate': 0.352669921417466, 'max_dept..."
11160,0.35267,2,92,0.9,"{'learning_rate': 0.352669921417466, 'max_dept..."
11161,0.35267,2,92,0.91,"{'learning_rate': 0.352669921417466, 'max_dept..."
11170,0.35267,2,94,0.9,"{'learning_rate': 0.352669921417466, 'max_dept..."
11171,0.35267,2,94,0.91,"{'learning_rate': 0.352669921417466, 'max_dept..."


In [90]:
# Show the booster selected by the third grid search.
gb_gsearch3.best_estimator_

GradientBoostingRegressor(learning_rate=0.352669921417466, max_depth=2,
                          n_estimators=60, n_iter_no_change=5, random_state=42,
                          subsample=0.9, validation_fraction=0.15)

In [109]:
# Final gradient-boosting grid: linear learning-rate steps of 0.005 from 0.3,
# every tree count from 60 to 139, and two subsample values.
# NOTE(review): np.arange with a float step may or may not include a value at the
# 0.4 endpoint because of rounding — confirm the intended number of learning rates.
gb_gparams4 = dict(
    learning_rate=np.arange(0.3, 0.4, 0.005),
    n_estimators=np.arange(60, 140, 1),
    subsample=[0.92, 0.93],
)

In [110]:
# Booster with the depth pinned to 2, so the last grid only varies learning rate,
# tree count and subsample.
gb_reg3 = GradientBoostingRegressor(
    random_state=42,
    n_iter_no_change=5,
    validation_fraction=.15,
    loss='squared_error',
    max_depth=2,
)
gb_gsearch4 = gsearch(
    est=gb_reg3,
    params=gb_gparams4,
    x=X_train,
    y=y_train,
)

In [114]:
# Ten best candidates of the final gradient-boosting grid by mean CV score (columns 4-8).
(
    pd.DataFrame(gb_gsearch4.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:9]
)

Unnamed: 0,param_learning_rate,param_n_estimators,param_subsample,params,split0_test_score
2401,0.375,60,0.93,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2400,0.375,60,0.92,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2502,0.375,111,0.92,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2503,0.375,111,0.93,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2504,0.375,112,0.92,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2505,0.375,112,0.93,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2506,0.375,113,0.92,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2507,0.375,113,0.93,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2508,0.375,114,0.92,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029
2509,0.375,114,0.93,"{'learning_rate': 0.37500000000000006, 'n_esti...",-0.04029


In [112]:
# Show the booster selected by the final grid search; these values are the ones
# copied into the gradient_boost dict in the "Saving the Results" section.
gb_gsearch4.best_estimator_

GradientBoostingRegressor(learning_rate=0.37500000000000006, max_depth=2,
                          n_estimators=60, n_iter_no_change=5, random_state=42,
                          subsample=0.92, validation_fraction=0.15)

### Adaptive Boosting (AdaBoost)

In [115]:
# Random-search space for AdaBoost, including three candidate base learners
# (a gradient booster, a ridge regressor and a depth-7 decision tree).
adb_rparams = dict(
    learning_rate=np.logspace(-3, 1, 12),
    loss=['linear', 'square', 'exponential'],
    n_estimators=np.arange(5, 200, 5),
    base_estimator=[
        GradientBoostingRegressor(random_state=42),
        Ridge(alpha=0.01, random_state=6, solver='svd'),
        DecisionTreeRegressor(max_depth=7),
    ],
)

In [116]:
# Randomized search (iter=750) over adb_rparams with a seeded AdaBoost regressor.
adb_rsearch = rsearch(
    iter=750,
    est=AdaBoostRegressor(random_state=42),
    params=adb_rparams,
    x=X_train,
    y=y_train,
)

In [127]:
# Ten best AdaBoost random-search candidates by mean CV score (columns 4-8).
(
    pd.DataFrame(adb_rsearch.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:9]
)

Unnamed: 0,param_n_estimators,param_loss,param_learning_rate,param_base_estimator,params
688,60,exponential,0.001,GradientBoostingRegressor(random_state=42),"{'n_estimators': 60, 'loss': 'exponential', 'l..."
420,125,exponential,0.001,GradientBoostingRegressor(random_state=42),"{'n_estimators': 125, 'loss': 'exponential', '..."
575,120,exponential,0.00231,GradientBoostingRegressor(random_state=42),"{'n_estimators': 120, 'loss': 'exponential', '..."
298,135,exponential,0.00231,GradientBoostingRegressor(random_state=42),"{'n_estimators': 135, 'loss': 'exponential', '..."
244,130,exponential,0.00231,GradientBoostingRegressor(random_state=42),"{'n_estimators': 130, 'loss': 'exponential', '..."
686,165,exponential,0.001,GradientBoostingRegressor(random_state=42),"{'n_estimators': 165, 'loss': 'exponential', '..."
230,40,exponential,0.00231,GradientBoostingRegressor(random_state=42),"{'n_estimators': 40, 'loss': 'exponential', 'l..."
330,160,exponential,0.001,GradientBoostingRegressor(random_state=42),"{'n_estimators': 160, 'loss': 'exponential', '..."
592,100,exponential,0.00231,GradientBoostingRegressor(random_state=42),"{'n_estimators': 100, 'loss': 'exponential', '..."
3,190,exponential,0.00231,GradientBoostingRegressor(random_state=42),"{'n_estimators': 190, 'loss': 'exponential', '..."


In [118]:
# Show the best AdaBoost configuration selected by the random search.
adb_rsearch.best_estimator_

AdaBoostRegressor(base_estimator=GradientBoostingRegressor(random_state=42),
                  learning_rate=0.001, loss='exponential', n_estimators=60,
                  random_state=42)

In [130]:
# AdaBoost grid: learning rates log-spaced between 1e-4 and 1e-2, estimator count
# in steps of 5.
adb_gparams = dict(
    learning_rate=np.logspace(-4, -2, 10),
    n_estimators=np.arange(50, 160, 5),
)
# NOTE(review): `gparams` is a second name for the same dict and is not used by any
# visible cell — it looks like a leftover; confirm before removing.
gparams = adb_gparams

In [131]:
# Grid search over adb_gparams with the loss and base learner fixed to the
# random-search winners (exponential loss, gradient-boosting base estimator).
adb_gsearch = gsearch(
    est=AdaBoostRegressor(
        loss='exponential',
        base_estimator=GradientBoostingRegressor(random_state=42),
        random_state=42,
    ),
    params=adb_gparams,
    x=X_train,
    y=y_train,
)

In [134]:
# Ten best AdaBoost grid candidates by mean CV score (columns 4-6).
(
    pd.DataFrame(adb_gsearch.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
    .iloc[:, 4:7]
)

Unnamed: 0,param_learning_rate,param_n_estimators,params
79,0.000464,115,"{'learning_rate': 0.00046415888336127773, 'n_e..."
78,0.000464,110,"{'learning_rate': 0.00046415888336127773, 'n_e..."
84,0.000464,140,"{'learning_rate': 0.00046415888336127773, 'n_e..."
60,0.000278,130,"{'learning_rate': 0.0002782559402207126, 'n_es..."
15,0.0001,125,"{'learning_rate': 0.0001, 'n_estimators': 125}"
81,0.000464,125,"{'learning_rate': 0.00046415888336127773, 'n_e..."
37,0.000167,125,"{'learning_rate': 0.0001668100537200059, 'n_es..."
13,0.0001,115,"{'learning_rate': 0.0001, 'n_estimators': 115}"
45,0.000278,55,"{'learning_rate': 0.0002782559402207126, 'n_es..."
82,0.000464,130,"{'learning_rate': 0.00046415888336127773, 'n_e..."


In [129]:
# Show the AdaBoost configuration selected by the grid search.
# NOTE(review): the recorded output below reports learning_rate=0.001, which is not a
# member of adb_gparams['learning_rate'] (np.logspace(-4, -2, 10)), and this cell's
# execution count (129) precedes the cells that define and run the grid (130, 131).
# The output appears stale — re-run this cell after the grid search and re-check.
adb_gsearch.best_estimator_

AdaBoostRegressor(base_estimator=GradientBoostingRegressor(random_state=42),
                  learning_rate=0.001, loss='exponential', n_estimators=60,
                  random_state=42)

## Saving the Results

In [132]:
import json

In [138]:
# Hand-copied winners of each search, kept as plain JSON-serialisable values.
random_forest = dict(max_depth=150, n_estimators=66, random_state=42)

gradient_boost = dict(
    learning_rate=0.37500000000000006,
    max_depth=2,
    n_estimators=60,
    n_iter_no_change=5,
    random_state=42,
    subsample=0.92,
    validation_fraction=0.15,
)

# The base estimator is stored as its repr string because an estimator object
# cannot be written by json.dump.
# NOTE(review): learning_rate=0.001 is not in the adb_gparams grid
# (np.logspace(-4, -2, 10)), so these AdaBoost values seem to come from the random
# search (or a stale output) rather than the final grid search — confirm which
# result should be saved.
adaboost = dict(
    base_estimator='GradientBoostingRegressor(random_state=42)',
    learning_rate=0.001,
    loss='exponential',
    n_estimators=60,
    random_state=42,
)

hyperparams = dict(
    random_forest=random_forest,
    gradient_boost=gradient_boost,
    adaboost=adaboost,
)

In [140]:
# Write the collected settings to tuned_hyperparams.json in the working directory.
with open('tuned_hyperparams.json', mode='w') as file:
    file.write(json.dumps(hyperparams))