## Grid Search for the best hyperparameters of the Random Forest model
In particular, this searches over `n_estimators` and `criterion`.

In [6]:
import os
import json
import pandas as pd
import rf as rf_lib

## Grid search over (criterion, n_estimators) for the rf model.
## Each combination is evaluated with rf_lib.k_fold_validation; the combination with
## the highest average validation accuracy is kept as the best one, and every
## combination is recorded in `summary`, which is written to an Excel file at the end.
datareader = rf_lib.DataReader(data_dir = '../data/',
                               train_valid_file = 'Data_train_validation_2019-1126.xlsx',
                               test_file = 'Data_test_2019-1126.xlsx',
                               feature_in = ['A1','A2','A3','A4','A5','A6','A7','A8','A9','A10','A11','A12','A13','A14','A15','A16','A17'],
                               output_col = 'Type',
                               k_fold = 5)

## hyperparameter grid to search
criterion_list = ['gini','entropy']
n_estimators_list = [50, 100, 150, 200, 250, 300]

## start below any possible accuracy so that the first evaluated combination is
## always recorded (a start value of 0 would skip runs whose accuracy is exactly 0
## and leave best_performance empty, breaking the prints below)
best_acc = float('-inf')
best_hyper_param = dict()
best_performance = dict()
summary = pd.DataFrame(columns=['criterion', 'n_estimators', 'acc_train_avg', 'acc_valid_avg', 'acc_test_avg'])
for criterion in criterion_list:
    for n_estimators in n_estimators_list:

        ## ensure that output dir exists in local
        ## (exist_ok avoids the check-then-create race of isdir + makedirs)
        output_dir = f'output/{datareader.k_fold}-fold_{criterion}_n-{n_estimators}'
        os.makedirs(output_dir, exist_ok=True)

        ## create rf object
        rf = rf_lib.RF(n_estimators = n_estimators,
                       criterion = criterion)

        ## config of datareader and rf, saved next to the results of this run
        config = rf_lib.configuration(datareader, rf)
        with open(os.path.join(output_dir, 'config.json'), 'w', encoding='utf-8') as fout:
            json.dump(config, fout, indent = 4)

        ## run rf with k-fold validation
        print (output_dir)
        acc_train_avg, acc_valid_avg, acc_test_avg = rf_lib.k_fold_validation(datareader, rf, output_dir)

        ## find the best hyper param of rf model
        ## (selection uses the validation accuracy only; ties keep the earlier combination)
        if acc_valid_avg > best_acc:
            best_hyper_param['criterion'] = criterion
            best_hyper_param['n_estimators'] = n_estimators
            best_performance['acc_train_avg'] = acc_train_avg
            best_performance['acc_valid_avg'] = acc_valid_avg
            best_performance['acc_test_avg'] = acc_test_avg
            best_acc = acc_valid_avg

        ## record all output in summary
        summary.loc[len(summary)] = [criterion, n_estimators, acc_train_avg, acc_valid_avg, acc_test_avg]

## check best performance with best hyperparam.
print ()
print ('best_hyper_param of rf model:')
print (best_hyper_param)
print ()
print ('best performance of rf model:')
print (f"average accuracy of train set in {datareader.k_fold}-fold: {best_performance['acc_train_avg']}")
print (f"average accuracy of valid set in {datareader.k_fold}-fold: {best_performance['acc_valid_avg']}")
print (f"average accuracy of test set in {datareader.k_fold}-fold: {best_performance['acc_test_avg']}")
print ()

## archive summary file to dir
print ('overall summary for all models')
print (summary)
## make sure the top-level output dir exists even if no run created it
os.makedirs('output', exist_ok=True)
summary.to_excel(f'output/Summary_grid-search_{datareader.k_fold}-fold.xlsx')

output/5-fold_gini_n-50
output/5-fold_gini_n-100
output/5-fold_gini_n-150
output/5-fold_gini_n-200
output/5-fold_gini_n-250
output/5-fold_gini_n-300
output/5-fold_entropy_n-50
output/5-fold_entropy_n-100
output/5-fold_entropy_n-150
output/5-fold_entropy_n-200
output/5-fold_entropy_n-250
output/5-fold_entropy_n-300

best_hyper_param of rf model:
{'criterion': 'entropy', 'n_estimators': 150}

best performance of rf model:
average accuracy of train set in 5-fold: 1.0
average accuracy of valid set in 5-fold: 0.9670776317572145
average accuracy of test set in 5-fold: 0.9695817490494296

overall summary for all models
   criterion n_estimators  acc_train_avg  acc_valid_avg  acc_test_avg
0       gini           50            1.0       0.965425      0.964259
1       gini          100            1.0       0.963772      0.968061
2       gini          150            1.0       0.965411      0.965779
3       gini          200            1.0       0.963772      0.966540
4       gini          250     