## Grid search for the best hyperparameters of the Decision Tree model
Especially for the split criterion (`gini` vs. `entropy`).

In [2]:
import os
import json
import pandas as pd
import dt as dt_lib

## Build the data reader used by every run below.
## The input features are the 17 columns named A1 .. A17; 'Type' is the
## output column and k_fold is the fold count used for k-fold validation.
feature_columns = [f'A{idx}' for idx in range(1, 18)]

datareader = dt_lib.DataReader(
    data_dir='../data/',
    train_valid_file='Data_train_validation_2019-1126.xlsx',
    test_file='Data_test_2019-1126.xlsx',
    feature_in=feature_columns,
    output_col='Type',
    k_fold=5,
)

## Candidate split criteria compared by the grid search.
criterion_list = ['gini', 'entropy']

## Start below any reachable accuracy so the first evaluated model is always
## recorded as the current best, even when its validation accuracy is 0.
## (Starting at 0 with a strict '>' would leave best_performance empty in
## that case and make the report further down fail with a KeyError.)
best_acc = float('-inf')
best_hyper_param = {}
best_performance = {}
summary = pd.DataFrame(columns=['criterion', 'acc_train_avg', 'acc_valid_avg', 'acc_test_avg'])
for criterion in criterion_list:

    ## ensure that output dir exists in local
    ## (exist_ok=True avoids the check-then-create race of isdir + makedirs)
    output_dir = f'output/{datareader.k_fold}-fold_{criterion}'
    os.makedirs(output_dir, exist_ok=True)

    ## create dt object
    dt = dt_lib.DT(criterion=criterion)

    ## config of datareader and dt, saved next to the run's results
    config = dt_lib.configuration(datareader, dt)
    with open(output_dir + '/config.json', 'w') as fout:
        json.dump(config, fout, indent=4)

    ## run dt with k-fold validation; the call yields the averaged
    ## train / valid / test accuracies, in that order
    print(output_dir)
    acc_train_avg, acc_valid_avg, acc_test_avg = dt_lib.k_fold_validation(datareader, dt, output_dir)

    ## find the best hyper param of dt model
    ## (selection is driven by validation accuracy only; ties keep the
    ## earlier criterion because the comparison is strict)
    if acc_valid_avg > best_acc:
        best_hyper_param['criterion'] = criterion
        best_performance['acc_train_avg'] = acc_train_avg
        best_performance['acc_valid_avg'] = acc_valid_avg
        best_performance['acc_test_avg'] = acc_test_avg
        best_acc = acc_valid_avg

    ## record all output in summary (one row per criterion)
    summary.loc[len(summary)] = [criterion, acc_train_avg, acc_valid_avg, acc_test_avg]

## Report the best hyper-parameters and the accuracies they achieved.
n_folds = datareader.k_fold

print()
print('best_hyper_param of dt model:')
print(best_hyper_param)
print()
print('best performance of dt model:')
for split in ('train', 'valid', 'test'):
    score = best_performance[f'acc_{split}_avg']
    print(f"average accuracy of {split} set in {n_folds}-fold: {score}")
print()

## Show the per-criterion summary table and archive it as an Excel file.
print('overall summary for all models')
print(summary)
summary_path = f'output/Summary_grid-search_{n_folds}-fold.xlsx'
summary.to_excel(summary_path)

output/5-fold_gini
output/5-fold_entropy

best_hyper_param of dt model:
{'criterion': 'gini'}

best performance of dt model:
average accuracy of train set in 5-fold: 1.0
average accuracy of valid set in 5-fold: 0.9539086844601001
average accuracy of test set in 5-fold: 0.937642585551331

overall summary for all models
  criterion  acc_train_avg  acc_valid_avg  acc_test_avg
0      gini            1.0       0.953909      0.937643
1   entropy            1.0       0.948991      0.930038
