In [None]:
import re
import numpy as np
import pandas as pd
import scipy.stats as st

# Parse Log Files

Extract per-epoch training and validation metrics from the hyperparameter-tuning log files to identify the best batch size and learning rate.


In [None]:
def parse_hyperparameter_log_file(log_number, log_dir='logs'):
    """Parse one hyperparameter-tuning log into a per-epoch metrics table.

    Reads ``<log_dir>/hyperparameter_tuning_<log_number>.log`` line by line.
    ``BATCH SIZE: <n>``, ``LR: <x>`` and ``Fold <f>; Rep <r>`` lines update the
    current run context. Every ``EPOCH #<n>: TRAIN AP ...`` line becomes one
    output row tagged with that context. Every line containing ``STOPPING``
    together with ``EPOCH #<n>`` marks epoch ``n`` of the current run as its
    best epoch. Lines matching none of these patterns are ignored.

    Parameters
    ----------
    log_number : str or int
        Suffix of the log file name.
    log_dir : str, optional
        Directory that holds the log files (default ``'logs'``).

    Returns
    -------
    pandas.DataFrame
        One row per parsed epoch with columns ``batch_size``, ``lr``, ``fold``,
        ``rep``, ``epoch``, ``train_ap``, ``val_ap``, ``train_time`` and
        ``val_time`` (seconds), ``best_epoch_flag`` (1 for the epoch named by a
        STOPPING line of the same run, otherwise 0) and ``total_time``
        (``train_time + val_time``).

    Raises
    ------
    ValueError
        If an epoch line does not have the expected layout, or if metrics
        appear before batch size, learning rate and fold/rep were all seen.
    """
    # Raw strings avoid invalid-escape warnings; dots are escaped so they only
    # match a literal decimal point.
    batch_size_re = re.compile(r'BATCH SIZE: (\d+)')
    # Accepts fixed ("0.001") as well as scientific ("1e-05") notation.
    lr_re = re.compile(r'LR: (\d*\.?\d+(?:[eE][-+]?\d+)?)')
    fold_rep_re = re.compile(r'Fold (\d+); Rep (\d+)')
    epoch_prefix_re = re.compile(r'EPOCH #\d+: TRAIN AP')
    # No upper bound on epoch digits: the trigger and the extraction must
    # agree, otherwise long runs are silently dropped.
    epoch_re = re.compile(
        r'EPOCH #(\d+): TRAIN AP (\d\.\d+), VAL AP (\d\.\d+), '
        r'TRAIN TIME (\d+)m (\d+\.\d+)s, VAL TIME (\d+)m (\d+\.\d+)s')
    best_epoch_re = re.compile(r'EPOCH #(\d+)')

    log_path = '%s/hyperparameter_tuning_%s.log' % (log_dir, log_number)
    key_cols = ['batch_size', 'lr', 'fold', 'rep']

    metric_rows = []
    best_rows = []
    batch_size = lr = fold = rep = None

    def current_run(line):
        """Return the (batch_size, lr, fold, rep) context or fail if incomplete."""
        run = (batch_size, lr, fold, rep)
        if None in run:
            raise ValueError(
                '%s: metrics found before BATCH SIZE / LR / Fold; Rep headers: %r'
                % (log_path, line))
        return run

    with open(log_path, 'r') as f:
        for line in f:
            # The checks are independent on purpose: one line may carry
            # several pieces of information.
            match = batch_size_re.search(line)
            if match:
                batch_size = int(match.group(1))

            match = lr_re.search(line)
            if match:
                lr = float(match.group(1))

            match = fold_rep_re.search(line)
            if match:
                fold = int(match.group(1))
                rep = int(match.group(2))

            if epoch_prefix_re.search(line):
                match = epoch_re.search(line)
                if match is None:
                    raise ValueError('%s: malformed epoch line: %r' % (log_path, line))
                metric_rows.append(current_run(line) + (
                    int(match.group(1)),
                    float(match.group(2)),
                    float(match.group(3)),
                    float(match.group(4)) * 60 + float(match.group(5)),
                    float(match.group(6)) * 60 + float(match.group(7)),
                ))

            if 'STOPPING' in line:
                match = best_epoch_re.search(line)
                if match:
                    best_rows.append(current_run(line) + (int(match.group(1)),))

    best_epoch = pd.DataFrame(best_rows, columns=key_cols + ['best_epoch'])
    epoch_metrics = pd.DataFrame(
        metric_rows,
        columns=key_cols + ['epoch', 'train_ap', 'val_ap', 'train_time', 'val_time'])

    # Left-join the best-epoch markers onto the per-epoch rows; rows without a
    # marker get NaN, which becomes 0 below.
    best_epoch['best_epoch_flag'] = 1
    epoch_metrics = epoch_metrics.merge(best_epoch, how='left',
                                        left_on=key_cols + ['epoch'],
                                        right_on=key_cols + ['best_epoch'])

    epoch_metrics = epoch_metrics.drop(columns='best_epoch')
    epoch_metrics['best_epoch_flag'] = epoch_metrics['best_epoch_flag'].fillna(0).astype('int')
    epoch_metrics['total_time'] = epoch_metrics['train_time'] + epoch_metrics['val_time']

    return epoch_metrics


In [None]:
# Parse the seven tuning logs in order; the per-log frames keep their own
# names so they stay available for inspection.
(epoch_metrics_1, epoch_metrics_2, epoch_metrics_3, epoch_metrics_4,
 epoch_metrics_5, epoch_metrics_6, epoch_metrics_7) = (
    parse_hyperparameter_log_file(log_number=str(log_id)) for log_id in range(1, 8)
)

# Stack all logs into a single frame with a fresh 0..n-1 index.
epoch_metrics = pd.concat(
    [epoch_metrics_1, epoch_metrics_2, epoch_metrics_3, epoch_metrics_4,
     epoch_metrics_5, epoch_metrics_6, epoch_metrics_7],
    ignore_index=True,
)

In [None]:
# Show only the rows flagged as a run's best epoch, highest validation AP first.
is_best_epoch = epoch_metrics['best_epoch_flag'].eq(1)
epoch_metrics[is_best_epoch].sort_values(by='val_ap', ascending=False)

## Investigate to find the top hyperparameters


In [None]:
def create_summary_metrics(epoch_metrics,
                           batch_sizes=(128, 256, 512),
                           lrs=(0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01),
                           confidence=0.95):
    """Summarize best-epoch validation AP per (batch size, learning rate).

    For every combination in ``batch_sizes`` x ``lrs`` the rows of
    ``epoch_metrics`` with ``best_epoch_flag == 1`` are selected, and the mean
    of ``val_ap`` plus a two-sided Student-t confidence interval for that mean
    are computed.

    Parameters
    ----------
    epoch_metrics : pandas.DataFrame
        Must contain the columns ``batch_size``, ``lr``, ``val_ap`` and
        ``best_epoch_flag``.
    batch_sizes : iterable of int, optional
        Batch sizes to report (defaults to the grid 128 / 256 / 512).
    lrs : iterable of float, optional
        Learning rates to report (defaults to the seven-value grid from 1e-5
        to 1e-2).
    confidence : float, optional
        Confidence level of the interval (default 0.95).

    Returns
    -------
    pandas.DataFrame
        One row per combination, in grid order and with a unique 0..n-1 index,
        with columns ``batch_size``, ``lr``, ``mean_val_ap``, ``val_ap_low``
        and ``val_ap_high``. Combinations without data have NaN in all three
        statistics; combinations with a single observation have NaN bounds
        because the standard error is undefined.
    """
    best_rows = epoch_metrics.loc[epoch_metrics['best_epoch_flag'] == 1]

    records = []
    for batch_size in batch_sizes:
        for lr in lrs:
            in_group = (best_rows['batch_size'] == batch_size) & (best_rows['lr'] == lr)
            val_ap_data = best_rows.loc[in_group, 'val_ap'].to_numpy(dtype=float)
            n_obs = len(val_ap_data)

            mean_val_ap = np.mean(val_ap_data) if n_obs else np.nan
            if n_obs > 1:
                # The confidence level is passed positionally: its keyword was
                # renamed from `alpha` to `confidence` across SciPy releases.
                ci_low, ci_high = st.t.interval(confidence, df=n_obs - 1,
                                                loc=mean_val_ap,
                                                scale=st.sem(val_ap_data))
            else:
                ci_low, ci_high = np.nan, np.nan

            records.append({'batch_size': batch_size,
                            'lr': lr,
                            'mean_val_ap': mean_val_ap,
                            'val_ap_low': ci_low,
                            'val_ap_high': ci_high})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=['batch_size', 'lr', 'mean_val_ap',
                                          'val_ap_low', 'val_ap_high'])

In [None]:
# Aggregate best-epoch validation AP for each hyperparameter combination and
# rank the combinations from best to worst mean.
summary_metrics = create_summary_metrics(epoch_metrics)
summary_metrics.sort_values(by='mean_val_ap', ascending=False)

### Best hyperparameters: BATCH SIZE = 256; LEARNING RATE = 0.005