In [None]:
import re
import numpy as np
import pandas as pd
import scipy.stats as st


# Parse Log Files

Extract the per-epoch metrics and early-stopping epochs from the training log files in order to identify the top-performing architecture.

In [None]:
# Parse the per-architecture training logs into two frames:
#   best_epoch    - one row per 'STOPPING' line: model_number, fold, rep, best_epoch
#   epoch_metrics - one row per logged epoch: AP scores plus train/val time in seconds

N_MODELS = 12  # logs are numbered 1..N_MODELS
LOG_PATH_TEMPLATE = 'logs/architecture_tuning_%d.log'

# Raw strings keep "\d" a regex token instead of an invalid string escape, and
# "\." matches the decimal point literally instead of any character.
# Digit counts are unbounded so long runs (epoch >= 1000, rep >= 10) are not
# silently dropped or truncated.
RUN_HEADER_RE = re.compile(r'Fold (\d+); Rep (\d+)')
EPOCH_LINE_RE = re.compile(r'EPOCH #\d+: TRAIN AP')
EPOCH_METRICS_RE = re.compile(
    r'EPOCH #(\d+): TRAIN AP (\d\.\d+), VAL AP (\d\.\d+), '
    r'TRAIN TIME (\d+)m (\d+\.\d+)s, VAL TIME (\d+)m (\d+\.\d+)s')
STOPPING_EPOCH_RE = re.compile(r'EPOCH #(\d+)')


def _parse_architecture_log(lines, model_number, source='<log>'):
    """Parse one architecture-tuning log into row dictionaries.

    Parameters
    ----------
    lines : iterable of str
        The log, one line per item (an open text file works).
    model_number : int
        Stored in the 'model_number' field of every returned row.
    source : str
        Name used in error messages only.

    Returns
    -------
    (best_rows, epoch_rows) : tuple of list of dict
        best_rows has one dict per line containing 'STOPPING';
        epoch_rows has one dict per 'EPOCH #n: TRAIN AP ...' line.

    Raises
    ------
    ValueError
        If a line looks like a header/epoch/stopping line but does not match
        the expected format, or if an epoch/stopping line appears before any
        'Fold f; Rep r' header in this log.
    """
    best_rows = []
    epoch_rows = []
    # Reset per log so a run can never inherit fold/rep from a previous file.
    fold = rep = None

    for line_no, line in enumerate(lines, start=1):
        where = '%s, line %d' % (source, line_no)

        if 'Rep' in line:
            header = RUN_HEADER_RE.search(line)
            if header is None:
                raise ValueError('Unrecognised fold/rep header (%s): %r' % (where, line))
            fold = int(header.group(1))
            rep = int(header.group(2))

        if EPOCH_LINE_RE.search(line):
            metrics = EPOCH_METRICS_RE.search(line)
            if metrics is None:
                raise ValueError('Unrecognised epoch line (%s): %r' % (where, line))
            if fold is None:
                raise ValueError('Epoch line before any fold/rep header (%s)' % where)
            epoch_rows.append({
                'model_number': model_number,
                'fold': fold,
                'rep': rep,
                'epoch': int(metrics.group(1)),
                'train_ap': float(metrics.group(2)),
                'val_ap': float(metrics.group(3)),
                # "<m>m <s>s" -> seconds
                'train_time': float(metrics.group(4)) * 60 + float(metrics.group(5)),
                'val_time': float(metrics.group(6)) * 60 + float(metrics.group(7)),
            })

        if 'STOPPING' in line:
            stopped = STOPPING_EPOCH_RE.search(line)
            if stopped is None:
                raise ValueError('STOPPING line without an epoch number (%s): %r' % (where, line))
            if fold is None:
                raise ValueError('STOPPING line before any fold/rep header (%s)' % where)
            best_rows.append({
                'model_number': model_number,
                'fold': fold,
                'rep': rep,
                'best_epoch': int(stopped.group(1)),
            })

    return best_rows, epoch_rows


# Collect plain records and build each frame once, instead of repeatedly
# concatenating onto an initially empty DataFrame.
best_epoch_records = []
epoch_metrics_records = []

for model_number in range(1, N_MODELS + 1):
    log_path = LOG_PATH_TEMPLATE % model_number
    with open(log_path, 'r') as f:
        best_rows, epoch_rows = _parse_architecture_log(f, model_number, source=log_path)
    best_epoch_records.extend(best_rows)
    epoch_metrics_records.extend(epoch_rows)

# Explicit column lists keep the schema stable even if nothing was parsed.
best_epoch = pd.DataFrame(best_epoch_records,
                          columns=['model_number', 'fold', 'rep', 'best_epoch'])
epoch_metrics = pd.DataFrame(epoch_metrics_records,
                             columns=['model_number', 'fold', 'rep', 'epoch',
                                      'train_ap', 'val_ap', 'train_time', 'val_time'])


In [None]:
# Mark, for every logged epoch, whether it is the epoch reported on the
# 'STOPPING' line of its (model_number, fold, rep) run: 1 if so, else 0.
best_epoch['best_epoch_flag'] = 1

epoch_metrics = (
    epoch_metrics
    # Re-running this cell would otherwise merge two frames that both carry
    # 'best_epoch_flag', producing '_x'/'_y' suffixed columns and a KeyError below.
    .drop(columns='best_epoch_flag', errors='ignore')
    # drop_duplicates() keeps the left join row-preserving even if a run
    # logged the same stopping epoch more than once.
    .merge(best_epoch.drop_duplicates(), how='left',
           left_on=['model_number', 'fold', 'rep', 'epoch'],
           right_on=['model_number', 'fold', 'rep', 'best_epoch'])
    # 'best_epoch' only served as a join key; the flag carries the information.
    .drop(columns='best_epoch')
)

# Epochs without a matching best-epoch row come out of the left join as NaN.
epoch_metrics['best_epoch_flag'] = epoch_metrics['best_epoch_flag'].fillna(0).astype('int')

## Investigate to find the top performing architecture

Models:

<ol>
    <li>basic 1 LSTM layer, set n_layers = 1</li>
    <li>2 LSTM layers, set n_layers = 2</li>
    <li>2 LSTM layers with 0.5 dropout, set n_layers = 2</li>
    <li>1 LSTM layer, dropout layer, set n_layers = 1</li>
    <li>2 LSTM layers, dropout layer, set n_layers = 2</li>
    <li>2 LSTM layers with 0.5 dropout, dropout layer, set n_layers = 2</li>
    <li>#1 with gradient clipping</li>
    <li>#2 with gradient clipping</li>
    <li>#3 with gradient clipping</li>
    <li>#4 with gradient clipping</li>
    <li>#5 with gradient clipping</li>
    <li>#6 with gradient clipping</li>
</ol>

In [None]:
# From the best epochs only: mean val_ap per model number, with a 95%
# Student-t confidence interval around that mean.
architecture_records = []
for model_number in range(1, 13):
    is_best_for_model = ((epoch_metrics['best_epoch_flag'] == 1) &
                         (epoch_metrics['model_number'] == model_number))
    val_ap_data = epoch_metrics.loc[is_best_for_model, 'val_ap'].values
    mean_val_ap = np.mean(val_ap_data)

    # The confidence level is passed positionally: SciPy renamed the keyword
    # from `alpha` to `confidence` and later removed `alpha`, so the positional
    # form is the one that works on both old and new releases.
    # With fewer than two values the standard error is NaN, so the bounds are NaN.
    ci = st.t.interval(0.95, df=len(val_ap_data) - 1,
                       loc=mean_val_ap, scale=st.sem(val_ap_data))

    architecture_records.append({'model_number': model_number,
                                 'mean_val_ap': mean_val_ap,
                                 'val_ap_low': ci[0],
                                 'val_ap_high': ci[1]})

# Build the frame once (unique 0..n-1 index) rather than concatenating
# single-row frames that would all share index label 0.
architecture_metrics = pd.DataFrame(architecture_records,
                                    columns=['model_number', 'mean_val_ap',
                                             'val_ap_low', 'val_ap_high'])

In [None]:
# Show the architectures ordered from highest to lowest mean validation AP.
architecture_metrics.sort_values(by='mean_val_ap', ascending=False)

In [None]:
# training time & epochs for each number of features, model number, fold, and rep
epoch_metrics['total_time'] = epoch_metrics['train_time'] + epoch_metrics['val_time']

time_data = epoch_metrics[['model_number', 'fold', 'rep', 'total_time']].groupby(['model_number', 'fold', 'rep']).sum().reset_index()
epochs_data = epoch_metrics[['model_number', 'fold', 'rep', 'epoch']].groupby(['model_number', 'fold', 'rep']).max().reset_index()

time_epochs_data = time_data.merge(epochs_data, how='inner', on=['model_number', 'fold', 'rep'])
training_time_metrics = time_epochs_data.groupby(['model_number']).mean().reset_index()

In [None]:
# Show the architectures ordered from shortest to longest mean total time per run.
training_time_metrics.sort_values(by='total_time')

### Decision: Use model 2 (2 LSTM layers)