# Getting best results per-shard using CANDLE HPO

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import sys
from pathlib import Path
from glob import glob

import sklearn
import numpy as np
import pandas as pd
from glob import glob

from keras.models import load_model

import matplotlib
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Root directory of the HPO experiment (expected to contain run/id_*/output/...).
# Set the LC_HPO_BASEDIR environment variable to point the notebook at another
# location; when it is unset the original hard-coded path is used.
basedir = Path(os.environ.get(
    'LC_HPO_BASEDIR',
    '/vol/ml/apartin/projects/LearningCurves/from_harry/lc_hpo_e700'))

In [3]:
# --------------------------
#     UPF file
# --------------------------
# This file lists all the HPO runs (TODO: why is the id number missing?)
# upf = pd.read_csv(basedir/'upf-lc-hpo.txt', header=None)
# upf.columns = ['id', 'epochs', 'dirpath', 'cv_folds', 'shards_arr', 'model_name', 'opt', 'lr', 'dr_rate', 'batch_size']

# Update df values
# upf['id'] = upf['id'].map(lambda x: x.split('"id": ')[-1].replace('"', ''))
# upf['epochs'] = upf['epochs'].map(lambda x: int(x.split('"epochs": ')[-1]))
# upf['dirpath'] = upf['dirpath'].map(lambda x: x.split('"dirpath": ')[-1].replace('"', ''))
# upf['cv_folds'] = upf['cv_folds'].map(lambda x: int(x.split('"cv_folds": ')[-1]))
# upf['shards_arr'] = upf['shards_arr'].map(lambda x: int(x.split('"shards_arr": ')[-1].replace('[', '').replace(']', '')) )
# upf['model_name'] = upf['model_name'].map(lambda x: x.split('"model_name": ')[-1].replace('"', ''))
# upf['opt'] = upf['opt'].map(lambda x: x.split('"opt": ')[-1].replace('"', ''))
# upf['lr'] = upf['lr'].map(lambda x: float(x.split('"lr": ')[-1]))
# upf['dr_rate'] = upf['dr_rate'].map(lambda x: float(x.split('"dr_rate": ')[-1]))
# upf['batch_size'] = upf['batch_size'].map(lambda x: x.split('"batch_size": ')[-1].rstrip('\}'))

# upf[:5]

In [4]:
# Aggregate results from all HPO runs into a dataframe.
# For every <basedir>/run/id_*/output/<dir> this reads:
#   - args.txt              one "key: value" pair per line (values kept as strings)
#   - lrn_crv_scores.csv    test-set scores (absent if the run timed out)
#   - cv*_sz<tr_size>/krs_history.csv   used to find the last epoch that was run

def _parse_shard_size(value):
    """Convert a bracketed single-value string such as '[65536]' to an int."""
    return int(value.replace('[', '').replace(']', ''))


def _read_args(path):
    """Read a file of 'key: value' lines into a dict of strings."""
    parsed = {}
    with open(path, 'r') as f:
        for line in f:
            # Split on the first ': ' only, so values that contain ': ' stay intact.
            key, sep, val = line.rstrip('\n').partition(': ')
            if sep:  # lines without a 'key: value' separator are ignored
                parsed[key] = val
    return parsed


all_runs = sorted(glob(str(basedir/'run'/'id_*')))
print('Total runs {}'.format(len(all_runs)))

records = []
for run_dir in all_runs:
    # Sorted so the choice is deterministic if a run has several output dirs.
    out_dirs = sorted(glob(str(Path(run_dir)/'output'/'*')))
    if not out_dirs:
        print('Skipping {} (no output directory found)'.format(run_dir))
        continue
    out_dir = Path(out_dirs[0])

    # Load args into dict
    args = _read_args(out_dir/'args.txt')
    tr_size = _parse_shard_size(args['shards_arr'])

    # Start from an empty result for every run, so a run without scores never
    # inherits the scores of the previously processed run.
    res = {}

    # Add scores
    scores_path = out_dir/'lrn_crv_scores.csv'
    if scores_path.exists():
        # lrn_crv_scores.csv doesn't exist if it's timed out
        scr = pd.read_csv(scores_path)
        # Rows for this training size where tr_set is False; element-wise
        # comparison on purpose (not `is False`).
        wanted = (scr['tr_size'] == tr_size) & (scr['tr_set'] == False)
        res.update(zip(scr.loc[wanted, 'metric'], scr.loc[wanted, 'fold1']))

    # Add early stop (last epoch recorded in the training history, if any)
    tr_dirs = sorted(glob(str(out_dir/f'cv*_sz{tr_size}')))
    if tr_dirs:
        hist_path = Path(tr_dirs[0])/'krs_history.csv'
        if hist_path.exists():
            res['epoch_stop'] = pd.read_csv(hist_path)['epoch'].max()

    # Update dict
    args.update(res)

    # Agg info from all runs
    records.append(args)

hp = pd.DataFrame(records)
print('hp.shape', hp.shape)

# Update values
if 'shards_arr' in hp.columns:
    hp['shards_arr'] = hp['shards_arr'].map(_parse_shard_size)

Total runs 384
hp.shape (384, 38)


In [5]:
# Names of every column collected for the runs
column_index = hp.columns
column_index.values

array(['batch_size', 'batchnorm', 'cell_fea', 'clr_base_lr', 'clr_gamma',
       'clr_max_lr', 'clr_mode', 'cv_folds', 'cv_folds_arr', 'cv_method',
       'dirpath', 'dr_rate', 'drug_fea', 'epoch_stop', 'epochs',
       'experiment_id', 'framework', 'id', 'instance_directory', 'lr',
       'max_shard', 'mean_absolute_error', 'mean_squared_error',
       'median_absolute_error', 'min_shard', 'model_name', 'n_jobs',
       'n_shards', 'n_trees', 'opt', 'r2', 'run_id', 'save', 'scaler',
       'shard_step_scale', 'shards_arr', 'target_name', 'timeout'],
      dtype=object)

In [35]:
# Restrict the table to the columns used in the rest of the analysis
col_subset = [
    # hyperparameters varied in the search
    'batch_size', 'lr', 'opt', 'dr_rate',
    # remaining run settings
    'batchnorm', 'cell_fea', 'clr_mode', 'cv_folds', 'cv_method',
    'drug_fea', 'epochs', 'framework', 'model_name',
    'n_trees', 'run_id', 'scaler', 'shard_step_scale', 'shards_arr', 'target_name',
    # last recorded epoch and scores
    'epoch_stop',
    'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'r2',
]
hp = hp[col_subset]

In [36]:
# Number of runs for each (shard size, batch size) combination
runs_by_shard = hp.groupby('shards_arr')
runs_by_shard['batch_size'].value_counts()

shards_arr  batch_size
65536       128           32
            256           32
            32            32
            64            32
131072      128           32
            256           32
            32            32
            64            32
262144      128           32
            256           32
            32            32
            64            32
Name: batch_size, dtype: int64

In [37]:
# hp.batch_size.value_counts()

In [38]:
# hp.opt.value_counts()

In [39]:
# hp.shards_arr.value_counts()

In [40]:
# hp.lr.value_counts()

In [41]:
# hp.sort_values('mean_absolute_error')

In [42]:
# Keep only the runs whose r2 exceeds 0.73 (all shard sizes together).
# To look at a single shard size instead, start from:
# d1 = hp[hp['shards_arr']==65536].reset_index(drop=True)
d1 = hp.reset_index(drop=True)
print(d1.shape)
good_r2 = d1['r2'] > 0.73
d1 = d1[good_r2]
print(d1.shape)

(384, 24)
(46, 24)


In [43]:
# Per shard size: the hyperparameter values present among the retained runs,
# together with the best score reached for each metric.
agg_spec = {
    'batch_size': 'unique',
    'dr_rate': 'unique',
    'opt': 'unique',
    'lr': 'unique',
    'mean_absolute_error': 'min',
    'mean_squared_error': 'min',
    'r2': 'max',
}
d1.groupby('shards_arr').agg(agg_spec).reset_index()
# d1.groupby('shards_arr')['batch_size'].value_counts()

Unnamed: 0,shards_arr,batch_size,dr_rate,opt,lr,mean_absolute_error,mean_squared_error,r2
0,65536,"[32, 64, 128, 256]","[0.1, 0, 0.2]","[adam, sgd]","[1e-05, 0.01, 0.0001]",0.04914,0.005496,0.740268
1,131072,"[32, 64, 128, 256]","[0, 0.1, 0.2]","[adam, sgd]","[1e-05, 0.001, 0.01, 0.0001]",0.047997,0.005068,0.760502
2,262144,"[32, 64, 256, 128]","[0, 0.1, 0.2]","[adam, sgd]","[1e-05, 0.001, 0.01, 0.0001]",0.045759,0.004626,0.781399


In [47]:
def top_runs(df, shard_size=None, n=10, sort_by='r2', plot=False):
    """Return the first `n` rows of `df` after ranking by `sort_by`.

    When `shard_size` is given, only rows whose 'shards_arr' equals it are
    considered. Sorting is descending for 'r2' and ascending for any other
    column. With `plot=True` the selected rows are also displayed transposed.
    The returned frame has a fresh 0-based index.
    """
    subset = df
    if shard_size is not None:
        matches = df['shards_arr'] == shard_size
        subset = df[matches].reset_index(drop=True)

    # Only 'r2' is sorted in descending order
    descending = (sort_by == 'r2')
    ranked = subset.sort_values(sort_by, ascending=not descending)
    best = ranked.iloc[:n, :]

    if plot:
        display(best.transpose())
    return best.reset_index(drop=True)

In [53]:
# Top-ranked run (by r2) for shard size 65536
df = top_runs(hp, n=1, shard_size=65536)
display(df)

Unnamed: 0,batch_size,lr,opt,dr_rate,batchnorm,cell_fea,clr_mode,cv_folds,cv_method,drug_fea,...,run_id,scaler,shard_step_scale,shards_arr,target_name,epoch_stop,mean_absolute_error,mean_squared_error,median_absolute_error,r2
0,128,0.0001,adam,0.1,False,['GE'],,1,simple,['DD'],...,id_54,stnd,log2,65536,AUC,122,0.051723,0.005496,0.03616,0.740268


In [54]:
# Top-ranked run (by r2) for shard size 131072
df = top_runs(hp, n=1, shard_size=131072)
display(df)

Unnamed: 0,batch_size,lr,opt,dr_rate,batchnorm,cell_fea,clr_mode,cv_folds,cv_method,drug_fea,...,run_id,scaler,shard_step_scale,shards_arr,target_name,epoch_stop,mean_absolute_error,mean_squared_error,median_absolute_error,r2
0,32,0.0001,adam,0.1,False,['GE'],,1,simple,['DD'],...,id_68,stnd,log2,131072,AUC,107,0.048234,0.005068,0.031422,0.760502


In [55]:
# Top-ranked run (by r2) for shard size 262144
df = top_runs(hp, n=1, shard_size=262144)
display(df)

Unnamed: 0,batch_size,lr,opt,dr_rate,batchnorm,cell_fea,clr_mode,cv_folds,cv_method,drug_fea,...,run_id,scaler,shard_step_scale,shards_arr,target_name,epoch_stop,mean_absolute_error,mean_squared_error,median_absolute_error,r2
0,64,0.0001,adam,0.1,False,['GE'],,1,simple,['DD'],...,id_85,stnd,log2,262144,AUC,91,0.045795,0.004626,0.029842,0.781399
