In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import itertools

from scipy.stats import pearsonr
import matplotlib.pyplot as plt

In [None]:
DATAFILE = './data/Bayesian_hyperparameter_tuning.pickle'

# Stop here with a clear error if the results are missing: every later cell
# needs `results`, so carrying on would only fail with a confusing NameError.
if not os.path.isfile(DATAFILE):
    raise FileNotFoundError('come back another day ! no results found (' + DATAFILE + ')')

# NOTE(security): pickle can execute arbitrary code while loading - only open
# result files that you produced yourself.
with open(DATAFILE,'rb') as f:
    results = pickle.load(f)
print('data to be analysed from ' + str(len(results)) + ' hyperparameter combinations\n')

#convert to dataframe for easier everything.
results = pd.DataFrame(results)

print('\n'.join(['results dataframe contains the following columns:'] + list(results.columns)))

Each row of results represents a model and its performance. However, before even considering choices of hyperparameters (dropout position, r and N), we need to look at which METRICS we should be selecting on. That is, which of the mean pairwise/global IOU/DSC predicts TRUE IOU/DSC best?

In [None]:
# Columns of `results` holding the measured (ground-truth) scores.
trues = [
    'true IOU',
    'true DSC',
]

# Columns of `results` holding the candidate predictors of those scores.
predictions = [
    'mean pairwise DSC',
    'global DSC',
    'mean pairwise IOU',
    'global IOU',
]

def clean_r2(predictor,result):
    """Return the squared Pearson correlation (r^2) between two sequences.

    NaN entries in either input are replaced with 0 before the correlation is
    computed, so that every call is scored in a consistent way. The
    replacement is done on private float copies: the caller's data is never
    modified, and plain lists are accepted as well as numpy arrays.

    Parameters
    ----------
    predictor, result : array-like of numbers
        The two samples to correlate; they must have the same length.

    Returns
    -------
    float
        ``pearsonr(predictor, result)[0] ** 2``.
    """
    # np.array copies by default, so the in-place NaN replacement below
    # cannot leak back into the arguments.
    predictor = np.array(predictor, dtype=float)
    result = np.array(result, dtype=float)

    predictor[np.isnan(predictor)] = 0
    result[np.isnan(result)] = 0

    #return the actual r2 value
    return pearsonr(predictor,result)[0]**2
    
def get_r2(trueName,predictionName):
    """Compute one r^2 value per row of the module-level ``results`` frame.

    For each row, the entries in columns ``trueName`` and ``predictionName``
    are converted to numpy arrays and passed to ``clean_r2``.
    (Assumes each of those entries is a sequence of scores - TODO confirm.)

    Returns
    -------
    pandas.Series
        The r^2 values, in the same row order as ``results``.
    """
    pair = results.loc[:,[trueName,predictionName]]

    def row_r2(row):
        #FIXME - when something better shows up, remove np.array conversion (should already be done)
        truth = np.array(row[trueName])
        predicted = np.array(row[predictionName])
        return clean_r2(truth,predicted)

    return pair.apply(row_r2,axis=1)

def mae(predictor,result):
    """Return the mean absolute error between predictions and true values.

    NaN entries in either input are replaced with 0 before the error is
    computed. The replacement is done on private float copies: the caller's
    data is never modified, and plain lists are accepted as well as numpy
    arrays.

    Parameters
    ----------
    predictor, result : array-like of numbers
        The two samples to compare element by element.

    Returns
    -------
    float
        ``mean(|result - predictor|)``.
    """
    # np.array copies by default, so the in-place NaN replacement below
    # cannot leak back into the arguments.
    predictor = np.array(predictor, dtype=float)
    result = np.array(result, dtype=float)

    predictor[np.isnan(predictor)] = 0
    result[np.isnan(result)] = 0

    ae = np.abs(result-predictor)
    return np.mean(ae)

def get_mae(trueName,predictionName):
    """Compute one mean-absolute-error value per row of ``results``.

    For each row of the module-level ``results`` frame, the entries in columns
    ``trueName`` and ``predictionName`` are converted to numpy arrays and
    passed to ``mae``.
    (Assumes each of those entries is a sequence of scores - TODO confirm.)

    Returns
    -------
    pandas.Series
        The MAE values, in the same row order as ``results``.
    """
    pair = results.loc[:,[trueName,predictionName]]

    def row_mae(row):
        #FIXME - when something better shows up, remove np.array conversion (should already be done)
        truth = np.array(row[trueName])
        predicted = np.array(row[predictionName])
        return mae(truth,predicted)

    return pair.apply(row_mae,axis=1)

In [None]:
# One column of panels per (true metric, predicted metric) pairing; the grid
# width follows the number of pairings instead of being hard-coded.
combinations = list(itertools.product(trues,predictions))

fig, axes = plt.subplots(4, len(combinations), figsize = (24,12), squeeze=False)

# 21 edges -> 20 bins of width 0.05 spanning [0, 1] *inclusive*.
# np.arange(0,1,0.05) ends at 0.95, which silently drops every value above it
# (e.g. an R^2 close to 1) from the histograms.
bins = np.linspace(0,1,21)

for ind,combination in enumerate(combinations):

    r2s = get_r2(*combination)
    maes = get_mae(*combination)

    # row 1: distribution of R^2 over all models
    ax = axes[0,ind]
    ax.hist(r2s,bins=bins,orientation='horizontal')
    ax.set_ylim([0,1])
    ax.set_title('/'.join(combination))
    if ind==0:
        ax.set_ylabel('R^2')

    # row 2: distribution of MAE over all models
    ax = axes[1,ind]
    ax.hist(maes,bins=bins,orientation='horizontal')
    ax.set_ylim([0,1])
    if ind==0:
        ax.set_ylabel('MAE')

    # row 3: R^2 against MAE, one point per model
    ax = axes[2,ind]
    ax.scatter(maes,r2s)
    ax.set_ylim([0,1])
    ax.set_xlim([0,0.7])
    ax.set_xlabel('MAE')
    if ind==0:
        ax.set_ylabel('R^2')

    # row 4: combined score, high only when MAE is low AND R^2 is high
    ax = axes[3,ind]
    ax.hist((1-maes)*r2s,bins=bins,orientation='horizontal')
    ax.set_ylim([0,1])
    if ind==0:
        ax.set_ylabel('(1-MAE)*R^2')

So, it looks like the best agreement could be provided by the true DSC/mean pairwise IOU.

Interestingly, there are clusters of points where the MAE is low, but so is the R^2. This MUST correspond to a low spread in the test set accuracy (leading to poor linear relationship). Thus, MAE is the more appropriate metric to consider. 

So, in selecting hyperparameters, we are interested in optimising two things simultaneously:
 - *maximising* a single actual performance metric (true DSC or true IOU)
 - *minimising* the MAE between the true DSC and mean pairwise IOU

In [None]:
# Score every model on the chosen pairing and store the scores on `results`.
# Both mae() and clean_r2() treat their two inputs symmetrically, so the order
# in which the two column names are passed does not affect the values.
maes = get_mae('mean pairwise IOU','true DSC')
results.loc[:,'mae'] = maes
results.loc[:,'r2'] = get_r2('mean pairwise IOU','true DSC')

# One summary number per model: the mean of its 'true IOU' entry.
# NOTE(review): performance is taken from 'true IOU' while the MAE above is
# computed against 'true DSC' - confirm this mix is intended.
performance = results['true IOU'].apply(np.mean)

fig, ax = plt.subplots(figsize =(10,10))

ax.scatter(maes,performance)

ax.set_xlabel('MAE of accuracy prediction')
ax.set_ylabel('true performance')

So, there is a clear right answer - where there is a clear maximum of DSC and minimum of MAE. Now, we should do the same but colour the points according to hyperparameters...

In [None]:
# Same MAE-vs-performance scatter as before, drawn as one series per distinct
# value of the 'r' column so that the legend maps colour -> r.
fig, ax = plt.subplots(figsize =(10,10))

for r_value in results['r'].unique():
    has_r = results['r']==r_value
    ax.scatter(maes[has_r],performance[has_r],label=r_value,alpha = 0.4)

ax.set_xlabel('MAE of accuracy prediction')
ax.set_ylabel('true performance')
ax.legend()

In [None]:
# Same MAE-vs-performance scatter as before, drawn as one series per distinct
# value of the 'N' column so that the legend maps colour -> N.
fig, ax = plt.subplots(figsize =(10,10))

for n_value in results['N'].unique():
    has_n = results['N']==n_value
    ax.scatter(maes[has_n],performance[has_n],label=n_value,alpha = 0.4)

ax.set_xlabel('MAE of accuracy prediction')
ax.set_ylabel('true performance')
ax.legend()

In [None]:
# Same MAE-vs-performance scatter as before, drawn as one series per distinct
# value of the 'dropoutPosition' column so that the legend maps colour -> position.
fig, ax = plt.subplots(figsize =(10,10))

for position in results['dropoutPosition'].unique():
    at_position = results['dropoutPosition']==position
    ax.scatter(maes[at_position],performance[at_position],label=position,alpha = 0.4)

ax.set_xlabel('MAE of accuracy prediction')
ax.set_ylabel('true performance')
ax.legend()

So, that's pretty unambiguous. The best dropout rate is 0.15, placed at the end of every residual block, and the sample size doesn't really make a difference after 20.

Let's have a look at the scatter plots for the successful examples.

In [None]:
# Selection thresholds for a "good" model: the accuracy prediction must be
# close to the truth (low MAE) AND the model itself must perform well.
MAX_MAE = 0.1
MIN_PERFORMANCE = 0.6

# reset_index() renumbers the selected rows 0..n-1 (the old labels are kept in
# an 'index' column); the loop below uses that number as the subplot position.
goodModels = results.loc[np.logical_and(maes < MAX_MAE,performance > MIN_PERFORMANCE),:].reset_index()

nSuccesses = goodModels.shape[0]

ncols = 3

# np.ceil returns a float, but a subplot grid needs an integer row count.
nrows = int(np.ceil(nSuccesses/ncols))

if nSuccesses == 0:
    # Nothing to draw: a figure with zero rows cannot be created.
    print('no models with mae < ' + str(MAX_MAE) + ' and performance > ' + str(MIN_PERFORMANCE))
else:
    plt.figure(figsize = (5*ncols,5*nrows))

    for ind,row in goodModels.iterrows():

        plt.subplot(nrows,ncols,ind+1)

        title = '\n'.join( ('N = ' + str(row['N']),'mae = ' + str(row['mae']), 'r^2 = ' + str(row['r2']) ) )

        plt.title(title)

        # reference line: a perfect predictor would put every point on it
        plt.plot([0,1],[0,1],label = 'line of unity',c= 'k')

        plt.scatter(row['mean pairwise IOU'],row['true DSC'])

        plt.xlabel('mean pairwise IOU')
        plt.ylabel('true DSC')

        plt.xlim([0,1])
        plt.ylim([0,1])

    # only the last panel carries the legend
    plt.legend()

In [None]:
# Show the table of selected models (one row per model that passed the filter).
goodModels