# Stage 0: Filtering

In this stage, each model is ranked head-to-head against linear regression using `r2_score` on the test sets.  
A model must achieve a better average ranking than linear regression in order to continue to stages 1 and 2. 

## Results
Based on the results of this stage, 3 algorithms (HROCH, nsga2-dcgp, and TaylorGP) were eliminated. 

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from pathlib import Path


In [None]:
# Root directory that is searched recursively for result JSON files.
rdir = '../results_stage0/'

In [None]:
# Read every JSON result found (recursively) under the results directory;
# each file contributes one record, i.e. one row of `df`.
frames = []
for result_path in Path(rdir).rglob('*.json'):
    with open(result_path, 'r') as handle:
        frames.append(json.load(handle))
i = len(frames)

print('loaded',i,'results')
df = pd.DataFrame.from_records(frames)

########################################
# get dataset sizes
# Each dataset file is read once; all columns but one are counted as features.
dataset_nsamples = {}
dataset_nfeatures = {}
for name in df.dataset.unique():
    table = pd.read_csv('../experiment/data/stage0/'+name+'.tsv.gz', sep='\t')
    n_rows, n_cols = table.shape
    dataset_nsamples[name] = n_rows
    dataset_nfeatures[name] = n_cols-1

ns = pd.DataFrame(list(dataset_nsamples.items()), columns=['dataset', 'nsamples'])
nf = pd.DataFrame(list(dataset_nfeatures.items()), columns=['dataset', 'nfeatures'])
data = ns.merge(nf, on='dataset')
df = df.merge(data, on='dataset')


def _nsize_label(n):
    """Return the sample-count bucket label for `n`."""
    if n > 10000:
        return '>10,000'
    if n > 1000:
        return '>1000'
    return '<=1000'


def _fsize_label(n):
    """Return the feature-count bucket label for `n`."""
    if n >= 1000:
        return '>=1000'
    if n >= 100:
        return '>=100'
    return '<100'


df['nsize'] = df['nsamples'].apply(_nsize_label)
df['fsize'] = df['nfeatures'].apply(_fsize_label)
########################################
# time transform
# `time_time` is divided by 3600 and by 60 to give hours and minutes.
seconds = df['time_time']
df['time_hr'] = seconds/3600
df['time_mins'] = seconds/60

df.head()

In [None]:
# List the distinct algorithm names present in the loaded results.
df.algorithm.unique()

# check run completion

In [None]:
# Count result rows (non-null random_state) per dataset/algorithm and subtract 5:
# 0 means exactly five rows are present, a negative value is the shortfall.
# NOTE(review): 5 is presumably the expected number of seeds per pair — confirm.
df.groupby(['nsamples','nfeatures','dataset','algorithm'])['random_state'].count().unstack()-5

# get rankings 

- if a result is missing, it is assigned worst rank. 

In [None]:
seeds = df.random_state.unique()
datasets= df.dataset.unique()
algorithms= df.algorithm.unique()

# Rank every "*test" metric within each (dataset, random_state) trial.
# `transform` (rather than `apply`) guarantees the result keeps df's own index,
# so the column assignment stays aligned on every pandas version.
metrics = [c for c in df.columns if c.endswith('test')]
for col in metrics:
    # r2 columns are ranked descending (highest value gets rank 1);
    # every other metric is ranked ascending.
    ascending = 'r2' not in col
    df[col+'_rank_per_trial'] = (
        df.groupby(['dataset','random_state'])[col]
        .transform(lambda x: round(x,3).rank(ascending=ascending))
    )
rank_metrics = [c+'_rank_per_trial' for c in metrics]

# Worst rank to assign to a missing result: one past the largest observed rank.
met_worst = {m: df[m].max() + 1 for m in rank_metrics}

# Find every (seed, dataset, algorithm) combination without a result row.
# A set of the existing keys makes each lookup O(1) instead of re-filtering df.
present = set(
    df[['random_state','dataset','algorithm']].itertuples(index=False, name=None)
)
frames = []
for s in seeds:
    for d in datasets:
        for alg in algorithms:
            if (s, d, alg) in present:
                continue
            entry = {
                'dataset':d,
                'random_state':s,
                'algorithm':alg
            }
            entry.update(met_worst)
            frames.append(entry)
df_missing = pd.DataFrame.from_records(frames)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# reset_index() (without drop) keeps the old index as an 'index' column, as before.
df = pd.concat([df, df_missing]).reset_index()





In [None]:
# Display the results frame, which at this point also contains the
# worst-rank filler rows added for missing runs.
df

# get difference from linear 

In [None]:

# Index by trial so each algorithm's rows align with the LinearRegression
# rows of the same (random_state, dataset) when subtracting.
tmp = df.set_index(['random_state','dataset'])
lr = tmp.loc[tmp.algorithm=='LinearRegression']
lr_r2 = lr['r2_test']
lr_rank = lr['r2_test_rank_per_trial']

frames = []
for alg in tmp.algorithm.unique():
    alg_rows = tmp.loc[tmp.algorithm==alg]
    # Percent change in test r2 relative to the magnitude of the linear baseline.
    pct_change = (alg_rows['r2_test'] - lr_r2)/np.abs(lr_r2)*100
    diff = pct_change.rename('r2_test_diff').to_frame()
    # Difference in per-trial rank relative to the linear baseline.
    diff['r2_test_rank_diff'] = alg_rows['r2_test_rank_per_trial'] - lr_rank
    diff['algorithm'] = alg
    frames.append(diff)

df_diff = pd.concat(frames).reset_index()
df = pd.merge(df, df_diff, on = ['random_state','dataset','algorithm'])

# summarize trials by dataset
- summarize metrics so that we're looking at aggregates of aggregate dataset performance.

In [None]:
# Collapse the trials to one row per (algorithm, dataset) holding the median of
# every numeric column. numeric_only=True is explicit because pandas >= 2.0 no
# longer drops non-numeric columns silently and would raise on them instead.
df_sum = df.groupby(['algorithm','dataset'],as_index=False).median(numeric_only=True)
df_sum['rmse_test'] = np.sqrt(df_sum['mse_test'])
df_sum['log_mse_test'] = np.log(1+df_sum['mse_test'])

# rankings and normalized scores per dataset
# The column list is fixed before the loop so the *_rank / *_norm columns
# created inside it are not themselves ranked again.
score_cols = [c for c in df_sum.columns if 'test' in c or c.endswith('size')]
for col in score_cols:
    # r2 values are ranked descending; rank columns and all other metrics ascending.
    ascending = 'r2' not in col or 'rank' in col
    # `transform` keeps df_sum's index, so the assignment stays aligned on
    # every pandas version (groupby.apply prepends the group keys in >= 2.0).
    df_sum[col+'_rank'] = df_sum.groupby('dataset')[col].transform(
        lambda x: round(x,3).rank(ascending=ascending)
    )
    # Min-max scaling within each dataset; NaN when all values in a dataset are equal.
    df_sum[col+'_norm'] = df_sum.groupby('dataset')[col].transform(
        lambda x: (x-x.min())/(x.max()-x.min())
    )

In [None]:
# List the columns available in the per-dataset summary frame.
df_sum.columns

In [None]:
# Median test r2 as a table: datasets as rows, algorithms as columns, 3 decimals.
df_sum.groupby(['dataset','algorithm'])['r2_test'].median().unstack().round(3)

In [None]:
# Within-dataset rank of each algorithm's median per-trial r2 rank:
# datasets as rows, algorithms as columns.
df_sum.groupby(['dataset','algorithm'])['r2_test_rank_per_trial_rank'].median().unstack().round(3)

In [None]:
# Median r2_test_diff per algorithm/dataset, rounded to whole numbers;
# the transpose puts datasets in rows and algorithms in columns.
df_sum.groupby(['algorithm','dataset'])['r2_test_diff'].median().unstack().round().transpose()

In [None]:
# Median over datasets of r2_test_diff for each algorithm, rounded and sorted ascending.
df_sum.groupby(['algorithm'])['r2_test_diff'].median().round().sort_values()

In [None]:
# Median r2_test_rank_diff computed from the per-trial frame `df` (not `df_sum`);
# the transpose puts datasets in rows and algorithms in columns.
df.groupby(['algorithm','dataset'])['r2_test_rank_diff'].median().unstack().round().transpose()

In [None]:
# NOTE(review): ipdb is only referenced by debugging code that was commented out;
# the import is kept in case code outside this view relies on it.
import ipdb
seeds = df.random_state.unique()
print(seeds)
datasets= df.dataset.unique()
print(datasets)

# Collect one record for every (algorithm, seed, dataset) combination that has no
# row in df. `missing` is a plain list: the previous DataFrame.append(dict) call
# discarded its result (and no longer exists in pandas 2), so nothing was recorded.
missing = []
for alg,dfg in df.groupby('algorithm'):
    for s in seeds:
        # Datasets this algorithm has a row for under seed `s`.
        seen = set(dfg.loc[dfg.random_state==s, 'dataset'])
        for d in datasets:
            if d not in seen:
                missing.append({'algorithm':alg, 'random_state':s,'dataset':d,'value':'missing'})

df_missing = pd.DataFrame.from_records(missing)

In [None]:
# Median r2_test_diff per algorithm over all trials, sorted ascending.
tmp = df.groupby(['algorithm'])['r2_test_diff'].median().sort_values()
# Algorithm names in that sorted order.
alg_order = tmp.index
tmp

In [None]:
# Median test r2 over trials: datasets as rows, algorithms as columns, 3 decimals.
df.groupby(['dataset','algorithm'])['r2_test'].median().unstack().round(3)

In [None]:
# Point plot of the median r2_test_diff per algorithm; algorithms are ordered
# by that median, largest first.
df_plt = df_sum.copy()
median_diff = df_plt.groupby(['algorithm'])['r2_test_diff'].median()
alg_order = median_diff.sort_values().index[::-1]
point_kwargs = dict(
    data=df_plt,
    x='r2_test_diff',
    y='algorithm',
    hue='algorithm',
    order=alg_order,
    hue_order=alg_order,
    estimator=np.median,
    join=False,
    palette='flare',
)
g = sns.pointplot(**point_kwargs)
# Dashed red reference line at zero difference from linear regression.
y_limits = g.get_ylim()
plt.plot([0, 0], y_limits, '--r')
g.legend_.remove()
g.set()
plt.grid(axis='y')
plt.xlabel('R2 Test Difference from Linear Regression (%)')

In [None]:
# Point plot of the r2 rank difference from LinearRegression per algorithm
# (default estimator), ordered by the mean rank difference, smallest first.
df_plt = df_sum.copy()
mean_rank_diff = df_plt.groupby(['algorithm'])['r2_test_rank_diff'].mean()
alg_order = mean_rank_diff.sort_values().index
point_kwargs = dict(
    data=df_plt,
    x='r2_test_rank_diff',
    y='algorithm',
    hue='algorithm',
    order=alg_order,
    hue_order=alg_order,
    join=False,
    palette='flare',
)
g = sns.pointplot(**point_kwargs)
# Capture the axis limits before drawing the reference line and shaded region.
y0, y1 = g.get_ylim()
x0, x1 = g.get_xlim()
# Thin red line at zero rank difference.
plt.plot([0, 0], (y0, y1), '-r', linewidth=1, alpha=0.7)
# Shade the region to the right of zero (rank difference > 0).
plt.fill([0, x1, x1, 0, 0], [y0, y0, y1, y1, y0], alpha=0.2)
g.legend_.remove()
g.set()
plt.grid(axis='y')
plt.xlabel('$R^2$ test rank difference from LinearRegression (lower is better)')

In [None]:
# Strip plot of per-trial run time in hours for every algorithm except
# LinearRegression, ordered by median time and coloured by dataset.
df_plt = df[df['algorithm'] != 'LinearRegression']
median_hours = df_plt.groupby(['algorithm'])['time_hr'].median()
alg_order = median_hours.sort_values().index
strip_kwargs = dict(
    data=df_plt,
    x='time_hr',
    y='algorithm',
    hue='dataset',
    order=alg_order,
    palette='colorblind',
)
g = sns.stripplot(**strip_kwargs)
y_limits = g.get_ylim()
plt.plot([0, 0], y_limits, '--r')
g.legend_.remove()
# The time axis is switched to a log scale after drawing.
g.set(xscale='log')
plt.grid(axis='y')
plt.xlabel('Time (hr)')

In [None]:
# Strip plot of per-trial run time in hours for the 'eql' algorithm only,
# coloured by dataset, with the legend placed outside the axes.
df_plt = df[df['algorithm'] == 'eql']
median_hours = df_plt.groupby(['algorithm'])['time_hr'].median()
alg_order = median_hours.sort_values().index
strip_kwargs = dict(
    data=df_plt,
    x='time_hr',
    y='algorithm',
    hue='dataset',
    order=alg_order,
    palette='colorblind',
)
g = sns.stripplot(**strip_kwargs)
y_limits = g.get_ylim()
plt.plot([0, 0], y_limits, '--r')
plt.legend(loc=[1.1,0])
g.set()
plt.grid(axis='y')
plt.xlabel('Time (hr)')

# catplot fn

In [None]:
def make_catplot(derder, measure, gkwargs=None, **kwargs):
    """Draw one point-plot panel per dataset comparing algorithms on `measure`.

    Algorithms are ordered by their median `measure` (ascending; reversed when
    'r2' appears in `measure`). Each panel is annotated with dashed black
    lines marking the LinearRegression median for that dataset.

    Parameters
    ----------
    derder : pandas.DataFrame
        Results with 'algorithm' and 'dataset' columns and a `measure` column.
        It is copied, not modified.
    measure : str
        Name of the column plotted on the x axis.
    gkwargs : dict, optional
        Keyword arguments passed to ``FacetGrid.set`` after plotting
        (e.g. ``dict(xscale='log')``). ``None`` means no extra settings.
    **kwargs
        Overrides for, or additions to, the default ``sns.catplot`` arguments.
    """
    df_plt = derder.copy()
    alg_order = df_plt.groupby(['algorithm'])[measure].median().sort_values().index
    if 'r2' in measure:
        alg_order = alg_order[::-1]

    PK = dict(
        kind='point',
        estimator=np.median,
        join=False,
        data=df_plt,
        x=measure,
        y='algorithm',
        col='dataset',
        col_wrap=5,
        order=alg_order,
        hue_order=alg_order,
        palette='flare',
        aspect=0.6,
    )
    PK.update(kwargs)
    g = sns.catplot(**PK)
    if 'r2' in measure and 'diff' not in measure:
        # Raw r2 panels share a fixed x range.
        g.set(xlim=[-.1,1.1])
    # A None default (instead of a shared mutable {}) is normalised here.
    g.set(**(gkwargs or {}))

    for ax in g.axes.flat:
        ax.grid(axis='y')

    # Row position of LinearRegression in the plotted order; without it there
    # is no baseline to draw, so skip the reference lines instead of failing.
    lr_matches = np.where(np.asarray(alg_order)=='LinearRegression')[0]
    if len(lr_matches) == 0:
        return
    lr_loc = lr_matches[0]

    for d,ax in g.axes_dict.items():
        lr = df_plt.loc[(df_plt.algorithm=='LinearRegression')
                    & (df_plt.dataset==d)][measure].median()
        # Vertical line at the baseline median, spanning all algorithm rows.
        ax.plot([lr, lr],[0,len(alg_order)-1], '--k')
        # Horizontal line from the left axis edge to the baseline point.
        ax.plot([ax.get_xlim()[0],lr],[lr_loc,lr_loc], '--k')

# r2 test difference from linear

In [None]:
# Per-dataset panels of r2_test_diff; sharex=False is forwarded to sns.catplot.
make_catplot(df, 'r2_test_diff', sharex=False)

# mse

In [None]:
# Per-dataset panels of test MSE; gkwargs sets a log-scaled x axis on the grid.
make_catplot(df, 'mse_test',sharex=False, gkwargs=dict(xscale='log'))

# r2

In [None]:
# Per-dataset panels of raw test r2 with the default catplot settings.
make_catplot(df, 'r2_test')