# Comparing estimates of running time to experiments


The goal here is to create four figures:

## Figure 1: average pool size in each step of lexicase

- predicted pool size:
- actual pool size, averaged over generations and trials
- a line representing N, the maximum pool size

<!-- ## Figure 2: expected running time comparison

boxplots of 

- sum of all actual lexicase pool sizes per lexicase selection event
- sum of expected lexicase pool sizes ($\bar{L}$ for boolean problems) per selection event (generation)
- line at old theory: NT -->

## Figure 3: running time as a function of the number of selections

line plots of

- actual running time ( $ d + \sum_{i=0}^{d} { S_i} $) per selection event
- expected running time ( $ \hat{d} + \sum_{i=0}^{\hat{d}} { \hat{S_i}} $) per selection event
- line of $N*T*s$ where $s$ is the selection event number [1:gens*pop]

## Figure 4: measured (actual) running time versus predicted running time

scatterplot of

- actual running time: $ d + \sum_{i=0}^{d} { S_i} $
- expected running time: $ a + b\hat{d} + c\sum_{i=0}^{\hat{d}} { \hat{S_i}} $ per selection event
    - estimate a,b,c using linear regression
    - report R^2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

**load data**

In [None]:
import pandas as pd
from glob import glob
from copy import copy
from tqdm import tqdm
import numpy as np

# Build a long-format table of lexicase pool sizes: one row per iteration of
# each sampled selection event.
#
# columns captured:
#   problem              entry of `problems`
#   trial                integer parsed from the CSV file name
#   N                    population size parsed from the directory name
#   generation           first field of the CSV row (kept as a string)
#   selection_event      0-based line index of the event in its CSV file
#                        (the header is line 0)
#   selection_depth      number of pool-size values recorded for the event
#   selection_iteration  position of a value within the event (0 = 'S0')
#   iteration_pool_size  pool size at that position
#   starting_pool_size   pool size at position 0
#
# not captured by this cell: training case size, average error
dfs = []
problems = ['mirror-image',
        'compare-string-lengths'
       ]
# fraction of generational selections to include
sample_selection = .1

for p in problems:
    for d in glob('population-size-*/'+p):
        pop_size = int(d.split('/')[0].split('population-size-')[-1])
        print(p,'pop_size:',pop_size)
        # cap on the number of selection events kept per generation; it only
        # depends on the population size, so compute it once per directory
        max_events = sample_selection*pop_size
        frame = {
            'problem':[],
            'trial':[],
            'generation':[],
            'selection_event':[],
            'selection_depth':[],
            'selection_iteration':[],
            'iteration_pool_size':[],
            'starting_pool_size':[],
            # scalar: broadcast to every row when the DataFrame is built
            'N':pop_size
        }
        # load individuals remaining
        print('\tindividuals remaining...')
        for f in tqdm(glob(d + '/individuals_*.csv')):
            trial = int(f.split('individuals_remaining')[-1].split('.csv')[0])
            sel_col_start = -1
            gen_counter = {}
            with open(f) as fp:
                for cnt, line in enumerate(fp):
                    vals = line.split(',')
                    if cnt == 0:
                        # header row: pool sizes start at the column named 'S0'
                        sel_col_start = vals.index('S0')
                        continue
                    # only capture first sample_selection selection events per generation
                    generation = vals[0]
                    gen_counter[generation] = gen_counter.get(generation, 0) + 1
                    if gen_counter[generation] > max_events:
                        continue
                    # parse the event's pool sizes once instead of re-slicing
                    # and re-parsing the row for every value
                    pool_sizes = [int(v.strip()) for v in vals[sel_col_start:]]
                    depth = len(pool_sizes)
                    if depth == 0:
                        # row has no pool-size columns; nothing to record
                        continue
                    frame['problem'].extend([p]*depth)
                    frame['trial'].extend([trial]*depth)
                    frame['generation'].extend([generation]*depth)
                    frame['selection_event'].extend([cnt]*depth)
                    frame['selection_depth'].extend([depth]*depth)
                    frame['selection_iteration'].extend(range(depth))
                    frame['iteration_pool_size'].extend(pool_sizes)
                    frame['starting_pool_size'].extend([pool_sizes[0]]*depth)
        dfs.append(pd.DataFrame.from_records(frame))
df = pd.concat(dfs)
del dfs
print('len(df):',len(df))


## load behaviors and merge with selections

In [None]:
# Compute Lbar for every (problem, population size, trial, generation) from
# the behaviors files, then attach it to the selection rows already in `df`.
dfs = []

for p in problems:
    for d in glob('population-size-*/'+p):
        pop_size = int(d.split('/')[0].split('population-size-')[-1])
        print(p,'pop_size:',pop_size)
        print(d)
        Lbar_frame = {'trial':[],
                      'problem':[],
                      'generation':[],
                      'Lbar':[],
                      # scalar: broadcast to every row when the DataFrame is built
                      'N':pop_size
                     }
        # now open the corresponding behaviors file
        print('\tbehaviors file...')
        for f in tqdm(glob(d + '/behaviors_*.csv')):
            trial = int(f.split('behaviors_and_errors')[-1].split('.csv')[0])
            case_col_start = -1

            # generation (string) -> accumulated Lbar for this trial
            Lbar = {}
            with open(f) as fp:
                for cnt, line in enumerate(fp):
                    vals = line.split(',')
                    if cnt == 0:
                        # header row: locate the first test-case column and
                        # the generation column by name
                        case_col_start = vals.index('TC0')
                        gen_col = vals.index('generation')
                        continue
                    # mean of this row's values over all test-case columns
                    mean_error = np.mean([float(v) for v in vals[case_col_start:]])
                    generation = vals[gen_col]
                    # Running sum of row means divided by pop_size.
                    # NOTE(review): this is the population average only if the
                    # file holds exactly pop_size rows per generation - confirm.
                    Lbar[generation] = Lbar.get(generation, 0.0) + mean_error / pop_size

            for generation, value in Lbar.items():
                Lbar_frame['trial'].append(trial)
                Lbar_frame['problem'].append(p)
                Lbar_frame['generation'].append(generation)
                Lbar_frame['Lbar'].append(value)

        dfs.append(pd.DataFrame.from_records(Lbar_frame))

df_lbar = pd.concat(dfs)
del dfs
print('len(df_lbar):',len(df_lbar))

# default (inner) merge: selection rows without a matching Lbar are dropped
df = df.merge(df_lbar, on = ['trial','generation','problem','N'])
print('merged len(df):',len(df))

df['source'] = 'Actual'

df.to_feather('loaded_sampled_data.feather')
df

### running time estimates

for boolean problems, 

$P_{surv}(i) = 1 - \bar{L} + \bar{L}^{S_i}$

In this version, we have to simulate the values of $S_i$ (i.e. recurse) due to $P_{surv}$'s dependence on it. 

To remove dependence on iteration, it can be estimated as 

$\hat{P}_{surv}(i) \approx 1 - \bar{L} $

Where the expected depth is 

$\hat{d} = -\log(N) / \log(\hat{P}_{surv})$ (the depth at which $N \hat{P}_{surv}^{\hat{d}} = 1$)

And

$ \hat{S}_i = N \hat{P}_{surv}^{\,i} $

In [None]:
# simulate the pool sizes using the exact equation
def boolean_next_pool_size(S, Lbar, i):
    """Return the pool size after one more lexicase step.

    Applies the survival formula P_surv = 1 - Lbar + Lbar**S and scales the
    current pool size by it.

    Parameters
    ----------
    S : current pool size.
    Lbar : the L-bar term of the survival formula.
    i : iteration number; accepted for the caller's convenience but not used
        in the computation.

    Returns
    -------
    S * P_surv
    """
    pass_term = 1 - Lbar
    power_term = np.power(Lbar, S)
    return S * (pass_term + power_term)

In [None]:
from tqdm import tqdm 

# One row per sampled selection event; the per-iteration columns are dropped
# so that each event is simulated exactly once.
df_math = df[['problem',
             'trial',
             'starting_pool_size',
             'generation',
             'selection_event',
             'Lbar',
             'N'
             ]].drop_duplicates()
print('iterations to run:',len(df_math))
math_frames = []
simple_math_frames = []
for idx, row in tqdm(df_math.iterrows(), total=len(df_math)):
    # convert the row once; every output record starts from a copy of it
    event = row.to_dict()
    S0 = event['starting_pool_size']
    Lbar = event['Lbar']

    ######################################
    # exact recursion: S_{i+1} = S_i * P_surv(S_i), starting from S0
    ######################################
    math_frames.append(dict(event,
                            source='Simulated',
                            selection_iteration=0,
                            iteration_pool_size=S0))
    S = S0
    i = 1
    # stop once the pool is below 2 or after S0 - 1 steps
    while S >= 2 and i < S0:
        S = boolean_next_pool_size(S, Lbar, i)
        math_dict = dict(event,
                         selection_iteration=i,
                         source='Simulated',
                         iteration_pool_size=S)
        i += 1
        # NOTE(review): with `or`, this is true on every pass except one where
        # both loop bounds are reached at once, so nearly every simulated row
        # gets simulated_depth = selection_iteration + 1. Confirm whether
        # `S < 2 or i >= S0` (final row only) was intended.
        if S >= 2 or i < S0:
            math_dict['simulated_depth'] = i
        math_frames.append(math_dict)

    ######################################
    # simpler math: loop til expected depth, E[S] = S0*(1-Lbar)^i
    ######################################
    P_surv = 1 - Lbar
    # Ed is not finite when Lbar == 0 (log(1) == 0) or Lbar > 1 (log of a
    # negative number); those cases are handled explicitly below, so the
    # numpy warnings are silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        Ed = -np.log(S0)/np.log(P_surv)
    if np.isfinite(Ed):
        last_iteration = round(Ed)
    elif Ed > 0:
        # +inf: the expected pool never shrinks; bound the series by S0,
        # the same cap the simulated series uses
        last_iteration = S0
    else:
        # NaN or -inf: no meaningful expected depth; keep only iteration 0
        last_iteration = 1
    simple_math_frames.append(dict(event,
                                   source='Expectation',
                                   selection_iteration=0,
                                   iteration_pool_size=S0,
                                   expected_depth=Ed))
    for i in np.arange(1, last_iteration):
        simple_math_frames.append(dict(event,
                                       source='Expectation',
                                       selection_iteration=i,
                                       iteration_pool_size=S0*np.power(P_surv,i),
                                       expected_depth=Ed))

df_simulation = pd.DataFrame.from_records(math_frames)
df_simulation.to_feather('simulated_data.feather')

df_exp = pd.DataFrame.from_records(simple_math_frames)
df_exp.to_feather('expectation_data.feather')

# clear data

In [None]:
# Release the large in-memory frames built during preprocessing.
print('done preprocessing. clearing data.')
del df, df_simulation, df_exp