# Comparing estimates of running time to experiments


The goal here is to create four figures:

## Figure 1: average pool size in each step of lexicase

- predicted pool size
- actual pool size, averaged over generations and trials
- a line representing N, the maximum pool size

<!-- ## Figure 2: expected running time comparison

boxplots of 

- sum of all actual lexicase pool sizes per lexicase selection event
- sum of expected lexicase pool sizes ($\bar{L}$ for boolean problems) per selection event (generation)
- line at old theory: NT -->

## Figure 3: running time as a function of the number of selections

line plots of

- actual running time ($d + \sum_{i=0}^{d} S_i$) per selection event
- expected running time ($\hat{d} + \sum_{i=0}^{\hat{d}} \hat{S_i}$) per selection event
- line of $N*T*s$ where $s$ is the selection event number [1:gens*pop]

## Figure 4: measured (actual) running time versus predicted running time

scatterplot of

- actual running time: $d + \sum_{i=0}^{d} S_i$
- expected running time: $a + b\hat{d} + c\sum_{i=0}^{\hat{d}} \hat{S_i}$ per selection event
    - estimate a,b,c using linear regression
    - report $R^2$

In [None]:
# Plotting setup for the notebook: seaborn styling and inline matplotlib output.
import seaborn as sns
# Use seaborn's "paper" plotting context; the trailing comment keeps optional
# font/grid overrides that are currently disabled.
sns.set_context("paper") #, font_scale = .5, rc={"grid.linewidth": 0.6})
import matplotlib.pyplot as plt
# IPython magic: draw figures inline in the notebook output.
%matplotlib inline
import numpy as np

**load data**

In [None]:
import pandas as pd
from glob import glob
from copy import copy
from tqdm.notebook import tqdm
import numpy as np

# Build one table with a row per sampled selection event and write it to
# data/empirical_num_evals.parquet.  Columns captured:
#
#   problem          entry of `problems` the run belongs to
#   trial            integer that follows 'individuals_remaining' in the file name
#   generation       first CSV field of the row (kept as the raw string)
#   selection_event  line number of the row within its CSV (the header is line 0)
#   n_evals          sum of the integer columns from 'S0' to the end of the row
#                    (presumably the pool size at each lexicase step -- confirm
#                    against the code that writes the CSVs)
#   N                population size (`pop_size`)
problems = [
    'compare-string-lengths',
    'count-odds',
    'double-letters',
    'last-index-of-zero',
    'mirror-image',
    'negative-to-zero',
    'vector-average',
    'x-word-lines',
]
pop_size = 1000
# fraction of generational selections to include
sample_selection = .1


def _sampled_selection_events(path, max_per_generation):
    """Yield ``(generation, selection_event, n_evals)`` for the sampled rows of one CSV.

    The first line is the header; the column named ``S0`` marks where the
    per-step counts start.  For every later non-blank line the first field is
    the generation, and only the first ``max_per_generation`` rows seen for
    each generation are yielded (later rows are read but skipped).

    Raises ``ValueError`` if the header has no ``S0`` column or a count field
    is not an integer.
    """
    events_seen = {}
    with open(path) as fp:
        for line_no, line in enumerate(fp):
            vals = line.rstrip('\r\n').split(',')
            if line_no == 0:
                # Strip tokens so 'S0' is found even with surrounding whitespace
                # or when it is the last header column.
                first_count_col = [v.strip() for v in vals].index('S0')
                continue
            if not line.strip():
                # A blank line carries no selection event; skipping it avoids a
                # bogus row (or an int('') failure) while keeping line numbers.
                continue
            generation = vals[0]
            events_seen[generation] = events_seen.get(generation, 0) + 1
            if events_seen[generation] > max_per_generation:
                continue
            # Built-in sum keeps the total an int even for an empty slice.
            yield generation, line_no, sum(map(int, vals[first_count_col:]))


dfs = []
# only capture the first sample_selection * pop_size selection events per generation
max_events_per_generation = sample_selection * pop_size

for p in tqdm(problems):
    for d in tqdm(glob('data/lex-theory-*' + p), p, leave=False):
        print(p, 'pop_size:', pop_size)
        frame = {
            'problem': [],
            'trial': [],
            'generation': [],
            'selection_event': [],
            'n_evals': [],
            # scalar: broadcast to every row when the DataFrame is built
            'N': pop_size,
        }
        # load individuals remaining
        print('\tindividuals remaining...')
        # The pattern matches the 'individuals_remaining<trial>.csv' naming that
        # the trial parsing below relies on.
        for f in tqdm(glob(d + '/individuals_remaining*.csv'), d, leave=False):
            trial = int(f.split('individuals_remaining')[-1].split('.csv')[0])
            for generation, selection_event, n_evals in _sampled_selection_events(
                    f, max_events_per_generation):
                frame['problem'].append(p)
                frame['trial'].append(trial)
                frame['generation'].append(generation)
                frame['selection_event'].append(selection_event)
                frame['n_evals'].append(n_evals)
        dfs.append(pd.DataFrame.from_records(frame))

if not dfs:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(
        "no 'data/lex-theory-*<problem>' directories found; "
        "run this notebook from the directory that contains 'data/'"
    )
df = pd.concat(dfs)
del dfs
df.to_parquet('data/empirical_num_evals.parquet')
print('len(df):', len(df))
display(df)
# free the table; it has been persisted to parquet
del df

In [None]:
# Status message marking the end of preprocessing; the loading cell has already
# written the table to parquet and deleted `df`.
print('done preprocessing. clearing data.')