# reviewing theorem 3

- $C$: number of cases
- $\epsilon$: fraction of cases
- $k$: the set size such that *any* set of $k$ individuals in the population contains at least 2 individuals that differ on at least $\epsilon C$ cases
    - smaller $k$ makes this harder to achieve
    - larger $\epsilon$ makes this harder to achieve

empirical question:
- what is the smallest $k$, and largest $\epsilon$, that satisfies these constraints?

## approach

- load pop error matrix, B
- set $\epsilon$ to some value

- make NxN adjacency matrix where entry $(i, j)$ indicates whether $n_i$ and $n_j$ differ on fewer than $\epsilon C$ cases
- calculate the maximum clique of this matrix, whose size equals $k$ 

# todo

- compare empirical # evals for these runs to new estimate/bound of running time 
- look at larger values of epsilon (up to .6?)
- look at diversity (via covariance) on filtered error vectors instead of # unique in original matrix



**done**
- compare behavioral diversity to values of k, epsilon
    - calculate mean proportion of unique error vectors each gen
- calculate best epsilon per generation? 
    - look at runtime fraction with optimal epsilon?
    - what is the best runtime fraction using optimal epsilon, over generations?

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper', font_scale=1.5)
sns.set_style('whitegrid')
import os
import pypdftk

# Directory that receives every saved figure; created on first run.
figdir = 'figs/'
os.makedirs(figdir, exist_ok=True)


def save(h, name, **kwargs):
    """Write figure ``h`` into ``figdir`` as both a PNG and a PDF.

    Parameters
    ----------
    h : object exposing ``tight_layout()`` and ``savefig()``
        The figure to write (callers pass ``plt.gcf()``).
    name : str
        File name stem, without extension.
    **kwargs
        Forwarded unchanged to ``h.savefig``.
    """
    h.tight_layout()
    for ft in ('.png', '.pdf'):
        # os.path.join avoids the doubled separator that plain concatenation
        # produced, since figdir already ends with '/'.
        h.savefig(os.path.join(figdir, name + ft), dpi=200, **kwargs)


# load saved data

In [None]:
# Common path prefix of the per-problem result directories.
rootdir = "data/lex-theory-"

# (problem name, boolean flag) pairs.
# NOTE(review): the meaning of the boolean flag is not shown in this file.
problems = [
    ("compare-string-lengths", True),
    ("count-odds", False),
    ("double-letters", False),
    ("last-index-of-zero", False),
    ("mirror-image", True),
    ("negative-to-zero", False),
    ("x-word-lines", False),
    ("vector-average", False),
]

In [None]:
from glob import glob

# Read every per-run stats file of every problem and combine them into `res`.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated data on every call.
frames = []
for problem, _ in problems:
    print(rootdir+problem)
    for f in glob(rootdir+problem+'/runtime_stats-*.csv'):
        df = pd.read_csv(f)
        if df.empty:
            # Skip files that contain no rows.
            print('!!!!',problem,'sucks')
            continue
        df['problem'] = problem
        # Round eps to two decimals (it is later compared with best_eps and
        # used as a categorical axis).
        df['eps'] = df['eps'].round(2)
        frames.append(df)

if not frames:
    # Fail with a clear message instead of an obscure error further down.
    raise FileNotFoundError(
        'no non-empty runtime_stats-*.csv files found under ' + rootdir + '*'
    )
res = pd.concat(frames)

# NOTE: not the last expression of the cell, so a notebook does not display it.
res.groupby('problem').describe().transpose().iloc[:120]

# clean up var names
runtime_fraction = 'Running Time Fraction'
clean = {
    'runtime fraction': runtime_fraction
}
res = res.rename(columns=clean)

In [None]:
# Display name of the running-time-fraction column, used by the cells below.
runtime_fraction = "Running Time Fraction"

In [None]:
# Running-time fraction on the rows where eps equals best_eps: averaged within
# each problem first, then across problems.
rows_at_best_eps = res.loc[res.eps == res.best_eps]
per_problem_fraction = rows_at_best_eps.groupby('problem')[runtime_fraction].mean()
per_problem_fraction.mean()

In [None]:
# best_eps on the rows where eps equals best_eps: averaged within each problem
# first, then across problems.
rows_at_best_eps = res.loc[res.eps == res.best_eps]
per_problem_best_eps = rows_at_best_eps.groupby('problem')['best_eps'].mean()
per_problem_best_eps.mean()

In [None]:
# Line plot of running-time fraction over generations, one facet per problem,
# restricted to the rows where eps equals best_eps.
best_eps_rows = res.loc[res.eps == res.best_eps, :]
g = sns.relplot(
    data=best_eps_rows,
    kind='line',
    x='g',
    y=runtime_fraction,
    col='problem',
    col_wrap=4,
    facet_kws={'sharex': False, 'legend_out': False},
)
# Called after the grid has been created, as in the original cell order.
sns.set(font_scale=1.7, style='whitegrid')
g.set_titles('{col_name}')
g.set_xlabels('generations')
save(plt.gcf(), 'gens_vs_runtime-fraction_best-eps_by_problem')

## comparison of new diversity metric to mean covariance

In [None]:
# Sample the same number of rows from every problem so that no problem
# dominates the plot; a problem with fewer than n rows contributes all of them
# (DataFrame.sample raises when asked for more rows than exist).
n = 1000
samples = []
for p in res.problem.unique():
    rows = res.loc[res.problem == p, :]
    samples.append(rows.sample(min(n, len(rows))))
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dfplt = pd.concat(samples) if samples else pd.DataFrame()

# Scatter of mean covariance against k%, coloured by eps, one facet per problem.
g = sns.relplot(
    kind='scatter',
    data=dfplt,
    col_order=res.problem.unique(),
    y='mean_cov',
    x='k%',
    hue='eps',
    col='problem',
    col_wrap=4,
    facet_kws=dict(sharey=False),
)
sns.set(font_scale=1.7, style='whitegrid')
g.set_titles('{col_name}')
# Raw string: '\e' is not a valid Python escape sequence.
g.set_xlabels(r'$\epsilon$-Cluster Similarity (% of N)')
g.set_ylabels('Mean Phenotype Covariance')

save(plt.gcf(), 'k-pct_vs_mean-cov_hue-eps-sampled')

In [None]:
# Sample the same number of rows from every problem so that no problem
# dominates the plot; a problem with fewer than n rows contributes all of them
# (DataFrame.sample raises when asked for more rows than exist).
n = 1000
samples = []
for p in res.problem.unique():
    rows = res.loc[res.problem == p, :]
    samples.append(rows.sample(min(n, len(rows))))
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
dfplt = pd.concat(samples) if samples else pd.DataFrame()

# 2-D histogram of mean covariance against k%, one facet per problem.
g = sns.displot(
    kind='hist',
    data=dfplt,
    col_order=res.problem.unique(),
    y='mean_cov',
    x='k%',
    col='problem',
    col_wrap=4,
    facet_kws=dict(sharey=False, legend_out=False),
    discrete=(False, False),
    bins=20,
)
sns.set(font_scale=1.7, style='whitegrid')
g.set_titles('{col_name}')
# Raw string: '\e' is not a valid Python escape sequence.
g.set_xlabels(r'$\epsilon$-Cluster Similarity (% of N)')
g.set_ylabels('Mean Phenotype Covariance')

## breakdown of the components of the running time bound by value of epsilon

- count-odds, double-letters pretty flat but higher epsilon possible

TODO:

- [x] plot separate runtime bounds terms (epsilon term, k term)
- [x] 4n/epsilon + 2kC (divided by worst case)
- [x] annotate with which term is dominating

In [None]:
# Add the two terms of the running-time bound as columns, each divided by the
# worst case N*C so they share a scale with the running-time fraction.
# res['N*C'] / res['N'] recovers the number of cases C.
res['$2k|C|$'] = (2*res['k']*res['N*C']/res['N'])/res['N*C']
# Raw string: '\e' is not a valid Python escape sequence. The resulting column
# name is character-for-character the same as before.
res[r'$4N/\epsilon$'] = (4*res['N']/res['eps'])/res['N*C']

In [None]:
from matplotlib.lines import Line2D

# Plot the two bound terms and the total running-time fraction against eps,
# one facet per problem, all three curves sharing each facet.
df_plt = res.rename(columns={runtime_fraction:'total'})
sns.set(font_scale=1, style='whitegrid')

g = sns.FacetGrid(
    data=df_plt,
    col='problem',
    col_wrap=4,
    sharey=True,
    sharex=True,
    legend_out=True,
)

# pointplot treats eps as categorical and draws the levels at integer
# positions; numeric levels are ordered ascending, so the tick labels are
# taken from the sorted levels rather than from order of appearance.
stride = 2
eps_levels = np.sort(res.eps.dropna().unique())
xticks = np.arange(len(eps_levels))[::stride]
xticklabels = [str(e) for e in eps_levels[::stride]]

# (column, colour) for each curve. Raw strings: '\e' is not a valid escape.
lines = [('$2k|C|$', 'r'), (r'$4N/\epsilon$', 'orange'), ('total', 'b')]

for y, c in lines:
    g.map_dataframe(sns.pointplot, x='eps', y=y, color=c)

g.add_legend()
g.set_titles('{col_name}')

for i, ax in enumerate(g.axes.flat):
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels, fontsize=10)
    if i > 2:
        ax.set_xlabel(r'$\epsilon$')
    # Axes.is_first_col() was removed from matplotlib; ask the SubplotSpec.
    if ax.get_subplotspec().is_first_col():
        ax.set_ylabel(runtime_fraction)
    ax.set_title(ax.get_title().replace('problem = ', ''))

# Not used in this cell; kept in case cells outside this view rely on it.
cmap = plt.cm.coolwarm

# The grid has no hue variable, so build the legend by hand: one proxy line
# per curve, attached to the first facet.
custom_lines = [Line2D([0], [0], color=c, lw=4) for _, c in lines]
labels = [y for y, _ in lines]
g.axes[0].legend(custom_lines, labels)

save(plt.gcf(), 'eps_running-time-portions')

In [None]:
# Line plot of best_eps over generations, one coloured line per problem.
sns.relplot(
    data=res,
    kind='line',
    x='g',
    y='best_eps',
    hue='problem',
    marker='',
    facet_kws={'sharey': False, 'legend_out': False},
)
save(plt.gcf(), 'gens_vs_best-eps_hue-problem')

In [None]:
# Scatter of clique time against eps, coloured by k, one facet per problem.
# The grid is bound to `g` so the title call below acts on THIS figure rather
# than on whatever grid an earlier cell left in `g`.
g = sns.relplot(
    kind='scatter',
    data=res,
    y='clique time',
    x='eps',
    hue='k',
    col='problem',
    col_wrap=4,
)
g.set_titles('{col_name}')


## load empirical nevals

In [None]:
# Load the empirical evaluation counts, add an integer copy of `generation`
# as `g`, and keep only the rows with N == 1000.
nevals = (
    pd.read_parquet('data/empirical_num_evals.parquet')
    .assign(g=lambda d: d['generation'].astype(int))
    .loc[lambda d: d.N == 1000]
)

In [None]:
# Show the dtype of each column of nevals.
nevals.dtypes

In [None]:
# Build one long table of evaluation counts from three sources so they can be
# compared on a single plot. `.assign` returns new frames, which avoids
# writing a column onto a slice of `res` / `nevals` (chained assignment).

# Theoretical bound.
df = (
    res[['problem', 'g', 'runtime bound', 'trial']]
    .assign(source='theory')
    .rename(columns={'runtime bound': 'n_evals'})
)

# Worst case N*C.
df_worst = (
    res[['problem', 'g', 'N*C', 'trial']]
    .assign(source='worst case')
    .rename(columns={'N*C': 'n_evals'})
)

# Empirical counts; `nevals` keeps its new `source` column, as before.
nevals = nevals.assign(source='data')

combo = pd.concat((df, df_worst, nevals[['problem','trial','g','n_evals','source']]))
combo

In [None]:
# Line plot of evaluation counts over generations on a log y axis, one line
# per source and one facet per problem.
sns.set_style('whitegrid')
g = sns.relplot(
    data=combo,
    kind='line',
    x='g',
    y='n_evals',
    hue='source',
    col='problem',
    col_wrap=4,
    facet_kws={'sharex': False, 'legend_out': False},
)
g.set_titles('{col_name}')
g.set(yscale='log')
g.set(ylabel='# of Evals per Selection Event')

save(plt.gcf(), 'num_evals_comparison')

## plot k over generations, colored by epsilon

In [None]:
# Line plot of k% over generations, coloured by eps, one facet per problem.
g = sns.relplot(
    kind='line',
    data=res,
    x='g',
    y='k%',
    hue='eps',
    col='problem',
    col_wrap=4,
    facet_kws={'sharex': False},
)
# Label through the grid: plt.ylabel only touches the current (last) axes,
# leaving the other facets with the raw column name.
g.set_ylabels('k (% Population Size)')
save(plt.gcf(), 'gens_vs_k-pct_by_problem')

## plot runtime over generations, colored by epsilon

In [None]:
# Line plot of running-time fraction over generations, coloured by eps,
# one facet per problem.
sns.relplot(
    data=res,
    kind='line',
    x='g',
    y=runtime_fraction,
    hue='eps',
    col='problem',
    col_wrap=4,
    facet_kws=dict(sharex=False),
)
save(plt.gcf(), 'gens_vs_runtime-fraction_hue-eps_by_problem')

# runtime fraction for different values of epsilon

In [None]:
# Point plot of running-time fraction at each eps value, pooled over all
# problems; every column of res is rounded to two decimals first.
res_rounded = res.round(2)
sns.pointplot(
    data=res_rounded,
    x='eps',
    y=runtime_fraction,
    markers='',
)

# k for different values of epsilon

In [None]:
# Point plot of k at each eps value, one facet per problem; every column of
# res is rounded to two decimals first.
res_rounded = res.round(2)
sns.catplot(
    data=res_rounded,
    kind='point',
    x='eps',
    y='k',
    markers='',
    col='problem',
    col_wrap=3,
)
