# 3) Analyze optimization results and create runs.pkl

After running the model optimizations with the `run_optimizations.py` script, this notebook checks the optimization outputs and creates a summary file called `runs.pkl` in the `results` folder.

The `runs.pkl` file is used in notebook 3 to compare the optimization outputs.

In [None]:
import matplotlib.pyplot as plt
import numpy
import os
import pickle
import glob

from pathlib import Path

In [None]:
# Folder that receives the summary file written at the end of the notebook.
results_folder = Path("results")

In [None]:
def load_checkp(path):
    """Load one optimization checkpoint and summarize it as a dict.

    The feature set and the seed are parsed from the file name: the last
    four characters (the extension) are dropped and the rest is split on
    ``_``. The first token is the feature set and the first character of
    the last token is the seed. The sample id is the name of the parent
    folder with a leading ``random_`` prefix removed.

    Parameters
    ----------
    path : str
        Path to a pickled checkpoint holding the keys ``logbook``,
        ``population`` and ``halloffame``.

    Returns
    -------
    dict or None
        Summary with the keys ``nevals`` (cumulative sum of the logbook
        ``nevals`` column), ``population``, ``hof``, ``logbook``,
        ``sample_id``, ``seed``, ``feature_set``, ``best_fitness`` (sum of
        the fitness values of the first hall-of-fame entry),
        ``best_scores``, ``best_params`` and ``path``. ``None`` is returned
        when the checkpoint cannot be read or does not have the expected
        content.
    """
    try:
        # File name without its 4-character extension, split into tokens.
        name_tokens = os.path.basename(path)[:-4].split('_')
        feature_set = name_tokens[0]
        # NOTE(review): only the first character of the last token is kept,
        # so a multi-digit seed would be truncated - confirm that seeds are
        # always single digits.
        seed = name_tokens[-1][:1]

        # Name of the folder containing the checkpoint. os.path is used
        # instead of splitting on "/" so the platform separator is handled.
        sample_id = os.path.basename(os.path.dirname(path))
        # Remove the prefix explicitly: str.strip("random_") would remove
        # any of those characters from both ends of the name.
        prefix = "random_"
        if sample_id.startswith(prefix):
            sample_id = sample_id[len(prefix):]

        # SECURITY: unpickling can execute arbitrary code; only load
        # checkpoints produced by a trusted optimization run.
        with open(path, 'rb') as fp:
            checkpoint = pickle.load(fp)

        logbook = checkpoint['logbook']
        hall_of_fame = checkpoint['halloffame']
        best = hall_of_fame[0]
        best_scores = best.fitness.values

        return {"nevals": numpy.cumsum(logbook.select("nevals")),
                "population": checkpoint['population'],
                "hof": hall_of_fame,
                "logbook": logbook,
                "sample_id": sample_id,
                "seed": seed,
                "feature_set": feature_set,
                "best_fitness": numpy.sum(best_scores),
                "best_scores": list(best_scores),
                "best_params": list(best),
                "path": path}

    except Exception:
        # Best effort: an unreadable or malformed checkpoint is reported to
        # the caller as None, while KeyboardInterrupt/SystemExit propagate.
        return None

In [None]:
# Collect a summary for every checkpoint pickle found below ./checkpoints,
# reporting the files that could not be loaded.
runs = []
for checkpoint_path in glob.glob('./checkpoints/**/*.pkl', recursive=True):
    loaded = load_checkp(checkpoint_path)
    if not loaded:
        print("Failed to read ", checkpoint_path)
        continue
    print(checkpoint_path)
    runs.append(loaded)

In [None]:
# Unique sample ids found among the loaded runs.
ids = set(run["sample_id"] for run in runs)
# One matplotlib colour-cycle entry ("C0", "C1", ...) per sample id. The
# index is the position of the id in the sorted ids rather than the id
# itself, so ids that are not integer strings still get a valid colour and
# the mapping does not depend on set iteration order.
colors = {sample_id: "C{}".format(index)
          for index, sample_id in enumerate(sorted(ids))}
# Fixed colour for each feature set.
colors_set = {"extra": "C0", "bap": "C1", "soma": "C2"}

In [None]:
# Convergence of every loaded run, coloured by sample id. The dashed line is
# the logbook "min" column against the cumulative number of evaluations and
# the marker is the summed fitness of the first hall-of-fame entry.
fig, ax = plt.subplots(figsize=(10, 10))

# Only the first run of each sample gets a label, so the legend shows one
# entry per sample. Without any labelled artist the legend would be empty.
labelled_samples = set()

for run in runs:
    sample_id = run["sample_id"]
    if sample_id in labelled_samples:
        label = "_nolegend_"
    else:
        label = "sample {}".format(sample_id)
        labelled_samples.add(sample_id)

    ax.plot(run["nevals"],
            run["logbook"].select("min"),
            color=colors[sample_id],
            ls='--',
            lw=0.5,
            alpha=0.75,
            label=label)

    ax.scatter([run["nevals"][-1]],
               [numpy.sum(run["hof"][0].fitness.values)],
               color=colors[sample_id],
               alpha=0.75)

ax.set_xlabel("Number of evaluations", size="x-large")
ax.set_ylabel("Minimum fitness (std)", size="x-large")

ax.set_yscale("log")
ax.legend()

In [None]:
# Convergence of every loaded run, coloured by feature set. The dashed line
# is the logbook "min" column against the cumulative number of evaluations
# and the marker is the summed fitness of the first hall-of-fame entry.
fig, ax = plt.subplots(figsize=(10, 10))

# Only the first run of each feature set gets a label, so the legend shows
# one entry per feature set. Without any labelled artist the legend would be
# empty.
labelled_sets = set()

for run in runs:
    feature_set = run["feature_set"]
    if feature_set in labelled_sets:
        label = "_nolegend_"
    else:
        label = feature_set
        labelled_sets.add(feature_set)

    ax.plot(run["nevals"],
            run["logbook"].select("min"),
            color=colors_set[feature_set],
            ls='--',
            lw=0.5,
            alpha=0.75,
            label=label)

    ax.scatter([run["nevals"][-1]],
               [numpy.sum(run["hof"][0].fitness.values)],
               color=colors_set[feature_set],
               alpha=0.75)

ax.set_xlabel("Number of evaluations", size="x-large")
ax.set_ylabel("Minimum fitness (std)", size="x-large")

ax.set_yscale("log")
#ax.set_ylim(5, 20)
ax.legend()

In [None]:
# For every (feature set, sample) pair keep the runs with the lowest summed
# fitness and print the mean fitness of all runs and of the kept runs.
n_best = 5  # number of runs kept per (feature set, sample) pair

runs_selected = []
for feature_set in ['extra', 'bap', 'soma']:

    for sample_id in ["0", "1", "2", "3", "4"]:

        run_set = [run for run in runs
                   if run['feature_set'] == feature_set
                   and run['sample_id'] == sample_id]

        if not run_set:
            # Nothing to rank for this pair; report it and move on instead
            # of failing on the empty list.
            print(feature_set, sample_id, "no runs found")
            print()
            continue

        # Sort on the fitness alone. Sorting (fitness, run) tuples would fall
        # back to comparing the run dicts when two fitness values are equal,
        # which raises TypeError.
        run_set = sorted(run_set, key=lambda run: run['best_fitness'])
        avg = numpy.mean([run['best_fitness'] for run in run_set])

        run_selected = run_set[:n_best]
        runs_selected += run_selected

        print(feature_set, sample_id, avg, len(run_set))
        print(numpy.mean([run['best_fitness'] for run in run_selected]))
        print()

In [None]:
# Convergence of the selected runs only, coloured by feature set. The dashed
# line is the logbook "min" column against the cumulative number of
# evaluations and the marker is the summed fitness of the first
# hall-of-fame entry.
fig, ax = plt.subplots(figsize=(10, 10))

# Only the first run of each feature set gets a label, so the legend shows
# one entry per feature set. Without any labelled artist the legend would be
# empty.
labelled_sets = set()

for run in runs_selected:
    feature_set = run["feature_set"]
    if feature_set in labelled_sets:
        label = "_nolegend_"
    else:
        label = feature_set
        labelled_sets.add(feature_set)

    ax.plot(run["nevals"],
            run["logbook"].select("min"),
            color=colors_set[feature_set],
            ls='--',
            lw=0.5,
            alpha=0.75,
            label=label)

    ax.scatter([run["nevals"][-1]],
               [numpy.sum(run["hof"][0].fitness.values)],
               color=colors_set[feature_set],
               alpha=0.75)

ax.set_xlabel("Number of evaluations", size="x-large")
ax.set_ylabel("Minimum fitness (std)", size="x-large")

ax.set_yscale("log")
ax.legend()

In [None]:
# One panel per sample id: a horizontal bar per selected run, ordered by
# best fitness, coloured by feature set and labelled with the run's seed.
# squeeze=False always returns a 2-D array of axes, so indexing also works
# when there is a single sample id.
fig, ax = plt.subplots(1, len(ids), figsize=(20, 8), squeeze=False)
ax = ax[0]

# Sorted so the panel order is the same on every execution (set iteration
# order is not).
for i, sample_id in enumerate(sorted(ids)):

    ordered_runs = sorted(
        (run for run in runs_selected if run['sample_id'] == sample_id),
        key=lambda run: run['best_fitness'])

    labels = [run['seed'] for run in ordered_runs]
    ytick_pos = list(range(len(ordered_runs)))

    for pos, run in zip(ytick_pos, ordered_runs):
        ax[i].barh([pos],
                   [run['best_fitness']],
                   height=0.5,
                   align='center',
                   color=colors_set[run['feature_set']],
                   alpha=0.8)

    ax[i].set_yticks(ytick_pos)
    ax[i].set_yticklabels(labels, size='large')
    ax[i].set_xlabel("Fitness", size="x-large")
    ax[i].set_ylim(-1, len(labels))
    ax[i].set_title(sample_id)

plt.tight_layout()

In [None]:
# Write the selected runs to <results_folder>/runs.pkl. The folder is created
# first because open() fails if it does not exist.
results_folder.mkdir(parents=True, exist_ok=True)
with open(str(results_folder / "runs.pkl"), 'wb') as fp:
    pickle.dump(runs_selected, fp)