# Data Extraction

In [1]:
import pandas as pd

def build_path(seed, dataset, data_path='../../data/xdpole'):
    """Return the CSV path of a single run.

    Args:
        seed: Run identifier used as the file-name prefix (e.g. ``'s6'``).
        dataset: Sub-directory below ``data_path``; may be nested
            (e.g. ``'manager/specialist_sp_g10'``).
        data_path: Root directory of the data. Defaults to the location that
            used to be hard-coded, so existing two-argument calls are unchanged.

    Returns:
        The string ``'{data_path}/{dataset}/{seed}_run.csv'``.
    """
    return f'{data_path}/{dataset}/{seed}_run.csv'

# Runs to load: seeds s6..s10, each evaluated with batch sizes 10, 20 and 30.
seeds = [f's{i}' for i in range(6,11)]
batch_sizes = list(range(10, 40, 10))

# base_conditions is keyed by seed; the other two are keyed by batch size,
# then by seed.
base_conditions = {}
all_stats = {batch_size: {} for batch_size in batch_sizes}
predicted_conditions = {batch_size: {} for batch_size in batch_sizes}

for seed in seeds:
    # One base-conditions file per seed, shared by every batch size.
    base_conditions_path = build_path(seed, 'baseconditions')
    base_conditions[seed] = pd.read_csv(base_conditions_path)

    for batch_size in batch_sizes:
        # Manager statistics of the specialist trained with this batch size.
        stats_path = build_path(seed, f'manager/specialist_sp_g{batch_size}')
        all_stats[batch_size][seed] = pd.read_csv(stats_path)

        # Base conditions predicted by that same specialist.
        predicted_conditions_path = build_path(seed, f'predictedbaseconditions/specialist_sp_g{batch_size}')
        predicted_conditions[batch_size][seed] = pd.read_csv(predicted_conditions_path)


# Data Transformation

In [2]:
import numpy as np
import warnings
# Suppress every warning for the rest of the kernel session (no category or
# module restriction is given).
# NOTE(review): this also hides NumPy/pandas deprecation and runtime warnings
# raised by later cells -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

def get_column_mean(column, data):
    """Average one column row-by-row across several runs.

    Args:
        column: Name of the column to average.
        data: Mapping whose values are DataFrames that all contain ``column``
            (the keys are not used).

    Returns:
        With more than one frame: a 1-D NumPy array holding the element-wise
        mean of ``column``, restricted to the rows for which *every* frame has
        a non-NaN value (so runs of different length are averaged over the
        rows they share). With exactly one frame: that frame's column,
        returned unchanged as a Series.

    Raises:
        IndexError: If ``data`` is empty.
    """
    columns = [df[column] for df in data.values()]
    if len(columns) > 1:
        # Align the runs side by side on their index. Calling np.mean() on the
        # list directly reduces equal-length runs to a single scalar and, for
        # ragged runs, depends on object-array creation that recent NumPy
        # versions reject.
        aligned = pd.concat(columns, axis=1, ignore_index=True)
        # skipna=False leaves NaN wherever at least one run has no value, so
        # dropna() keeps only the rows covered by all runs.
        row_mean = aligned.mean(axis=1, skipna=False).dropna()
        return row_mean.to_numpy()
    else:
        return columns[0]

# Graphs

In [3]:
import plotly.graph_objects as go
import seaborn as sns

# NOTE(review): the value returned by this call is neither assigned nor used
# anywhere in the visible cells, and the figures below are built with plotly --
# confirm whether this line is still needed.
sns.color_palette("mako", as_cmap=True)

def score_graph(dfs, batches, threshold=0.7):
    """Show the mean specialist score per generation, one line per batch.

    Args:
        dfs: Mapping from batch to an object exposing ``gen`` and ``score``
            (looked up with ``dfs.get``).
        batches: Batches to draw; the first one supplies the x-range of the
            threshold band.
        threshold: Height of the filled "Threshold" area.

    The figure title embeds the module-level ``seeds`` list. The figure is
    displayed with ``show()``; nothing is returned.
    """
    figure = go.Figure(
        layout=go.Layout(title=f'Specialist Score X Generation {seeds}')
    )

    # Filled band from zero up to the threshold, spanning the generations of
    # the first batch.
    reference_gen = dfs.get(batches[0]).gen
    threshold_band = go.Scatter(
        x=reference_gen,
        y=[threshold] * len(reference_gen),
        fill='tozeroy',
        name='Threshold',
        mode='none',
    )
    figure.add_trace(threshold_band)

    for batch in batches:
        frame = dfs.get(batch)
        score_line = go.Scatter(
            x=frame.gen,
            y=frame.score,
            mode='lines',
            name=f'Specialist Score Mean [{batch}]',
        )
        figure.add_trace(score_line)

    figure.update_xaxes(title_text='Generation')
    figure.update_yaxes(title_text='Score')
    figure.show()


def metrics_graph(dfs, batches, metrics=['accuracy', 'recall', 'precision', 'f1_score']):
    """Show one figure per metric, each with one line per batch.

    Args:
        dfs: Mapping from batch to an object exposing ``gen`` and supporting
            ``[metric]`` lookup (looked up with ``dfs.get``).
        batches: Batches to draw in every figure.
        metrics: Names of the metrics to plot; the default list is only
            iterated, never modified.

    Each figure title embeds the module-level ``seeds`` list. Figures are
    displayed with ``show()``; nothing is returned.
    """
    for metric in metrics:
        figure = go.Figure(
            layout=go.Layout(title=f'Specialist {metric} X Generation {seeds}')
        )

        for batch in batches:
            frame = dfs.get(batch)
            metric_line = go.Scatter(
                x=frame.gen,
                y=frame[metric],
                mode='lines',
                name=f'Specialist {metric} Mean [{batch}]',
            )
            figure.add_trace(metric_line)

        figure.update_xaxes(title_text='Generation')
        figure.update_yaxes(title_text=metric)
        figure.show()


In [4]:
# Per-batch frames consumed by score_graph() and metrics_graph().
score_dfs = {}
metrics_dfs = {}

for batch, stats in all_stats.items():
    # Mean specialist score per generation, averaged over the seeds.
    gen = get_column_mean('gen', stats)
    score = get_column_mean('specialist_score', stats)
    df_score = pd.DataFrame({'score': score, 'gen': gen})
    score_dfs[batch] = df_score

    # Mean confusion-matrix counts per generation. Everything is cut to the
    # length of the true-positive column, as the element-wise loop used to do.
    tp = np.asarray(get_column_mean('cm_true_positive', stats), dtype=float)
    n_gen = len(tp)
    fp = np.asarray(get_column_mean('cm_false_positive', stats), dtype=float)[:n_gen]
    tn = np.asarray(get_column_mean('cm_true_negative', stats), dtype=float)[:n_gen]
    fn = np.asarray(get_column_mean('cm_false_negative', stats), dtype=float)[:n_gen]

    # A zero denominator (e.g. no predicted positives) yields NaN for that
    # generation. errstate makes this explicit rather than relying on a
    # global warning filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        accuracy = (tp + tn) / (tp + fp + fn + tn)
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1_score = 2 * (precision * recall) / (precision + recall)

    df_metrics = pd.DataFrame({
        'gen': np.asarray(gen)[:n_gen],
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
    })
    metrics_dfs[batch] = df_metrics


score_graph(score_dfs, batch_sizes)
metrics_graph(metrics_dfs, batch_sizes)