In [None]:
import os
import re

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Print the seaborn and pandas versions in use so the run can be reproduced.
print(sns.__version__)
print(pd.__version__)

In [None]:
# Global display configuration applied to every figure and table below.
plt.rcParams['figure.dpi'] = 900  # figure resolution in dots per inch
# set_theme is the preferred entry point; sns.set is only a legacy alias for it.
sns.set_theme(style="whitegrid")
pd.set_option('display.precision', 3)  # digits after the decimal point in DataFrame output

In [None]:
# Values stored in the solution column: collect_progress_results maps the raw
# True/False flag read from the logs to these two labels.
PIPELINE_LABEL = "pipeline"
FIXED_SIZE_LABEL = "fixed-size"
# Column names assigned to the progress-log CSVs when they are read.
DATASET_LABEL = "dataset"
FITNESS_LABEL = "fitness"
INVOCATIONS_LABEL = "iterations"
SOLUTION_LABEL = "solution"
IMPROVEMENT_LABEL = "Improvement, %"  # not referenced elsewhere in the visible cells
# Summary columns: fitness at the row with the fewest / the most iterations of a run.
MIN_FITNESS_LABEL = "Start fitness"
MAX_FITNESS_LABEL = "Final fitness"
# Column holding the "_Surrogate" / "" suffix; only created when with_surrogate is set.
SURROGATE_LABEL = "Surrogate"

# Human-readable titles for the raw dataset identifiers used in the dataset column.
DATASET_MAPPING = {
    "20newsgroups_sample": "20News Groups",
    "amazon_food_sample": "Amazon Food",
    "banners_sample": "Banners",
    "hotel-reviews_sample": "Hotel Reviews",
    "lenta_ru_sample": "Lenta.ru",
}


In [None]:
# Notebook-wide switch read by the loading and plotting functions below.
# False: log files whose path contains "_surrogate_" are dropped.
# True: every log is kept, tagged with a surrogate suffix, and saved plot
# filenames get a "_surrogate" suffix.
with_surrogate = False

In [None]:
def plot_progress(dataset, df):
    """Plot fitness against iterations for one dataset and save the figure as a PNG.

    One line is drawn per solution label (concatenated with the surrogate suffix
    when the module-level ``with_surrogate`` flag is true) with a 90% confidence
    interval. The figure is written to ``plot_progress_{dataset}[_surrogate].png``
    in the current working directory and then closed.

    Args:
        dataset: Raw dataset identifier. Used verbatim in the output filename and
            looked up in ``DATASET_MAPPING`` for the title; identifiers missing
            from the mapping are shown as-is instead of raising ``KeyError``.
        df: Progress rows for that dataset. Must contain the ``SOLUTION_LABEL``,
            ``INVOCATIONS_LABEL`` and ``FITNESS_LABEL`` columns, plus
            ``SURROGATE_LABEL`` when ``with_surrogate`` is true. The caller's
            frame is not modified.
    """
    df = df.copy().reset_index()
    df["hue"] = df[SOLUTION_LABEL] + df[SURROGATE_LABEL] if with_surrogate else df[SOLUTION_LABEL]
    # Fixed, sorted hue order keeps colours consistent between datasets.
    order = sorted(df["hue"].unique())

    # Draw on an explicit figure so nothing leaks into (or from) pyplot's implicit
    # "current figure" state between calls.
    fig, ax = plt.subplots()
    try:
        sns.lineplot(data=df, x=INVOCATIONS_LABEL, y=FITNESS_LABEL, hue="hue",
                     errorbar=('ci', 90), markers=True, hue_order=order, ax=ax)
        ax.legend(loc='lower right')
        dataset_title = DATASET_MAPPING.get(dataset, dataset)
        ax.set_title(f"{dataset_title} Training Progress")
        s = "_surrogate" if with_surrogate else ""
        fig.savefig(f"plot_progress_{dataset}{s}.png")
    finally:
        # Release the figure even if plotting or saving fails; at 900 dpi each
        # open figure is expensive to keep around.
        plt.close(fig)

In [None]:
def collect_progress_results(base_dir="statistics"):
    """Load the ``*_progress.txt`` logs found in ``base_dir`` and summarise each run.

    Every log is read as a header-less CSV with the columns dataset, solution,
    iterations and fitness. The boolean solution flag is replaced by
    ``PIPELINE_LABEL`` / ``FIXED_SIZE_LABEL``. Logs whose file name contains
    ``"_surrogate_"`` are skipped unless the module-level ``with_surrogate`` flag
    is true, in which case every log is kept and tagged in ``SURROGATE_LABEL``.

    Args:
        base_dir: Directory that holds the log files. Defaults to ``"statistics"``.

    Returns:
        A ``(progress_df, summary_df)`` tuple. ``progress_df`` is the row-wise
        concatenation of all kept logs. ``summary_df`` has one row per log with
        the dataset, the solution label, and the fitness at the rows with the
        fewest (``MIN_FITNESS_LABEL``) and the most (``MAX_FITNESS_LABEL``)
        iterations.

    Raises:
        ValueError: If no matching log file is left after filtering.
        AssertionError: If the first and last rows of a log disagree on the
            dataset or the solution.
    """
    # Iteration bounds every complete run is expected to have; deviations are
    # reported but not treated as errors.
    expected_first, expected_last = 11, 150

    # Sorted, because os.listdir returns entries in arbitrary order and the row
    # order of the result should be reproducible. endswith() replaces the old
    # regex, whose unescaped "." and missing end anchor also accepted names such
    # as "x_progress_txt.bak".
    log_files = sorted(
        os.path.join(base_dir, file)
        for file in os.listdir(base_dir)
        if file.endswith("_progress.txt")
    )
    names = [DATASET_LABEL, SOLUTION_LABEL, INVOCATIONS_LABEL, FITNESS_LABEL]

    dfs = []
    for log_file in log_files:
        # Look at the file name only, so the directory part cannot influence it.
        is_surrogate = "_surrogate_" in os.path.basename(log_file)
        if is_surrogate and not with_surrogate:
            continue  # skip before parsing: the rows would be discarded anyway
        df = pd.read_csv(log_file, header=None, names=names)
        df[SOLUTION_LABEL] = df[SOLUTION_LABEL].map({True: PIPELINE_LABEL, False: FIXED_SIZE_LABEL})
        if with_surrogate:
            df[SURROGATE_LABEL] = "_Surrogate" if is_surrogate else ""
        dfs.append(df)

    if not dfs:
        raise ValueError(f"No progress logs found in {base_dir!r}")

    # dfs = [df for df in dfs if df[DATASET_LABEL].unique().tolist()[0] not in ["banners_sample", "hotel-reviews_sample"]]

    records = []
    for df in dfs:
        # idxmin/idxmax return index *labels*, so they must be resolved with .loc.
        mn = df.loc[df[INVOCATIONS_LABEL].idxmin()]
        mx = df.loc[df[INVOCATIONS_LABEL].idxmax()]
        assert mn[DATASET_LABEL] == mx[DATASET_LABEL] and mn[SOLUTION_LABEL] == mx[SOLUTION_LABEL]
        if mn[INVOCATIONS_LABEL] != expected_first or mx[INVOCATIONS_LABEL] != expected_last:
            print(f"Unexpected invocations: min={mn[INVOCATIONS_LABEL]} max={mx[INVOCATIONS_LABEL]}")
        solution = mn[SOLUTION_LABEL] + mn[SURROGATE_LABEL] if with_surrogate else mn[SOLUTION_LABEL]
        records.append({DATASET_LABEL: mn[DATASET_LABEL], SOLUTION_LABEL: solution,
                        MIN_FITNESS_LABEL: mn[FITNESS_LABEL], MAX_FITNESS_LABEL: mx[FITNESS_LABEL]})

    # Build the summary once from plain records instead of concatenating
    # one-row frames.
    summary_df = pd.DataFrame(
        records, columns=[DATASET_LABEL, SOLUTION_LABEL, MIN_FITNESS_LABEL, MAX_FITNESS_LABEL])
    return pd.concat(dfs, ignore_index=True), summary_df

In [None]:
# Load all progress logs: `df` holds every logged row, `summary_df` one row per log file.
df, summary_df = collect_progress_results()
df

In [None]:
# Per-log start/final fitness, one row per log file.
summary_df

In [None]:
# Mean start/final fitness for each (dataset, solution) pair.
summary_df.groupby([DATASET_LABEL, SOLUTION_LABEL]).mean().reset_index()

In [None]:
# Standard deviation of start/final fitness for each (dataset, solution) pair.
summary_df.groupby([DATASET_LABEL, SOLUTION_LABEL]).std().reset_index()

In [None]:
# Render and save one training-progress figure per dataset present in the loaded logs.
for dataset in df[DATASET_LABEL].unique():
    dataset_rows = df.loc[df[DATASET_LABEL] == dataset]
    plot_progress(dataset, dataset_rows)

In [None]:
def plot_results(df, name, column):
    """Draw a per-dataset box plot of one fitness column and save it as a PNG.

    Boxes are grouped by dataset on the x axis and coloured by solution label.
    The figure is written to ``plot_boxplot_{name}_results[_surrogate].png`` in
    the current working directory (the suffix follows the module-level
    ``with_surrogate`` flag) and then closed.

    Args:
        df: Summary frame containing ``DATASET_LABEL``, ``SOLUTION_LABEL`` and
            ``column``. The caller's frame is not modified.
        name: Short label used in the plot title and in the output filename.
        column: Name of the column in ``df`` whose distribution is plotted.
    """
    # Work on a private copy: the helper column must not leak into the caller's frame.
    df = df.copy()
    # Put each word of the display name on its own line so the tick labels stay
    # narrow; fall back to the raw identifier for datasets missing from the mapping.
    df["d"] = df[DATASET_LABEL].map(lambda x: DATASET_MAPPING.get(x, x).replace(' ', '\n'))
    order = sorted(df[SOLUTION_LABEL].unique())

    # Explicit figure/axes instead of pyplot's implicit current figure.
    fig, ax = plt.subplots()
    try:
        sns.boxplot(data=df, x="d", y=column, hue=SOLUTION_LABEL,
                    hue_order=order, ax=ax)
        ax.legend(loc='best')
        ax.set_xlabel(DATASET_LABEL)
        ax.set_ylabel(FITNESS_LABEL)
        ax.set_title(f"{name} solutions comparison")
        s = "_surrogate" if with_surrogate else ""
        fig.savefig(f"plot_boxplot_{name}_results{s}.png")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# One box plot for the start fitness and one for the final fitness; each call
# receives its own copy of the summary frame.
plot_results(summary_df.copy(), "Start", MIN_FITNESS_LABEL)
plot_results(summary_df.copy(), "Final", MAX_FITNESS_LABEL)

In [None]:
def group_by_type(df):
    """Group ``df`` by dataset and solution, keeping only the two fitness columns.

    Returns the (not yet aggregated) groupby selection over
    ``MIN_FITNESS_LABEL`` and ``MAX_FITNESS_LABEL``.
    """
    keys = [DATASET_LABEL, SOLUTION_LABEL]
    fitness_columns = [MIN_FITNESS_LABEL, MAX_FITNESS_LABEL]
    grouped = df.groupby(keys)
    return grouped[fitness_columns]

def split_by_type(df):
    """Split ``df`` into its fixed-size rows and its pipeline rows.

    Returns a ``(fixed_df, pipeline_df)`` tuple. Each part is re-indexed from
    zero; the previous index is kept as a regular column by ``reset_index()``.
    """
    def rows_for(solution):
        # Select the rows of one solution label and renumber them.
        selected = df[df[SOLUTION_LABEL] == solution]
        return selected.reset_index()

    return rows_for(FIXED_SIZE_LABEL), rows_for(PIPELINE_LABEL)

def to_percent(df):
    """Convert fractions to percentages rounded to one decimal place.

    ``df`` may be any object that supports ``* 100`` and whose product has a
    ``.round`` method (e.g. a pandas Series or DataFrame).
    """
    scaled = df * 100
    return scaled.round(1)

def find_improvement(df, fitness_column):
    """Return the relative gain of pipeline over fixed-size for one column, in percent.

    The frame is split by solution label and ``fitness_column`` of the pipeline
    part is divided by that of the fixed-size part. Both parts are re-indexed
    from zero by ``split_by_type``, so the division pairs rows by position.
    """
    fixed_df, pipeline_df = split_by_type(df)
    ratio = pipeline_df[fitness_column].div(fixed_df[fitness_column])
    return to_percent(ratio - 1)


def summary(df):
    """Build the per-dataset comparison table of fixed-size vs. pipeline solutions.

    Aggregates ``df`` per (dataset, solution) into means and standard
    deviations, then reports for each dataset the start and final fitness
    statistics, the relative improvement of pipeline over fixed-size, and how
    much each solution progressed from start to final fitness (all in percent,
    one decimal place).

    Raises:
        AssertionError: If the fixed-size and pipeline aggregates do not list
            the same datasets in the same order.
    """
    grouped = group_by_type(df)
    mean_df = grouped.mean().reset_index()
    std_df = grouped.std().reset_index()
    fixed_mean, pipeline_mean = split_by_type(mean_df)
    fixed_std, pipeline_std = split_by_type(std_df)

    start_gain = find_improvement(mean_df, MIN_FITNESS_LABEL)
    final_gain = find_improvement(mean_df, MAX_FITNESS_LABEL)
    # The columns below are paired by row position, so both halves must cover
    # the same datasets in the same order.
    assert fixed_mean[DATASET_LABEL].tolist() == pipeline_mean[DATASET_LABEL].tolist()

    def relative_progress(means):
        # Growth from start to final fitness, relative to the start value.
        start = means[MIN_FITNESS_LABEL]
        final = means[MAX_FITNESS_LABEL]
        return to_percent((final - start) / start)

    fixed_progress = relative_progress(fixed_mean)
    pipeline_progress = relative_progress(pipeline_mean)
    progress_gain = pipeline_progress - fixed_progress

    # Insertion order of this dict defines the column order of the result.
    report = {
        DATASET_LABEL: fixed_mean[DATASET_LABEL],
        "Start Improvement, %": start_gain,
        "Start Fixed Mean": fixed_mean[MIN_FITNESS_LABEL],
        "Start Pipeline Mean": pipeline_mean[MIN_FITNESS_LABEL],
        "Start Fixed Std": fixed_std[MIN_FITNESS_LABEL],
        "Start Pipeline Std": pipeline_std[MIN_FITNESS_LABEL],
        "Final Improvement, %": final_gain,
        "Final Fixed Mean": fixed_mean[MAX_FITNESS_LABEL],
        "Final Pipeline Mean": pipeline_mean[MAX_FITNESS_LABEL],
        "Final Fixed Std": fixed_std[MAX_FITNESS_LABEL],
        "Final Pipeline Std": pipeline_std[MAX_FITNESS_LABEL],
        "Progress Improvement, %": progress_gain,
        "Progress Fixed, %": fixed_progress,
        "Progress Pipeline, %": pipeline_progress,
    }
    return pd.DataFrame(data=report)

In [None]:
# Full comparison table: one row per dataset, fixed-size vs. pipeline statistics.
stats_df = summary(summary_df)
stats_df

In [None]:
# Start-of-run view: the dataset column plus every "Start ..." statistic.
# The filter uses the same "Start " prefix (with the trailing space) that is
# stripped below, matching the Final/Progress cells.
columns = [DATASET_LABEL] + [column for column in stats_df.columns if column.startswith("Start ")]
start_df = stats_df[columns]
# Start improvement averaged over all datasets, in percent.
print(start_df["Start Improvement, %"].mean().round(1))
# Drop the shared prefix so the displayed headers are shorter.
start_df.rename(lambda x: x.removeprefix("Start "), axis='columns')

In [None]:
# End-of-run view: the dataset column plus every "Final ..." statistic.
# Stored under its own name so it no longer overwrites the start-of-run frame.
columns = [DATASET_LABEL] + [column for column in stats_df.columns if column.startswith("Final ")]
final_df = stats_df[columns]
# Final improvement averaged over all datasets, in percent.
print(final_df["Final Improvement, %"].mean().round(1))
# Drop the shared prefix so the displayed headers are shorter.
final_df.rename(lambda x: x.removeprefix("Final "), axis='columns')

In [None]:
# Progress view: the dataset column plus every "Progress ..." statistic.
# Stored under its own name rather than reusing the name of the start-of-run frame.
columns = [DATASET_LABEL] + [column for column in stats_df.columns if column.startswith("Progress ")]
progress_df = stats_df[columns]
# Progress improvement averaged over all datasets, in percentage points.
print(progress_df["Progress Improvement, %"].mean().round(1))
# Drop the shared prefix so the displayed headers are shorter.
progress_df.rename(lambda x: x.removeprefix("Progress "), axis='columns')