In [None]:
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Raise the default figure resolution to 300 DPI.
plt.rcParams['figure.dpi'] = 300
# `sns.set` is only an alias of `set_theme`; call the documented entry point directly.
sns.set_theme(style="whitegrid")
# Print floating point values in DataFrames with three decimal places.
pd.set_option('display.precision', 3)

In [None]:
# Values of the "solution" column: the two kinds of runs being compared.
PIPELINE_LABEL = "pipeline"
FIXED_SIZE_LABEL = "fixed-size"

# Column names shared by every DataFrame and plot in this notebook.
# DATASET_LABEL used to be spelled "dateset", which leaked into the
# headers of the displayed summary tables.
DATASET_LABEL = "dataset"
FITNESS_LABEL = "fitness"
INVOCATIONS_LABEL = "ARTM invocations"
SOLUTION_LABEL = "solution"
IMPROVEMENT_LABEL = "Improvement, %"

# Dataset key as written in the log files -> human readable name used in
# plot titles and tables.
DATASET_MAPPING = {
    "20newsgroups_sample": "20News Groups",
    "amazon_food_sample": "Amazon Food",
    "banners_sample": "Banners",
    "hotel-reviews_sample": "Hotel Reviews",
    "lenta_ru_sample": "Lenta.ru",
}

In [None]:
def replace_with_max(array):
    """Return the running maximum of `array` as a plain Python list.

    Every position holds the largest value seen up to and including that
    position (accumulated along the first axis), so the result never
    decreases.
    """
    return np.maximum.accumulate(np.asarray(array)).tolist()


def transform_to_invocations(iterations, fitness, max_invocation=150):
    """Expand sparse checkpoints into one fitness value per invocation.

    `iterations[k]` is the invocation number at which `fitness[k]` was
    recorded. The result covers every invocation from `iterations[0]` to
    `iterations[-1]`; each invocation between two checkpoints carries the
    fitness of the earlier checkpoint, and the last invocation carries
    `fitness[-1]`. The result is then cut so that no invocation number
    greater than `max_invocation` is returned.

    Args:
        iterations: Invocation numbers of the checkpoints.
        fitness: Fitness recorded at each checkpoint.
        max_invocation: Largest invocation number to keep (default 150).

    Returns:
        A ``(invocations, fitness)`` pair of equally long lists. Both are
        empty when `iterations` is empty or when the run starts after
        `max_invocation`.
    """
    if len(iterations) == 0:
        return [], []

    min_inv = iterations[0]
    invocations = list(range(min_inv, iterations[-1] + 1))
    new_fitness = [0] * len(invocations)
    for i in range(len(iterations) - 1):
        # Positions are offsets from the first invocation of the run.
        start = iterations[i] - min_inv
        stop = iterations[i + 1] - min_inv
        for j in range(start, stop):
            new_fitness[j] = fitness[i]
    new_fitness[-1] = fitness[-1]

    # Number of invocations in [min_inv, max_invocation]. Clamp at zero: a
    # negative value used as a slice bound would drop elements from the END
    # of the run instead of returning nothing.
    cut_length = max(0, min(len(invocations), max_invocation - min_inv + 1))
    return invocations[:cut_length], new_fitness[:cut_length]

In [None]:
def load_file(log_file, datasets):
    """Parse one run log and append every run it contains to `datasets`.

    Each non-blank line is split on the commas that lie outside ``[...]``
    brackets. Of the resulting fields:

    * field 0 is not used;
    * field 1 is a key of ``DATASET_MAPPING`` (translated to its display name);
    * field 2 is the literal ``True`` or ``False`` and selects
      ``PIPELINE_LABEL`` or ``FIXED_SIZE_LABEL`` respectively;
    * field 3 contains the integer iteration numbers;
    * field 4 contains the fitness values, which are turned into a running
      maximum before being stored.

    Args:
        log_file: Path of the log file to read.
        datasets: Mapping from dataset display name to a list; one
            ``(solution, iterations, fitness)`` tuple is appended per line.
            It must create missing keys on access (e.g. ``defaultdict(list)``).

    Raises:
        KeyError: If field 1 is not a key of ``DATASET_MAPPING``.
        AssertionError: If field 2 is neither ``True`` nor ``False``.
    """
    # Raw strings keep the regex backslashes intact (a plain string literal
    # with "\[" is an invalid escape sequence and triggers a warning).
    field_separator = r',(?![^\[]*\])'
    # Signed decimals with an optional exponent, so "-0.5" and "1e-05" are
    # each read as one number instead of losing the sign or splitting in two.
    float_pattern = r'-?\d+\.?\d*(?:[eE][-+]?\d+)?'

    with open(log_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no run.
                continue
            parts = re.split(field_separator, line)
            dataset = DATASET_MAPPING[parts[1]]
            assert parts[2] in ("True", "False"), f"unexpected solution flag: {parts[2]!r}"
            solution = PIPELINE_LABEL if parts[2] == "True" else FIXED_SIZE_LABEL
            iterations = list(map(int, re.findall(r'\d+', parts[3])))
            fitness = list(map(float, re.findall(float_pattern, parts[4])))
            fitness = replace_with_max(fitness)
            datasets[dataset].append((solution, iterations, fitness))

In [None]:
def group_by_type(df):
    """Group `df` by (dataset, solution) and select the fitness column.

    Returns the pandas group-by object, ready for an aggregation such as
    ``.mean()`` or ``.std()``.
    """
    grouping_keys = [DATASET_LABEL, SOLUTION_LABEL]
    grouped = df.groupby(grouping_keys)
    return grouped[FITNESS_LABEL]


def plot_progress(dataset, df):
    """Plot fitness against ARTM invocations for one dataset and save it.

    One line is drawn per solution type, with a 90% confidence-interval
    error bar setting. The figure is written to
    ``plot_progress_<dataset key>.png`` (relative path), where the key is the
    ``DATASET_MAPPING`` entry whose display name equals `dataset`; the
    current figure is cleared afterwards.

    Args:
        dataset: Display name of the dataset (a value of ``DATASET_MAPPING``).
        df: Frame with the invocations, fitness and solution columns.

    Raises:
        StopIteration: If `dataset` is not a value of ``DATASET_MAPPING``.
    """
    # Alphabetical hue order keeps colours stable between datasets.
    hue_levels = sorted(df[SOLUTION_LABEL].unique())
    sns.lineplot(
        data=df,
        x=INVOCATIONS_LABEL,
        y=FITNESS_LABEL,
        hue=SOLUTION_LABEL,
        errorbar=('ci', 90),
        markers=True,
        hue_order=hue_levels,
    )
    plt.legend(loc='lower right')
    plt.title(f"{dataset} Training Progress")

    # Reverse lookup: display name -> key used in the log files.
    dataset_file_name = next(
        key for key, display_name in DATASET_MAPPING.items() if display_name == dataset
    )
    plt.savefig(f"plot_progress_{dataset_file_name}.png")
    plt.clf()


def collect_results(tuple_results):
    """Build a results frame from ``(dataset, solution, fitness)`` tuples.

    Spaces in the dataset name are replaced by newlines (presumably so long
    names wrap when used as axis tick labels). The returned frame has the
    dataset, solution and fitness columns, in that order, one row per tuple.
    """
    rows = list(tuple_results)
    columns = {
        DATASET_LABEL: [name.replace(" ", "\n") for name, _, _ in rows],
        SOLUTION_LABEL: [kind for _, kind, _ in rows],
        FITNESS_LABEL: [score for _, _, score in rows],
    }
    return pd.DataFrame(data=columns)


def plot_results(df, name):
    """Draw a per-dataset box plot of fitness, split by solution, and save it.

    The figure is written to ``plot_boxplot_<name>_results.png`` (relative
    path) and the current figure is cleared afterwards.

    Args:
        df: Frame with the dataset, solution and fitness columns.
        name: Text used both in the plot title and in the file name.
    """
    # Alphabetical hue order keeps colours stable between plots.
    hue_levels = sorted(df[SOLUTION_LABEL].unique())
    sns.boxplot(
        data=df,
        x=DATASET_LABEL,
        y=FITNESS_LABEL,
        hue=SOLUTION_LABEL,
        hue_order=hue_levels,
    )
    plt.legend(loc='best')
    # The dataset names on the ticks are self-explanatory; drop the axis label.
    plt.xlabel("")
    plt.title(f"{name} solutions comparison")
    plt.savefig(f"plot_boxplot_{name}_results.png")
    plt.clf()

In [None]:
# Collect the run logs from the working directory (names such as
# "log-240131-154500.txt").
# - fullmatch() requires the WHOLE name to fit the pattern, so leftovers like
#   "log-240131-154500.txt.bak" are no longer picked up.
# - os.listdir() returns entries in arbitrary order; sorting makes the load
#   order (and therefore the row order of every table and plot) reproducible.
files_in_directory = os.listdir()
log_files = sorted(file for file in files_in_directory
                   if re.fullmatch(r"log-\d{6}-\d{6}\.txt", file))

# Display name of the dataset -> list of (solution, iterations, fitness) runs.
datasets = defaultdict(list)
for log_file in log_files:
    load_file(log_file, datasets)

In [None]:
# (dataset, solution, fitness) of the first / last plotted invocation of every run.
first_results = []
final_results = []
for dataset in datasets:
    # Long-format columns for the progress plot of this dataset.
    solutions = []
    invocations = []
    fitness = []
    for solution, run_iterations, run_fitness in datasets[dataset]:
        run_invocations, run_fitness = transform_to_invocations(run_iterations, run_fitness)
        if not run_invocations:
            # Nothing of this run lies inside the plotted window; indexing
            # [0] / [-1] below would raise IndexError.
            print(f"Skipping a {solution} run of {dataset}: no invocations to plot")
            continue
        invocations += run_invocations
        fitness += run_fitness
        solutions += [solution] * len(run_invocations)
        first_results.append((dataset, solution, run_fitness[0]))
        final_results.append((dataset, solution, run_fitness[-1]))

    if not invocations:
        # Every run of this dataset was skipped: there is nothing to draw.
        continue

    df = pd.DataFrame(data={INVOCATIONS_LABEL: invocations, FITNESS_LABEL: fitness,
                            SOLUTION_LABEL: solutions})
    plot_progress(dataset, df)


In [None]:
def split_by_type(df):
    """Split `df` into its fixed-size rows and its pipeline rows.

    Returns ``(fixed_df, pipeline_df)``. Each part gets a fresh 0..n-1 index;
    the previous index is kept as a regular column by ``reset_index()``.
    """
    solution_kind = df[SOLUTION_LABEL]
    is_fixed = solution_kind == FIXED_SIZE_LABEL
    is_pipeline = solution_kind == PIPELINE_LABEL
    return df[is_fixed].reset_index(), df[is_pipeline].reset_index()


def find_improvement(df):
    """Return the relative gain of pipeline over fixed-size fitness, in percent.

    The pipeline and fixed-size rows are paired by position (both halves are
    re-indexed from 0 by ``split_by_type``), so the k-th pipeline row is
    compared with the k-th fixed-size row. The result is rounded to one
    decimal place.
    """
    fixed_df, pipeline_df = split_by_type(df)
    ratio = pipeline_df[FITNESS_LABEL].div(fixed_df[FITNESS_LABEL])
    percent_gain = (ratio - 1) * 100
    return percent_gain.round(1)


def summary(df):
    """Summarise fitness per dataset for both solution types.

    Computes the mean and standard deviation of fitness for every
    (dataset, solution) group and returns one row per fixed-size group with:
    the dataset name, the percentage improvement of the pipeline mean over
    the fixed-size mean, both means and both standard deviations.

    Fixed-size and pipeline rows are paired by position, not by dataset name.
    """
    by_type = group_by_type(df)
    mean_frame = by_type.mean().reset_index()
    std_frame = by_type.std().reset_index()

    fixed_mean, pipeline_mean = split_by_type(mean_frame)
    fixed_std, pipeline_std = split_by_type(std_frame)

    columns = {
        DATASET_LABEL: fixed_mean[DATASET_LABEL],
        IMPROVEMENT_LABEL: find_improvement(mean_frame),
        "Fixed Mean": fixed_mean[FITNESS_LABEL],
        "Pipeline Mean": pipeline_mean[FITNESS_LABEL],
        "Fixed Std": fixed_std[FITNESS_LABEL],
        "Pipeline Std": pipeline_std[FITNESS_LABEL],
    }
    return pd.DataFrame(data=columns)

In [None]:
# Fitness at the last plotted invocation of every run: box plot per dataset,
# then the per-dataset summary table.
df_final = collect_results(final_results)
plot_results(df_final, "Best")
df_sum = summary(df_final)
# Average of the per-dataset improvement percentages, rounded to one decimal.
print(f"Best results: avg improvement: {df_sum[IMPROVEMENT_LABEL].mean().round(1)}")
# Last expression of the cell: display the summary table.
df_sum

In [None]:
# Fitness at the first plotted invocation of every run: box plot per dataset,
# then the per-dataset summary table.
df_first = collect_results(first_results)
plot_results(df_first, "First iteration")
# NOTE: rebinds `df_sum`, which now describes the first-iteration results.
df_sum = summary(df_first)
# Average of the per-dataset improvement percentages, rounded to one decimal.
print(f"First iteration results: avg improvement: {df_sum[IMPROVEMENT_LABEL].mean().round(1)}")
# Last expression of the cell: display the summary table.
df_sum

In [None]:
def progress_summary(df):
    """Tabulate fitness growth per dataset for both solution types.

    `df` holds one growth value per (dataset, solution) in its index; the
    index is turned into columns before splitting by solution type. The
    returned frame lists, per fixed-size row: the dataset, the difference
    "pipeline growth minus fixed-size growth", and both growth values, all
    rounded to one decimal place.

    Fixed-size and pipeline rows are paired by position, not by dataset name.
    """
    fixed_df, pipeline_df = split_by_type(df.reset_index())
    fixed_growth = fixed_df[FITNESS_LABEL]
    pipeline_growth = pipeline_df[FITNESS_LABEL]
    growth_gap = pipeline_growth - fixed_growth

    columns = {
        DATASET_LABEL: fixed_df[DATASET_LABEL],
        IMPROVEMENT_LABEL: growth_gap.round(1),
        "Fixed Growth, %": fixed_growth.round(1),
        "Pipeline Growth, %": pipeline_growth.round(1),
    }
    return pd.DataFrame(data=columns)

# Mean fitness per (dataset, solution) at the last and at the first plotted invocation.
final_avg = group_by_type(df_final).mean()
first_avg = group_by_type(df_first).mean()
# Relative growth from first to final mean fitness, in percent.
progress = ((final_avg - first_avg) / first_avg) * 100
# NOTE: rebinds `df_sum`, which now holds the growth comparison table.
df_sum = progress_summary(progress)
# Average over datasets of (pipeline growth - fixed-size growth), one decimal.
print(f"Progress difference avg: {df_sum[IMPROVEMENT_LABEL].mean().round(1)}")
# Last expression of the cell: display the growth table.
df_sum