In [None]:
import os
import re
from collections import defaultdict
from typing import List

import numpy as np
import pandas as pd
from IPython.display import Image
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_graphviz, export_text

from autotm.params import FixedListParams, PipelineParams, iterations_of_type
from autotm.schemas import IndividualDTO


In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Notebook-wide display settings; they affect every figure and DataFrame shown below.
# Resolution (dots per inch) for all matplotlib figures created in this notebook.
plt.rcParams['figure.dpi'] = 900
sns.set(style="whitegrid")
# Show three decimal places when DataFrames are displayed.
pd.set_option('display.precision', 3)

In [None]:
# Maps dataset identifiers (the values found in the dataset column) to the
# human-readable names used as subplot titles.
DATASET_MAPPING = {
    "20newsgroups_sample": "20News Groups",
    "amazon_food_sample": "Amazon Food",
    "banners_sample": "Banners",
    "hotel-reviews_sample": "Hotel Reviews",
    "lenta_ru_sample": "Lenta.ru",
}

# Collection of the data

First, we collect the results from all AutoTM runs.

In [None]:
def collect_all_parameters(base_dir="statistics"):
    """Load every logged individual from the parameter logs found in ``base_dir``.

    A log file is any directory entry whose name ends with ``_parameters.txt``.
    Files whose name contains ``_surrogate_`` are skipped. Every non-blank line
    of the remaining files is parsed as one JSON-serialised ``IndividualDTO``.

    Args:
        base_dir: Directory that holds the ``*_parameters.txt`` files.

    Returns:
        A list of ``IndividualDTO`` objects, ordered by file name and then by
        line position inside each file.
    """
    # A suffix test is stricter than the former regex, which also accepted
    # names such as "x_parameters.txt.bak" (unescaped dot, no end anchor).
    # os.listdir returns entries in arbitrary order, hence the sort.
    log_names = sorted(name for name in os.listdir(base_dir) if name.endswith("_parameters.txt"))
    print(len(log_names))

    params = []
    for name in log_names:
        if "_surrogate_" in name:
            continue
        with open(os.path.join(base_dir, name)) as file:
            # A blank line is not a JSON document; skip it instead of failing validation.
            params += [line for line in file.read().splitlines() if line.strip()]
    return [IndividualDTO.model_validate_json(param) for param in params]


# Load all logged individuals once; later cells reuse `all_params`.
all_params = collect_all_parameters()
# Sanity check: show the parameters and dataset name of the first individual.
all_params[0].params, all_params[0].dataset

# Feature extraction

In [None]:
# Columns that identify a row of the feature table.
DATASET_LABEL = "dataset"
SOLUTION_LABEL = "solution"

# Values stored in the SOLUTION_LABEL column.
PIPELINE_LABEL = "pipeline"
FIXED_SIZE_LABEL = "fixed-size"

# Target column.
FITNESS_LABEL = "Fitness"

# Aggregated structural features. Multi-word labels contain a line break,
# which the tree-export cells replace with "_".
STAGES_NUMBER_LABEL = "stages"
ITERATIONS_NUMBER_LABEL = "Iterations"
ITERATIONS_DECORRELATION_LABEL = "Iterations\ndecorrelation"
ITERATIONS_SPARSE_LABEL = "Iterations\nsparse"
ITERATIONS_SMOOTH_LABEL = "Iterations\nsmooth"

# Names for the per-regularizer "a" / "b" parameter features.
DECORRELATION_A_LABEL = "Decorrelation\na"
DECORRELATION_B_LABEL = "Decorrelation\nb"
SPARSE_A_LABEL = "Sparse\na"
SPARSE_B_LABEL = "Sparse\nb"
SMOOTH_A_LABEL = "Smooth\na"
SMOOTH_B_LABEL = "Smooth\nb"

In [None]:
def iterations_number(stages):
    """Return the total of ``values[0]`` over all ``stages`` (0 when there are none)."""
    total = 0
    for stage in stages:
        total += stage.values[0]
    return total

def avg_param(stages, index):
    """Return the mean of ``values[index]`` across ``stages``, or 0 when there are no stages."""
    collected = []
    for stage in stages:
        collected.append(stage.values[index])
    return np.mean(collected) if collected else 0

def extract_features(dtos: List[IndividualDTO]):
    """Turn logged individuals into a table of aggregated pipeline features.

    Each individual contributes one row with its dataset, its
    ``avg_coherence_score`` fitness, the solution kind (fixed-size or
    pipeline), the number of pipeline stages, the total iteration count and
    the iteration count per regularizer type. Exact duplicate rows are dropped.

    Args:
        dtos: Individuals whose ``params`` are ``FixedListParams`` or ``PipelineParams``.

    Returns:
        A ``pandas.DataFrame`` with one column per label listed above.

    Raises:
        ValueError: If ``params`` of an individual is of neither supported type.
    """
    # (output column, regularizer name given to iterations_of_type), in column order.
    regularizer_columns = (
        (ITERATIONS_DECORRELATION_LABEL, "DecorrelatorPhiRegularizer"),
        (ITERATIONS_SPARSE_LABEL, "SparseThetaRegularizer"),
        (ITERATIONS_SMOOTH_LABEL, "SmoothThetaRegularizer"),
    )

    columns = defaultdict(list)
    for individual in dtos:
        columns[DATASET_LABEL].append(individual.dataset)
        columns[FITNESS_LABEL].append(individual.fitness_value["avg_coherence_score"])

        raw_params = individual.params
        if isinstance(raw_params, FixedListParams):
            columns[SOLUTION_LABEL].append(FIXED_SIZE_LABEL)
            # Fixed-size solutions are converted so both kinds share the stage-based code below.
            pipeline_params = raw_params.to_pipeline_params()
        elif isinstance(raw_params, PipelineParams):
            columns[SOLUTION_LABEL].append(PIPELINE_LABEL)
            pipeline_params = raw_params
        else:
            raise ValueError(f"Unexpected type {raw_params}")

        stages = pipeline_params.pipeline.stages
        columns[STAGES_NUMBER_LABEL].append(len(stages))
        columns[ITERATIONS_NUMBER_LABEL].append(iterations_number(stages))

        for column, regularizer in regularizer_columns:
            matching_stages = iterations_of_type(stages, regularizer)
            columns[column].append(iterations_number(matching_stages))
        # The per-regularizer averages (avg_param at value indices 1 and 2, for the
        # *_A_LABEL / *_B_LABEL columns) are currently disabled and not extracted.

    return pd.DataFrame(columns).drop_duplicates()

In [None]:
# Aggregated per-individual feature table; the exploration and surrogate cells below read it as `df`.
df = extract_features(all_params)
df

In [None]:
# Sorted dataset identifiers present in the feature table.
datasets = sorted(df[DATASET_LABEL].unique())
# Sorted solution kinds; passed as hue_order to the scatter plots.
order = sorted(df[SOLUTION_LABEL].unique())
# Every column that is neither an identifier nor the target counts as a feature.
features = [
    column
    for column in df.columns
    if column not in (DATASET_LABEL, SOLUTION_LABEL, FITNESS_LABEL)
]

In [None]:
# Show, as an example, the parameters of the first individual stored as PipelineParams.
# next() raises StopIteration when no such individual exists.
next(param.params for param in all_params if isinstance(param.params, PipelineParams))

# Data exploration

Here we analyse the distributions of all features across all datasets.

In [None]:
def select(df, dataset, solution=None):
    """Return the rows of ``df`` for one dataset, optionally narrowed to one solution kind.

    The dataset column is always removed from the result. The solution column
    is removed only when ``solution`` is given. ``df`` itself is not modified.
    """
    subset = df.loc[df[DATASET_LABEL] == dataset].drop(columns=[DATASET_LABEL])
    if solution is None:
        return subset
    return subset.loc[subset[SOLUTION_LABEL] == solution].drop(columns=[SOLUTION_LABEL])

In [None]:
# Grid of scatter plots: one row per feature, one column per dataset,
# fitness on the x axis, coloured by solution kind. Saved to feature_analysis.png.
font_size = 18
# Fixed seed for the row shuffle so the saved figure is identical between runs.
shuffle_seed = 42

# squeeze=False keeps `axs` two-dimensional even with a single feature or dataset.
fig, axs = plt.subplots(
    len(features),
    len(datasets),
    figsize=(3 * len(datasets), 3 * len(features)),
    sharex='col',
    sharey='row',
    squeeze=False,
)
for i, dataset in enumerate(datasets):
    # Rows are shuffled before plotting (presumably so that neither solution kind
    # is always drawn on top of the other).
    data = select(df, dataset).sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    for j, f in enumerate(features):
        sns.scatterplot(data, x=FITNESS_LABEL, y=f, ax=axs[j, i], hue=SOLUTION_LABEL, hue_order=order)
        axs[j, i].set_xlabel(FITNESS_LABEL, fontsize=font_size)
        axs[j, i].set_ylabel(f, fontsize=font_size)
        # Keep a single legend, on the top-left subplot. get_legend() may return None
        # when no legend was created.
        legend = axs[j, i].get_legend()
        if legend is not None and (i != 0 or j != 0):
            legend.remove()
    # Fall back to the raw identifier for datasets that have no display name.
    axs[0, i].set_title(DATASET_MAPPING.get(dataset, dataset), fontsize=font_size)

# Keep axis labels and tick labels only on the outer (bottom-row / left-column) subplots.
for ax in axs.flat:
    ax.label_outer()

plt.subplots_adjust(wspace=0.05, hspace=0.05)
plt.savefig("feature_analysis.png")

In [None]:
def build_descision_tree(data, max_depth=3, min_samples_leaf=100, random_state=0):
    """Fit a shallow regression tree that predicts fitness from all other columns.

    Args:
        data: Frame containing the ``FITNESS_LABEL`` column plus numeric feature columns.
        max_depth: Maximum depth of the tree.
        min_samples_leaf: Minimum number of samples required in a leaf.
        random_state: Seed for the tree builder. It is fixed by default so that
            ties between equally good splits are resolved the same way on every run.

    Returns:
        A ``(model, feature_names)`` tuple: the fitted ``DecisionTreeRegressor``
        and the list of feature column names in the order used for fitting.
    """
    y = data[FITNESS_LABEL]
    X = data.drop(columns=[FITNESS_LABEL])
    regressor = DecisionTreeRegressor(
        max_depth=max_depth,
        criterion="absolute_error",
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
    )
    model = regressor.fit(X.to_numpy(), y.to_numpy())
    return model, X.columns.tolist()

In [None]:
# Print a text rendering of the fitness decision tree for every solution kind and dataset.
for solution in (PIPELINE_LABEL, FIXED_SIZE_LABEL):
    for dataset in datasets:
        print(f"Dataset: {dataset} solution: {solution}")
        data = select(df, dataset, solution)
        model, feature_names = build_descision_tree(data)
        # Feature labels contain line breaks; flatten them for the text export.
        printable_names = [name.replace("\n", "_") for name in feature_names]
        print(export_text(model, feature_names=printable_names))

In [None]:
# Render the fitness decision tree of one dataset/solution pair as an image.
dataset = "hotel-reviews_sample"
solution = PIPELINE_LABEL

data = select(df, dataset, solution)
clf, feature_names = build_descision_tree(data)

# Feature labels contain line breaks; they are replaced with "_" for the exported node labels.
export_graphviz(clf, out_file='tree.dot', feature_names=[f.replace("\n", "_") for f in feature_names],
                rounded=True, proportion=False, precision=2, filled=True)
# Shells out to the `dot` command-line tool (Graphviz), which must be on PATH,
# to convert tree.dot into tree.png at 600 dpi.
!dot -Tpng tree.dot -o tree.png -Gdpi=600
Image(filename='tree.png')

# Best solutions analysis

For each dataset we take the top 5% of solutions (fitness at or above the 95th percentile) and analyse their structure.

In [None]:
# Histograms of fitness and features for the best pipeline solutions of each dataset.
solution = PIPELINE_LABEL
# Solutions with fitness at or above this percentile are kept (95 -> top 5%).
top_percentile = 95

fs = [FITNESS_LABEL] + features
fs.remove(ITERATIONS_SPARSE_LABEL) # all equal 0

# squeeze=False keeps `axs` two-dimensional even with a single row or column.
fig, axs = plt.subplots(
    len(fs),
    len(datasets),
    figsize=(3 * len(datasets), 3 * len(fs)),
    sharex='col',
    sharey='row',
    squeeze=False,
)
for i, dataset in enumerate(datasets):
    data = select(df, dataset, solution)
    target_fitness = np.percentile(data[FITNESS_LABEL], top_percentile)
    data = data[data[FITNESS_LABEL] >= target_fitness]
    # Guards the removal of the sparse-iterations row above: it is only valid
    # while that feature is zero for every selected solution.
    assert (data[ITERATIONS_SPARSE_LABEL] == 0).all(), (
        f"{dataset}: best solutions have non-zero sparse iterations; plot that feature too"
    )
    for j, f in enumerate(fs):
        sns.histplot(data, y=f, ax=axs[j, i])
    # Use the same display names as the feature scatter grid; fall back to the raw id.
    axs[0, i].set_title(DATASET_MAPPING.get(dataset, dataset))

# Keep axis labels and tick labels only on the outer (bottom-row / left-column) subplots.
for ax in axs.flat:
    ax.label_outer()

plt.subplots_adjust(wspace=0.05, hspace=0.05)

# Surrogate learning

In [None]:
# Fit a random-forest surrogate on the aggregated features for every
# dataset / solution kind and report train and test R^2.
# Fixed seed: without it the split and the forest differ on each run, so the
# printed scores are not repeatable.
RANDOM_STATE = 42

for dataset in datasets:
    for solution in [PIPELINE_LABEL, FIXED_SIZE_LABEL]:
        print(f"Dataset: {dataset} solution: {solution}")
        data = select(df, dataset, solution)
        y = data[FITNESS_LABEL].to_numpy()
        X = data.drop(FITNESS_LABEL, axis=1).to_numpy()

        # 60% of the rows are used for training, 40% are held out.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=RANDOM_STATE
        )
        clf = RandomForestRegressor(max_depth=10, min_samples_leaf=10, random_state=RANDOM_STATE)
        model = clf.fit(X_train, y_train)

        print(f"Train R^2: {model.score(X_train, y_train)}")
        print(f"Test  R^2: {model.score(X_test, y_test)}")
        print()

In [None]:
def extract_all_features(dtos: List[IndividualDTO]):
    """Build a table holding the full parameter vector of every pipeline individual.

    Individuals with ``FixedListParams`` are skipped. For the others, element
    ``i`` of ``params.to_vector()`` becomes column ``f_<i>``, next to the
    dataset and ``avg_coherence_score`` fitness columns. Exact duplicate rows
    are dropped.

    Raises:
        ValueError: If ``params`` of an individual is of neither known type.
    """
    columns = defaultdict(list)
    for individual in dtos:
        pipeline_params = individual.params
        if isinstance(pipeline_params, FixedListParams):
            continue
        if not isinstance(pipeline_params, PipelineParams):
            raise ValueError(f"Unexpected type {pipeline_params}")

        columns[DATASET_LABEL].append(individual.dataset)
        columns[FITNESS_LABEL].append(individual.fitness_value["avg_coherence_score"])
        for position, value in enumerate(pipeline_params.to_vector()):
            columns[f"f_{position}"].append(value)
    return pd.DataFrame(columns).drop_duplicates()

In [None]:
# NOTE: this rebinds `df`. The new frame has no solution column, so the earlier
# exploration cells (which read df[SOLUTION_LABEL]) cannot be re-run after this
# cell without first re-creating the aggregated frame with extract_features.
df = extract_all_features(all_params)
df

In [None]:
# Fit a random-forest surrogate on the full parameter vectors of every dataset
# and report train and test R^2.
# Fixed seed: without it the split and the forest differ on each run, so the
# printed scores are not repeatable.
RANDOM_STATE = 42

for dataset in datasets:
    print(f"Dataset: {dataset}")
    data = select(df, dataset)
    y = data[FITNESS_LABEL].to_numpy()
    X = data.drop(FITNESS_LABEL, axis=1).to_numpy()

    # 60% of the rows are used for training, 40% are held out.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=RANDOM_STATE
    )
    clf = RandomForestRegressor(max_depth=10, min_samples_leaf=10, random_state=RANDOM_STATE)
    model = clf.fit(X_train, y_train)

    print(f"Train R^2: {model.score(X_train, y_train)}")
    print(f"Test  R^2: {model.score(X_test, y_test)}")
    print()

In [None]:
# Fit a 12-component PCA on the feature columns of `df` for all datasets together
# (fitness and dataset columns excluded). `df` is expected to be the frame
# produced by extract_all_features at this point.
pca = PCA(n_components=12)
pca.fit(df.drop([FITNESS_LABEL, DATASET_LABEL], axis=1).to_numpy())
# Total fraction of variance explained by the fitted components.
sum(pca.explained_variance_ratio_)

In [None]:
# Preview the PCA projection for one dataset. The dataset is named explicitly
# (the last entry of `datasets`, which is what a top-to-bottom run used before)
# instead of relying on the `dataset` loop variable leaked by an earlier cell.
pca.transform(select(df, datasets[-1]).drop(FITNESS_LABEL, axis=1).to_numpy())

In [None]:
# Fit a random-forest surrogate on the PCA-projected parameter vectors of every
# dataset and report train and test R^2.
# Fixed seed: without it the split and the forest differ on each run, so the
# printed scores are not repeatable.
RANDOM_STATE = 42

for dataset in datasets:
    print(f"Dataset: {dataset}")
    data = select(df, dataset)
    y = data[FITNESS_LABEL].to_numpy()
    X = data.drop(FITNESS_LABEL, axis=1).to_numpy()
    # Project onto the components of the PCA fitted in the earlier cell.
    X = pca.transform(X)

    # 60% of the rows are used for training, 40% are held out.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=RANDOM_STATE
    )
    clf = RandomForestRegressor(max_depth=10, min_samples_leaf=10, random_state=RANDOM_STATE)
    model = clf.fit(X_train, y_train)

    print(f"Train R^2: {model.score(X_train, y_train)}")
    print(f"Test  R^2: {model.score(X_test, y_test)}")
    print()