# Goal
Investigate if there is anything interesting in the interaction of pipeline steps and/or individual pipeline components and the rmse.

Obviously very adhoc but such is the nature of exploratory research

In [420]:
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from collections import defaultdict
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import friedmanchisquare
import numpy as np
from itertools import chain

import json

from constants import SEED
from datasets import load_dataset

def get_all_results_paths(root_dir: Path):
    return list(root_dir.glob("**/*results.json"))

def single_layer_unnest(node: dict, keys: list[str]):
    """
    Unnest a single layer of a nested dictionary
    """
    to_merge_later = {}
    to_delete_later = []
    for top_key, top_item in node.items():
        if isinstance(top_item, dict):
            for child_key, child_item in top_item.items():
                if child_key in keys:
                    to_merge_later[f"{top_key}_{child_key}"] = child_item
                    to_delete_later.append((top_key, child_key))
    for top_key, child_key in to_delete_later:
        del node[top_key][child_key]
    for new_key, new_item in to_merge_later.items():
        node[new_key] = new_item
    return node

def get_all_results(root_dir: Path, response_is_normalised: bool = False, log_response: bool = False):
    results = []
    for path in get_all_results_paths(root_dir):
        result = json.load(open(path))
        ## Handle nested metrics
        result = single_layer_unnest(result, ["train", "val", "test"])
        to_delete = []
        for key, val in result.items():
            if not val:
                to_delete.append(key)
        for key in to_delete:
            del result[key]
        ## Map model names
        result["model"] = result["model"].replace("relu_mlp", "mlp")
        result["model"] = result["model"].replace("no_activation_mlp", "deep_linear")
        ## Handle finetune
        if "finetune" not in result:
            result["finetune"] = False
        ## Handle empty preprocessers
        if "preprocessers" not in result:
            result["preprocessers"] = ["empty"]
        ## Handle missing seed
        if "seed" not in result:
            seed_part = [x for x in path.parts if "seed" in x][0]
            result["seed"] = int(seed_part.split("_")[-1])
        ## Order preprocessers
        result["preprocessers"] = sorted(result["preprocessers"])

        nice_table_map = {
            "empty": "empty",
            "lemmatize": "Le",
            "lowercase": "Lo",
            "no_numbers_simple": "NN",
            "no_numbers_spacy": "NNSC",
            "no_punctuation": "NP",
            "no_stopwords_nltk": "NSN",
            "no_stopwords_spacy": "NSCC",
            "stem": "S",
        }
        cleaned_processers = [nice_table_map.get(x, x) for x in result["preprocessers"]]
        result["preprocesser(s)"] = ",".join(sorted(cleaned_processers))

        if response_is_normalised:
            normaliser_path = path.parent / "normalisation_params.json"
            with open(normaliser_path, "r") as f:
                normaliser_constants = json.load(f)
            std = normaliser_constants["std"]
            for col in result.keys():
                if "rmse" in col:
                    result[col] = result[col] * std
        if log_response:
            for col in result.keys():
                if "rmse" in col:
                    result[col] = np.log(result[col])
        results.append(result)
    return results


def permutation_test(x, y, n_permutations=100000, one_sided: bool = False):
    """
    Perform permutation test for difference in means
    H0: mean(x) = mean(y)
    H1: mean(x) != mean(y) 

    If one_sided is True, then we test if mean(x) > mean(y)

    Return p-value and observed difference
    """
    observed_diff = np.mean(x) - np.mean(y)
    
    # Combine samples
    combined = np.concatenate([x, y])
    n_x = len(x)
    
    # Generate permutations
    diffs = np.zeros(n_permutations)
    for i in range(n_permutations):
        np.random.shuffle(combined)
        perm_x = combined[:n_x]
        perm_y = combined[n_x:]
        diffs[i] = np.mean(perm_x) - np.mean(perm_y)
    
    # Calculate one-sided p-value
    if one_sided:
        delta = diffs >= observed_diff
    else:
        delta = np.abs(diffs) >= np.abs(observed_diff)
    delta = np.concatenate([delta, [True]]) # Effectively do (C+1/N+1) as in https://arxiv.org/pdf/1603.05766
    p_value = np.mean(delta)
    
    return p_value, observed_diff

def process_latex_table(latex_table: str):
    latex_table = latex_table.replace("_", "-")
    latex_table = latex_table.replace("%", "\%")
    latex_table = latex_table.replace("rmse-test", "RMSE")
    latex_table = latex_table.replace("featuriser", "featurizer")
    latex_table = latex_table.replace("tokeniser", "tokenizer")
    latex_table = latex_table.replace("rmse-test", "rmse")
    latex_table = latex_table.replace("<", "\\textless")
    latex_table = latex_table.replace(">", "\\textgreater")
    latex_table = latex_table.replace("obl", "\\named{obl}")
    latex_table = latex_table.replace("jcp", "\\named{jcp}")
    return latex_table

def simple_format(x: float):
    if abs(x) < 0.001:
        # Assumes no negative values between -0.001 and 0.001
        return "<0.001"
    elif abs(x) < 1:
        return f"{x:.3f}"
    else:
        return f"{x:,.0f}"

## Configure

In [421]:
import datetime
import logging

datasets = [
    "jc_penney_products", 
    "online_boat_listings",
    "online_boat_listings_no_foreign_languages",
    "california_house_prices" 
]
## Change analysis dataset
dataset = datasets[1]

## Change whether we look at downsampled results (only for jc_penney_products)
downsamples = [0.2, 0.01, None]
downsample = downsamples[2]

## Change whether we look at normalised response
response_is_normalised = False

filter_out_replicates = False
filter_only_replicates = 4 # Can be int (4) or None, if int then only include results with this number of replicates, cannot be used with filter_out_replicates
filter_out_worst = False

log_response = False # Currently too difficult to interpret

which_ds_string = f"{dataset} at {datetime.datetime.now().isoformat()}"
print(which_ds_string)

online_boat_listings at 2025-03-14T09:19:17.276335


In [422]:

### Configure analysis
import sys


raw_metric = "rmse_test"
metric = raw_metric.removesuffix("_test")

pipeline_cols = ["preprocessers", "tokeniser", "featuriser", "model", "finetune"]
# Dont include finetune as we will alter the model name
hash_cols = ["preprocesser(s)", "tokeniser", "featuriser", "model"]
filter_worst_k_for = {
    "jc_penney_products": 0,
    "online_boat_listings": 0,
    "california_house_prices": 0
}
alpha = 0.05
### Configure plot saving
image_ext = "png"
OUTPUT_DIR = Path("notebook_output")
SAVE_HEIGHT = 400
SAVE_WIDTH = 500
table_dir = OUTPUT_DIR / dataset / "tables"
histogram_dir = OUTPUT_DIR / dataset / "1d_histograms"
boxplot_dir = OUTPUT_DIR / dataset / "1d_boxplots"
### Configure latex columns
latex_cols = ["preprocesser(s)", "tokeniser", "featuriser", "model", metric]
dataset_name_map = {
    "jc_penney_products": "jcp",
    "online_boat_listings": "obl",
}
###
histogram_dir.mkdir(exist_ok=True, parents=True)
table_dir.mkdir(exist_ok=True, parents=True)
boxplot_dir.mkdir(exist_ok=True, parents=True)

config_str = metric
if filter_out_replicates:
    config_str += "_no_replicates"
if filter_only_replicates is not None:
    config_str += f"_only_{filter_only_replicates}_replicates"
if filter_out_worst:
    config_str += f"_filter_worst_{filter_worst_k_for[dataset]}"
if downsample is not None:
    config_str += f"_downsample_{downsample}"
if response_is_normalised:
    config_str += "_normalise_response"

logger = logging.getLogger("analysis")
logger.setLevel(logging.DEBUG)
for handler in logger.handlers:
    logger.removeHandler(handler)
# Configure logging to file
log_file = OUTPUT_DIR / f"{dataset}/{config_str}.log"
log_file.parent.mkdir(parents=True, exist_ok=True)
# Remove all existing handlers
for handler in logger.handlers[:]:
    logger.removeHandler(handler)

file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
logger.info(which_ds_string)

2025-03-14 09:19:17,298 - analysis - INFO - online_boat_listings at 2025-03-14T09:19:17.276335


In [423]:
print(f"Config: {config_str}")

Config: rmse_only_4_replicates


In [424]:

if filter_out_replicates and (filter_only_replicates is not None):
    raise ValueError("Cannot filter out replicates and only include replicates")

root_results = Path(f"results/")
if downsample is not None:
    root_results = root_results / f"downsample_{downsample}"
if response_is_normalised:
    root_results = root_results / f"normalised_response"

results = get_all_results(root_results / dataset, response_is_normalised=response_is_normalised, log_response=log_response)
logger.info(f"Found {len(results)} results for {dataset}")
train, val, test = load_dataset(dataset, seed=SEED)
mean = pd.concat([train, val])["label"].mean()
if "rmse" in metric:
    perf_of_mean = ((test["label"] - mean) ** 2).mean() ** 0.5
    if log_response:
        perf_of_mean = np.log(perf_of_mean)
    logger.info(f"Performance of mean with seed {SEED}: {perf_of_mean:.2f}")
else:
    raise NotImplementedError(f"Metric {metric} not implemented")

original_results = pd.DataFrame(results)
# Quickly strip suffix from metric name for latex tables/figures
original_results.rename(columns={raw_metric: metric}, inplace=True)
original_results.head()

2025-03-14 09:19:18,632 - analysis - INFO - Found 2611 results for online_boat_listings
2025-03-14 09:19:18,683 - analysis - INFO - Performance of mean with seed 97: 446169.26


Unnamed: 0,preprocessers,tokeniser,featuriser,model,total_time,preprocess_time,model_time,rmse_train,rmse_val,rmse,...,mae_test,r2_train,r2_val,r2_test,medae_train,medae_val,medae_test,finetune,seed,preprocesser(s)
0,"[no_numbers_spacy, no_punctuation, no_stopword...",whitespace,bow_count,deep_linear,247.05084,189.886259,57.16458,588951.913069,380334.980025,435165.781187,...,217546.242308,0.299485,0.029071,0.04868,17833.585938,107035.929688,106243.359375,False,1,"NNSC,NP,NSCC"
1,"[no_numbers_spacy, no_punctuation, no_stopword...",whitespace,bow_count,deep_linear,473.019562,291.487679,181.531883,616215.278808,435807.265898,453233.188583,...,218790.125425,0.208084,0.034577,-0.031955,20866.675781,103357.390625,98122.90625,False,3,"NNSC,NP,NSCC"
2,"[no_numbers_spacy, no_punctuation, no_stopword...",whitespace,bow_count,deep_linear,144.767496,93.166733,51.600763,437477.178106,616502.402047,431240.592578,...,227628.456063,0.534557,0.049541,0.065764,11708.318359,118735.382812,116868.945312,False,97,"NNSC,NP,NSCC"
3,"[no_numbers_spacy, no_punctuation, no_stopword...",whitespace,bow_count,deep_linear,130.952616,95.192159,35.760457,605763.905186,383874.376108,436196.534765,...,212738.081348,0.257906,0.027952,0.044168,20705.427734,101362.714844,99162.875,False,2,"NNSC,NP,NSCC"
4,"[no_numbers_spacy, no_punctuation, no_stopword...",whitespace,glove_mean,xgboost,181.587633,181.266419,0.321214,603065.026969,392533.109383,450019.469014,...,251278.50397,0.26551,-0.034208,-0.017372,171848.546875,168575.039062,177649.3125,False,1,"NNSC,NP,NSCC"


In [425]:
joined_data = pd.concat([train, val])
all_data = pd.concat([train, val, test])
mean_response = all_data["label"].mean()
standard_deviation_of_response = all_data["label"].std()
logger.info(f"Mean response (n data={len(all_data)}): {mean_response:.2f} +- {standard_deviation_of_response:.2f}")

2025-03-14 09:19:18,743 - analysis - INFO - Mean response (n data=1850): 247605.08 +- 605892.85


## Filter

In [426]:
filter_out = {
    "preprocessers": ["number"],
    "model": ["linear_regression"], # We filter out these results as they are just too bad, deep_linear models sorta do that anyway
    "tokeniser": [],
    "featuriser": [],
}
if filter_out_replicates:
    filter_out["seed"] = [1, 2, 3]

filtered_results = original_results.copy()
for feature, values in filter_out.items():
    if feature == "preprocessers":
        for value in values:
            filtered_results = filtered_results[~filtered_results["preprocessers"].apply(lambda x: any(value in y for y in x))]
    else:
        for value in values:
            filtered_results = filtered_results[filtered_results[feature] != value]
filtered_results.loc[filtered_results['finetune'], 'model'] = filtered_results.loc[filtered_results['finetune'], 'model'] + '_ft'



Analyse which seeds are missing in certain replicates if we want to increase sample size

In [427]:
replicates = filtered_results.groupby(hash_cols)["seed"].transform("count")
def seed_missing(seeds):
    return list(set([1,2,3,97]) - set(seeds))
missing_seeds = filtered_results[(1 < replicates) & (replicates < 4)].groupby(["preprocesser(s)", "tokeniser", "featuriser", "model"]).agg({"seed": lambda x: seed_missing(x)})
missing_seeds["seed"].value_counts()


seed
[1]    5
Name: count, dtype: int64

In [428]:
if filter_only_replicates is not None:
    filtered_results = filtered_results[replicates == filter_only_replicates]

logger.info(f"Filtered to {len(filtered_results)} results")
filtered_results = filtered_results.sort_values(metric).reset_index(names="idx")
filtered_results["pipeline_hash"] = filtered_results[hash_cols].apply(lambda x: hash(tuple(x)), axis=1)

2025-03-14 09:19:18,831 - analysis - INFO - Filtered to 560 results


In [429]:
dupe_cols = ["pipeline_hash"]
if filter_only_replicates:
    dupe_cols.append("seed")
dupe_idx = filtered_results.duplicated(subset=dupe_cols, keep=False)
dupes = filtered_results[dupe_idx]
logger.info(f"Found {len(dupes)} duplicates")
display(dupes.sort_values("pipeline_hash").head(5))

2025-03-14 09:19:18,869 - analysis - INFO - Found 0 duplicates


Unnamed: 0,idx,preprocessers,tokeniser,featuriser,model,total_time,preprocess_time,model_time,rmse_train,rmse_val,...,r2_train,r2_val,r2_test,medae_train,medae_val,medae_test,finetune,seed,preprocesser(s),pipeline_hash


In [430]:
fig = px.box(filtered_results, y=metric, points="outliers")
def quantiles(df: pd.DataFrame):
    min_metric, max_metric = df[metric].min(), df[metric].max()
    q1_metric, q3_metric = df[metric].quantile(0.25), df[metric].quantile(0.75)
    median_metric = df[metric].median()
    return min_metric, q1_metric, median_metric, q3_metric, max_metric
min_metric, q1_metric, median_metric, q3_metric, max_metric = quantiles(filtered_results)
fig.update_yaxes(
    range=[
        filtered_results[metric].min()*0.9,
        (filtered_results[metric].median() + 1.5 * (filtered_results[metric].quantile(0.75) - filtered_results[metric].quantile(0.25)))*1.1
    ]
)
fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="red", annotation_text=f"Mean baseline {perf_of_mean:.2f}")

fig.write_image(OUTPUT_DIR / f"{dataset}_{config_str}_boxplot.{image_ext}")

In [431]:
def threshold_histogram(df: pd.DataFrame, metric: str, min_val: float = None, max_val: float = None, mean_line: float = None):
    if min_val is not None:
        df = df[df[metric] >= min_val]
    if max_val is not None:
        df = df[df[metric] <= max_val]
    fig = px.histogram(df, x=metric, nbins=100)
    if mean_line is not None:
        fig.add_vline(x=mean_line, line_dash="dash", line_color="red", annotation_text=f"Mean baseline {mean_line:.2f}")
    return fig

fig = threshold_histogram(filtered_results, metric, min_metric, max_metric, perf_of_mean)
fig.show()

In [432]:
if filter_only_replicates:
    mean_rmse = filtered_results.groupby("pipeline_hash").agg({metric: "mean"})
    min_mean_rmse = mean_rmse[metric].min()
    max_mean_rmse = mean_rmse[metric].max()
    fig = threshold_histogram(mean_rmse, metric, min_mean_rmse, max_mean_rmse, perf_of_mean)
    fig.show()

# Timing of different groups of model

In [433]:
# Create histogram comparing timing of finetuned vs non-finetuned models
ft_pipelines = filtered_results[filtered_results["model"].str.contains("_ft", na=False)].copy()
bow_pipelines = filtered_results[filtered_results["featuriser"].isin(["bow_binary", "bow_count", "tf_idf"])].copy()
ft_times = ft_pipelines["total_time"].values
bow_times = bow_pipelines["total_time"].values

fig = go.Figure()
fig.add_trace(go.Histogram(x=ft_times, name="Finetuned Models", histnorm="percent", nbinsx=20))
fig.add_trace(go.Histogram(x=bow_times, name="BOW Models", histnorm="percent", nbinsx=20))

fig.update_layout(
    title="Training Time Distribution: Finetuned vs BOW Models",
    xaxis_title="Total Training Time (s)",
    yaxis_title="Count",
    barmode='overlay'
)

fig.update_traces(opacity=0.75)
fig.write_image(OUTPUT_DIR / f"{dataset}_{config_str}_timing_histogram.{image_ext}")
fig.show()


In [434]:
p_value, diff = permutation_test(bow_times, ft_times)

results = pd.DataFrame({
    "dataset": [dataset_name_map[dataset]],
    "mean-difference (BOW - FT)": [diff],
    "p_value": [p_value],
})

display(results)

latex_table = results.to_latex(float_format=simple_format, index=False)
latex_table = process_latex_table(latex_table)
with open(table_dir / f"{dataset}_{config_str}_time_permutation_test.tex", "w") as f:
    f.write(latex_table)


Unnamed: 0,dataset,mean-difference (BOW - FT),p_value
0,obl,-1298.536568,1e-05


# Reduction from best to worst pipeline conditional on model

In [435]:
filtered_results.head(2)

Unnamed: 0,idx,preprocessers,tokeniser,featuriser,model,total_time,preprocess_time,model_time,rmse_train,rmse_val,...,r2_train,r2_val,r2_test,medae_train,medae_val,medae_test,finetune,seed,preprocesser(s),pipeline_hash
0,721,[no_stopwords_nltk],bert,bert_mean,mlp_ft,882.849295,27.127307,855.721988,312632.655761,347906.523124,...,0.802609,0.187581,0.437324,56293.304688,78129.964752,77999.021606,True,1,NSN,-365488063582128812
1,569,[empty],bert,bert_cls,deep_linear_ft,1143.500595,17.745727,1125.754869,294493.771088,310971.115851,...,0.82461,0.362104,0.42694,23241.0,71045.010742,69712.90625,True,2,empty,-5606085006502300781


## Raw total reduction

In [449]:
worst_to_best_df = filtered_results.copy()

In [456]:
table_dir.mkdir(exist_ok=True, parents=True)
def calc_reduction_in_error(df: pd.DataFrame, metric: str):
    df = df.copy()
    very_best = df[metric].sort_values().head(1).values[0]
    very_worst = df[metric].sort_values().tail(1).values[0]
    reduction_in_error = dict()
    unconditional_reduction_in_error = (very_worst - very_best) / very_worst * 100
    unique_models = df["model"].unique()
    for model in unique_models:
        model_results = df[df["model"] == model]
        very_best = model_results[metric].sort_values().head(1).values[0]
        very_worst = model_results[metric].sort_values().tail(1).values[0]
        reduction_in_error[model] = {
            "reduction": (very_worst - very_best) / very_worst * 100,
            "best": very_best,
            "worst": very_worst,
            "sem": model_results[metric].sem()
        }

    reduction_in_error = pd.DataFrame(reduction_in_error).sort_index(axis=1)
    return reduction_in_error, unconditional_reduction_in_error

reduction_in_error, unconditional_reduction_in_error = calc_reduction_in_error(worst_to_best_df, metric)
display(reduction_in_error)

latex_table = reduction_in_error.to_latex(float_format=simple_format)
latex_table = process_latex_table(latex_table)

with open(table_dir / f"{dataset}_{config_str}_reduction_in_error.tex", "w") as f:
    f.write(latex_table)
print(f"Unconditional reduction in error: {unconditional_reduction_in_error:.2f}%")

Unnamed: 0,catboost,deep_linear,deep_linear_ft,mlp,mlp_ft,resnet,resnet_ft,xgboost
reduction,16.461563,18.696972,12.941351,25.069113,20.147586,15.964982,16.908434,56.351265
best,394155.94372,366433.594525,337746.866655,361421.341939,334672.924494,390477.780873,357138.704818,376578.98475
worst,471825.85424,450701.039501,387953.257554,482339.599134,419114.347742,464660.793763,429813.423372,862748.91172
sem,609.312972,2166.518622,3191.938797,2873.730175,6836.656431,1720.606747,4085.463558,6982.61543


Unconditional reduction in error: 61.21%


## Mean pipeline reduction

Can be a bit more precise if we take averages of replicates so we do not conflate variability; but a bit harder to interpret as averaging naturally can reduce variance

In [459]:
worst_to_best_df_average_replicates = filtered_results.groupby(hash_cols).agg({metric: "mean"}).reset_index()
reduction_in_error, unconditional_reduction_in_error = calc_reduction_in_error(worst_to_best_df_average_replicates, metric)
display(reduction_in_error)

latex_table = reduction_in_error.to_latex(float_format=simple_format)
latex_table = process_latex_table(latex_table)

with open(table_dir / f"{dataset}_{config_str}_reduction_in_error_mean.tex", "w") as f:
    f.write(latex_table)
print(f"Unconditional reduction in error: {unconditional_reduction_in_error:.2f}%")


Unnamed: 0,catboost,deep_linear,deep_linear_ft,mlp,mlp_ft,resnet,resnet_ft,xgboost
reduction,4.578743,17.248734,4.27687,21.557073,4.217479,9.01447,7.332287,27.674745
best,432137.574607,371669.056646,350867.200857,367445.97519,367336.171585,407425.87977,374181.006172,403536.037431
worst,452873.488251,449140.025079,366543.803222,468424.608811,383510.651617,447791.95324,403787.892242,557946.234553
sem,687.317437,4274.857367,2958.890549,5670.963741,4896.140624,2517.404072,5580.987561,8071.425854


Unconditional reduction in error: 37.11%


# Outlier filtering

If the range of interest is compressed by outliers for plots we can filter them below; following config from above

In [256]:
# Filter top k
if filter_out_worst:
    k = filter_worst_k_for[dataset]
else: 
    k = 0
worst = filtered_results.sort_values(metric).tail(k)
results_df = filtered_results[~filtered_results.index.isin(worst.index)]

In [257]:
fig = threshold_histogram(results_df, metric, min_metric, max_metric, perf_of_mean)
fig.show()
fig.write_image(OUTPUT_DIR / f"{dataset}_{config_str}_filtered_{k}_histogram.{image_ext}")


# Record all performances in one CSV

In [258]:
_sorted = original_results[pipeline_cols + ["preprocesser(s)", "seed", metric]].sort_values(metric)
pipelines_cols_to_hash = pipeline_cols.copy()
pipelines_cols_to_hash.remove("preprocessers")
pipelines_cols_to_hash.append("preprocesser(s)")
_sorted["hash"] = _sorted.apply(lambda x: hash(tuple(x[pipelines_cols_to_hash])), axis=1)
_sorted.to_csv(table_dir / f"{config_str}_all_performances.csv")

# Look at best and worst performers

In [259]:
k = 10
if filter_out_replicates:
    num_replicates = 1
else:
    num_replicates = 4
unique_pipeline_cols = [col for col in pipeline_cols if col not in ["preprocessers", "finetune"]]
unique_pipeline_cols = ["preprocesser(s)"] + unique_pipeline_cols
unique_pipelines = filtered_results.groupby(unique_pipeline_cols)[[metric, "seed"]].agg(
 {
     metric: ["mean", "sem"],
     "seed" : "count"
 }
)

In [260]:
unique_pipelines_with_replicates = unique_pipelines[unique_pipelines["seed", "count"] == num_replicates]

In [261]:
if filter_out_replicates:
    columns_to_save = [[metric, "mean"]]
else:
    columns_to_save = [[metric, "mean"],[metric, "sem"]]
    
logger.info("Best performers")
best = unique_pipelines_with_replicates.sort_values((metric, "mean")).head(k)
latex_table = best[columns_to_save].to_latex(float_format=simple_format)
latex_table = process_latex_table(latex_table)
with open(table_dir / f"{config_str}_best_{k}.tex", "w") as f:
    f.write(latex_table)
display(best)
logger.info("Worst performers")
worst = unique_pipelines_with_replicates.sort_values((metric, "mean")).tail(k)
latex_table = worst[columns_to_save].to_latex(float_format=simple_format)
latex_table = process_latex_table(latex_table)
with open(table_dir / f"{config_str}_worst_{k}.tex", "w") as f:
    f.write(latex_table)
display(worst)


2025-03-13 16:58:52,262 - analysis - INFO - Best performers


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,rmse,rmse,seed
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,sem,count
preprocesser(s),tokeniser,featuriser,model,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
empty,bert,bert_mean,deep_linear_ft,350867.200857,1645.266271,4
"Le,Lo",bert,bert_mean,deep_linear_ft,351301.832889,6202.251458,4
empty,bert,bert_cls,deep_linear_ft,354328.818128,8510.649599,4
NP,bert,bert_cls,deep_linear_ft,359783.353527,9632.145181,4
"Le,Lo,NP,NSN",bert,bert_cls,deep_linear_ft,366543.803222,7608.146768,4
empty,bert,bert_cls,mlp_ft,367336.171585,6514.259578,4
Le,bpe,tf_idf,mlp,367445.97519,2354.992179,4
empty,bert,bert_mean,mlp_ft,371003.074074,8945.019364,4
Le,bpe,tf_idf,deep_linear,371669.056646,2116.859016,4
"Le,NSN",bert,bert_mean,resnet_ft,374181.006172,8615.468414,4


2025-03-13 16:58:52,284 - analysis - INFO - Worst performers


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,rmse,rmse,seed
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,sem,count
preprocesser(s),tokeniser,featuriser,model,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
empty,whitespace,bow_binary,xgboost,461134.171131,27716.435925,4
NSCC,whitespace,bow_count,mlp,461816.274199,4801.034471,4
"Le,NSCC",treebank,bow_count,mlp,465625.543803,3609.738392,4
Lo,treebank,bow_count,mlp,468424.608811,5086.328134,4
S,bert,bert_cls,xgboost,470257.424131,4629.749061,4
Le,bert,bert_mean,xgboost,478427.381822,25990.831515,4
"Le,Lo",bert,bert_mean,xgboost,478427.381822,25990.831515,4
"NSCC,S",whitespace,glove_mean,xgboost,490151.336132,16207.672852,4
Lo,bert,bert_cls,xgboost,557946.234553,101684.646283,4
empty,bert,bert_cls,xgboost,557946.234553,101684.646283,4


Brief foray into looking at sem distribution - noting it is much more variable for finetuning

In [262]:
# Create histogram of standard errors, split by finetuned vs non-finetuned models
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df = unique_pipelines_with_replicates.copy().reset_index()

# Get boolean mask for finetuned models
ft_mask = df['model'].str.contains('_ft')

# Create figure
fig = go.Figure()

# Add histograms
fig.add_trace(go.Histogram(
    x=df[~ft_mask][metric, 'sem'],
    name='Non-finetuned',
    opacity=0.5,
    nbinsx=20,
    histnorm="percent"
))

fig.add_trace(go.Histogram(
    x=df[ft_mask][metric, 'sem'], 
    name='Finetuned',
    opacity=0.5,
    nbinsx=20,
    histnorm="percent"
))

# Update layout
fig.update_layout(
    title='Distribution of Standard Errors by Model Type',
    xaxis_title='Standard Error of Mean',
    yaxis_title='Percentage',
    barmode='overlay',
    showlegend=True,
    template='plotly_white'
)

# Log summary statistics
logger.info("\nStandard Error Summary Statistics:")
logger.info(f"Non-finetuned mean: {df[~ft_mask][metric, 'sem'].mean():.2f}")
logger.info(f"Finetuned mean: {df[ft_mask][metric, 'sem'].mean():.2f}")

2025-03-13 16:58:52,339 - analysis - INFO - 
Standard Error Summary Statistics:
2025-03-13 16:58:52,341 - analysis - INFO - Non-finetuned mean: 5819.22
2025-03-13 16:58:52,343 - analysis - INFO - Finetuned mean: 8108.25


# Confirm coverage of component choices

In [263]:
def count_unique_values(df: pd.DataFrame, feature: str):
    return df[feature].value_counts().to_dict()

def count_unique_values_of_set(df: pd.DataFrame, feature: str):
    counts = {}
    for row in df.itertuples():
        in_the_bag = getattr(row, feature)
        for feature_value in in_the_bag:
            counts[feature_value] = counts.get(feature_value, 0) + 1
    return counts

def create_pipeline_counts_table(df: pd.DataFrame):
    tables = []
    # Get counts for each pipeline step
    counts = {}
    counts["preprocessers"] = count_unique_values_of_set(df, "preprocessers")
    counts["tokeniser"] = count_unique_values(df, "tokeniser") 
    counts["featuriser"] = count_unique_values(df, "featuriser")
    counts["model"] = count_unique_values(df, "model")
    
    for feature in ["preprocessers", "tokeniser", "featuriser", "model"]:
        counts_df = pd.DataFrame({feature: pd.Series(counts[feature])})
        tables.append(counts_df)
        latex_table = counts_df.to_latex(
            float_format=simple_format,
            na_rep='',
            header=False,
            escape=False
        )
        latex_table = latex_table.replace("_", "-")

        # Save to file
        table_path = table_dir / f"{config_str}_pipeline_counts_{feature}.tex"
        with open(table_path, 'w') as f:
            f.write(latex_table)
        
    return tables

# Generate and display the table
pipeline_counts = create_pipeline_counts_table(results_df)
for table in pipeline_counts:
    display(table)


Unnamed: 0,preprocessers
no_stopwords_nltk,100
empty,168
lemmatize,136
lowercase,196
no_punctuation,168
stem,136
no_stopwords_spacy,104


Unnamed: 0,tokeniser
bert,248
whitespace,116
stanford,92
bpe,56
treebank,48


Unnamed: 0,featuriser
bow_binary,116
tf_idf,104
bert_cls,92
bert_mean,84
bow_count,84
glove_mean,80


Unnamed: 0,model
mlp,124
deep_linear,112
catboost,104
xgboost,96
resnet,72
deep_linear_ft,20
resnet_ft,20
mlp_ft,12


# Look at spread associated with each component

In [264]:
def histogram_on_feature(
        df: pd.DataFrame, 
        feature: str, 
        metric: str, 
        threshold: float = None, 
        histogram: bool = True,
        mean_line: float = None,
        **kwargs
    ):
    if threshold is not None:
        df = df[df[metric] < threshold]
    unique_features = df[feature].unique()
    unique_features = sorted(unique_features)
    if histogram:
        fig = make_subplots(rows=len(unique_features), cols=1, shared_xaxes=True, subplot_titles=unique_features)
    else:
        fig = make_subplots(rows=1, cols=len(unique_features), shared_yaxes=True, subplot_titles=unique_features)
    for i, feature_value in enumerate(unique_features):
        feature_data = df[df[feature] == feature_value][metric]
        if histogram:
            fig.add_trace(
                go.Histogram(
                x=feature_data, 
                name=feature_value, 
                texttemplate="%{y:.2f}",
                histnorm="percent",
                marker_color="gray",
                **kwargs
            ), 
                row=i+1, col=1
            )
        else:
            fig.add_trace(
                go.Box(
                    y=feature_data,
                    name=feature_value,
                    marker_color="gray",
                    **kwargs
                ),
                row=1, col=i+1
            )
    if mean_line is not None:
        if histogram:
            fig.add_vline(x=mean_line, line_dash="dash", line_color="red")
        else:
            fig.add_hline(y=mean_line, line_dash="dash", line_color="red")
    if histogram:
        fig.update_layout(height=200 * len(unique_features))
    else:
        fig.update_layout(width=200 * len(unique_features))
    fig.update_layout(title=f"Distribution of {metric} by {feature}")
    fig.update_layout(showlegend=False)
    return fig

def histogram_on_set(
        df: pd.DataFrame, 
        feature: str, 
        metric: str, 
        threshold: float = None,
        histogram: bool = True,
        mean_line: float = None,
        **kwargs
    ):
    if threshold is not None:
        df = df[df[metric] < threshold]
    bags = defaultdict(list)
    for row in df.itertuples():
        bag = getattr(row, feature)
        for feature_value in bag:
            bags[feature_value].append(getattr(row, metric))
        if len(bag) == 0:
            bags["Empty"].append(getattr(row, metric))
    bags = dict(sorted(bags.items()))
    if histogram:
        fig = make_subplots(rows=len(bags), cols=1, shared_xaxes=True, subplot_titles=list(bags.keys()))
    else:
        fig = make_subplots(rows=1, cols=len(bags), shared_yaxes=True, subplot_titles=list(bags.keys()))
    for i, (feature_value, values) in enumerate(bags.items()):
        if histogram:
            fig.add_trace(
                go.Histogram(
                    x=values,
                    name=feature_value,
                    texttemplate="%{y:.2f}",
                    histnorm="percent",
                    marker_color="gray",
                    **kwargs
                ),
                row=i+1, col=1
            )
        else:
            fig.add_trace(
                go.Box(
                    y=values,
                    name=feature_value,
                    marker_color="gray",
                    **kwargs
                ),
                row=1, col=i+1
            )
    if mean_line is not None:
        if histogram:
            fig.add_vline(x=mean_line, line_dash="dash", line_color="red")
        else:
            fig.add_hline(y=mean_line, line_dash="dash", line_color="red")
    if histogram:
        fig.update_layout(height=200 * len(bags))
    else:
        fig.update_layout(width=200 * len(bags))
    fig.update_layout(title=f"Distribution of {metric} for {feature}")
    fig.update_layout(showlegend=False)
    return fig


fig = histogram_on_set(results_df, "preprocessers", metric, mean_line=perf_of_mean)
fig.write_image(histogram_dir / f"{config_str}_preprocessers_histogram.{image_ext}")
for feature in ["tokeniser", "featuriser", "model"]:
    fig = histogram_on_feature(results_df, feature, metric, mean_line=perf_of_mean)
    fig.write_image(histogram_dir / f"{config_str}_{feature}_histogram.{image_ext}")


In [265]:
fig = histogram_on_set(results_df, "preprocessers", metric, histogram=False, mean_line=perf_of_mean)
# fig.show()
# fig.write_html(OUTPUT_DIR / f"{dataset}_{metric}_filtered_{k}_preprocessers_boxplot.html")
fig.write_image(boxplot_dir / f"{config_str}_preprocessers_boxplot.{image_ext}")

for feature in ["tokeniser", "featuriser", "model"]:
    fig = histogram_on_feature(results_df, feature, metric, histogram=False, mean_line=perf_of_mean)
    # fig.show()
    # fig.write_html(OUTPUT_DIR / f"{dataset}_{metric}_filtered_{k}_{feature}_boxplot.html")
    fig.write_image(boxplot_dir / f"{config_str}_{feature}_boxplot.{image_ext}")


# Develop filtering/conditioning logic

In [266]:
def condition_on(
        equals_conditions: list[tuple[str, str]], 
        not_equals_conditions: list[tuple[str, str]], 
        df: pd.DataFrame
    ):
    """
    A condition is a tuple of (column, value)

    We want to filter the dataframe to only include rows where the equals 
    conditions are true AND the not equals conditions are false.
    """
    subset = df.copy()  
    for condition in equals_conditions:
        idx = subset[condition[0]].str.contains(condition[1])
        subset = subset.loc[idx, :]
    for condition in not_equals_conditions:
        idx = subset[condition[0]] != condition[1]
        subset = subset.loc[idx, :]
    return subset

def condition_featuriser(df: pd.DataFrame, option: str):
    condition = [("featuriser", option)]
    return condition_on(condition, [], df)

def condition_not_featuriser(df: pd.DataFrame, option: str):
    condition = [("featuriser", option)]
    return condition_on([], condition, df)

featurisers = ["glove_mean", "bow_count", "bow_binary", "tf_idf", "bert_cls", "bert_mean"]
conditioned_on = {}
conditioned_not_on = {}

for condition in featurisers:
    conditioned_on[condition] = condition_featuriser(filtered_results, condition)
    conditioned_not_on[condition] = condition_not_featuriser(filtered_results, condition)

# BoW for DL

In [267]:
if filter_only_replicates:
    to_consider_rmse = filtered_results.groupby(["preprocesser(s)", "tokeniser", "featuriser", "model"]).agg({metric: ["mean", "sem"]}).reset_index()
    to_consider_rmse.columns = ["preprocesser(s)", "tokeniser", "featuriser", "model", f"{metric}", f"{metric}_sem"]
    title = "dist of mean RMSE for each pipeline"
else:
    to_consider_rmse = filtered_results
    title = "dist of RMSE for each pipeline"

In [268]:
dl_models = ["mlp", "resnet", "deep_linear"]
gbdt_models = ["catboost", "xgboost"]
bow_models = ["bow_count", "bow_binary", "tf_idf"]
ignore_glove = to_consider_rmse["featuriser"] != "glove_mean"
bow_comparison_df = to_consider_rmse[ignore_glove]
dl_models = dl_models
ft_dl_models = [f"{x}_ft" for x in dl_models]
is_dl_model = bow_comparison_df["model"].isin(dl_models)
is_ft_dl_model = bow_comparison_df["model"].isin(ft_dl_models)
is_bow_model = bow_comparison_df["featuriser"].isin(bow_models)
is_gbdt_model = bow_comparison_df["model"].isin(gbdt_models)

bow_dl_results = bow_comparison_df[
    is_dl_model & is_bow_model
]
bow_ft_dl_results = bow_comparison_df[
    is_ft_dl_model & is_bow_model
]
bow_gbdt_results = bow_comparison_df[
    is_gbdt_model & is_bow_model
]
not_bow_dl_results = bow_comparison_df[
    is_dl_model & ~is_bow_model
]
not_bow_ft_dl_results = bow_comparison_df[
    is_ft_dl_model & ~is_bow_model
]
bow_gbdt_results["model_type"] = "GBDT w/ BoW"
bow_dl_results["model_type"] = "DL w/ BoW"
bow_ft_dl_results["model_type"] = "DL w/ BoW + Finetuning" # Wait this shouldnt be possible?
not_bow_dl_results["model_type"] = "DL w/o BoW"
not_bow_ft_dl_results["model_type"] = "DL w/o BoW + Finetuning"

comparison = pd.concat([
    bow_dl_results, 
    not_bow_dl_results, 
    bow_ft_dl_results, 
    not_bow_ft_dl_results,
    bow_gbdt_results
])
fig = px.box(
    comparison,
    x="model_type",
    y=metric,
    points="outliers",
    title=title
)
fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="gray")
fig.write_image(
    OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_bow_dl_gbdt.{image_ext}",
    height=SAVE_HEIGHT,
    width=SAVE_WIDTH,
    scale=2
)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

# DL w/ Finetuning vs w/out

Look into whether we have graphically noticable differences between pipelines that had BERT featurisation using and not using finetuning

In [269]:
plot_version = 4

In [270]:
unique_models = filtered_results["model"].unique()
unique_models 

array(['mlp_ft', 'deep_linear_ft', 'resnet_ft', 'mlp', 'deep_linear',
       'xgboost', 'resnet', 'catboost'], dtype=object)

In [271]:
if False:
    to_consider_rmse = filtered_results.groupby(["preprocesser(s)", "tokeniser", "featuriser", "model"]).agg({metric: ["mean", "sem"]}).reset_index()
    to_consider_rmse.columns = ["preprocesser(s)", "tokeniser", "featuriser", "model", f"{metric}", f"{metric}_sem"]
else:
    to_consider_rmse = filtered_results

In [272]:
using_ft = condition_on([("model", "ft")], [], to_consider_rmse)
with_mlp = to_consider_rmse["model"] == "mlp"
with_linear = to_consider_rmse["model"] == "deep_linear"
with_resnet = to_consider_rmse["model"] == "resnet"
using_dl = with_mlp | with_linear | with_resnet
with_bert_featurizer = (to_consider_rmse["featuriser"] == "bert_cls") | (to_consider_rmse["featuriser"] == "bert_mean")
with_bow_featurizer = (to_consider_rmse["featuriser"] == "bow_count") | (to_consider_rmse["featuriser"] == "bow_binary") | (to_consider_rmse["featuriser"] == "tf_idf")
with_glove_featurizer = (to_consider_rmse["featuriser"] == "glove_mean")

using_dl_no_ft = using_dl & with_bert_featurizer
incompatible_dl = using_dl & ~with_bert_featurizer
bert_no_dl = ~using_dl & with_bert_featurizer
using_dl_w_glove = using_dl & with_glove_featurizer
dl_w_bow = using_dl & with_bow_featurizer

subsampled = {}

subsampled["using_ft"] = using_ft
subsampled["using_dl_no_ft"] = to_consider_rmse[using_dl_no_ft]
subsampled["using_dl_no_ft_incompatible"] = to_consider_rmse[incompatible_dl]
subsampled["using_dl_w_bow"] = to_consider_rmse[dl_w_bow]
subsampled["using_dl_w_glove"] = to_consider_rmse[using_dl_w_glove]
subsampled["using_bert_no_dl"] = to_consider_rmse[bert_no_dl]
subsampled["using_bow"] = to_consider_rmse[with_bow_featurizer]

compare_dl_ft = pd.DataFrame({
    "Model Type": ["DL + BERT w/ Finetuning", "DL + BERT w/o Finetuning", "DL + BoW"],
    "Mean": [
        subsampled["using_ft"][metric].mean(), 
        subsampled["using_dl_no_ft"][metric].mean(), 
        subsampled["using_dl_w_bow"][metric].mean()
    ],
    "Standard Error": [
        subsampled["using_ft"][metric].sem(), 
        subsampled["using_dl_no_ft"][metric].sem(), 
        subsampled["using_dl_w_bow"][metric].sem()
    ]
}).round(0)

latex_table = compare_dl_ft.to_latex(
    index=False,
    float_format=simple_format
)
latex_table = process_latex_table(latex_table)
with open(table_dir / f"{config_str}_compare_dl_ft.tex", "w") as f:
    f.write(latex_table)

subsampled["using_ft"]["model_type"] = "BERT+DL w/ Finetuning"
subsampled["using_dl_no_ft"]["model_type"] = "BERT+DL w/o Finetuning"
subsampled["using_dl_no_ft_incompatible"]["model_type"] = "Not BERT+DL"
subsampled["using_bert_no_dl"]["model_type"] = "BERT w/o DL"
subsampled["using_dl_w_bow"]["model_type"] = "BoW+DL"
subsampled["using_dl_w_glove"]["model_type"] = "GloVe+DL"
subsampled["using_bow"]["model_type"] = "BoW"

# Can also group by pipeline_hash and model_type - to compare main effects, not thought it through though so will just use raw data
grouped_subsampled = {}
for sampled in subsampled:
    grouped_subsampled[sampled] = subsampled[sampled].groupby(["pipeline_hash", "model_type"]).agg({metric: "mean"}).reset_index()

if plot_version == 5:
    to_analyze = grouped_subsampled
else:
    to_analyze = subsampled

to_combine = [
    to_analyze["using_ft"], 
    to_analyze["using_dl_no_ft"]
]
if plot_version == 2:
    to_combine.append(to_analyze["using_dl_no_ft_incompatible"])
if plot_version == 3:
    to_combine.append(to_analyze["using_dl_w_bow"])
    to_combine.append(to_analyze["using_dl_w_glove"])
    to_combine.append(to_analyze["using_bert_no_dl"])
if plot_version == 4 or plot_version == 5:
    to_combine.append(to_analyze["using_bow"])
combined = pd.concat(to_combine)


fig = px.box(
    combined,
    y=metric,
    x="model_type",
    points="outliers"
)
fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="gray")
savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft.{image_ext}"
if plot_version == 2:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_incompatible.{image_ext}"
if plot_version == 3:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_bow.{image_ext}"
if plot_version == 4:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_bow_v4.{image_ext}"
if plot_version == 5:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_bow_v5.{image_ext}"
fig.write_image(
    savepath,
    height=SAVE_HEIGHT,
    width=SAVE_WIDTH,
    scale=2
)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [273]:
using_gbdt_w_bow = to_consider_rmse[
    ~using_dl & with_bow_featurizer
]
using_gbdt_w_bow["model_type"] = "BoW + GBDT"
using_gbdt_w_glove = to_consider_rmse[
    ~using_dl & with_glove_featurizer
]
using_gbdt_w_glove["model_type"] = "GloVe + GBDT"
using_gbdt_w_bert = to_consider_rmse[
    ~using_dl & with_bert_featurizer
]
using_gbdt_w_bert["model_type"] = "BERT + GBDT"

subsampled["using_gbdt_w_bow"] = using_gbdt_w_bow
subsampled["using_gbdt_w_glove"] = using_gbdt_w_glove
subsampled["using_gbdt_w_bert"] = using_gbdt_w_bert

if plot_version == 5:
    grouped_subsampled["using_gbdt_w_bow"] = using_gbdt_w_bow.groupby(["pipeline_hash", "model_type"]).agg({metric: "mean"}).reset_index()
    grouped_subsampled["using_gbdt_w_glove"] = using_gbdt_w_glove.groupby(["pipeline_hash", "model_type"]).agg({metric: "mean"}).reset_index()
    grouped_subsampled["using_gbdt_w_bert"] = using_gbdt_w_bert.groupby(["pipeline_hash", "model_type"]).agg({metric: "mean"}).reset_index()

all_combined = pd.concat([
    combined, to_analyze["using_gbdt_w_bow"]
])

if plot_version == 3 or plot_version == 4 or plot_version == 5:
    all_combined = pd.concat([
        all_combined, 
        to_analyze["using_gbdt_w_glove"], 
        to_analyze["using_gbdt_w_bert"]
    ])

fig = px.box(
    all_combined,
    y=metric,
    x="model_type",
    points="outliers"
)
fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="gray")
savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_and_bow.{image_ext}"
if plot_version == 2:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_and_bow_incompatible.{image_ext}"
if plot_version == 3:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_and_bow_w_wout_dl.{image_ext}"
if plot_version == 4:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_and_bow_v4.{image_ext}"
if plot_version == 5:
    savepath = OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_compare_dl_ft_and_bow_v5.{image_ext}"
fig.write_image(
    savepath,
    height=SAVE_HEIGHT,
    width=SAVE_WIDTH,
    scale=2
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



For BERT featurization to reach the performance of BoW (with or without DL, surprisingly DL performing strongly with BoW) then we must use finetuning

## Testing for equivalence to validate reading of graph

In [274]:
pairs = [
    (("ft", to_analyze["using_ft"]), ("no_ft", to_analyze["using_dl_no_ft"])),
    (("ft", to_analyze["using_ft"]), ("bow", to_analyze["using_bow"])),
    (("no_ft", to_analyze["using_dl_no_ft"]), ("bow", to_analyze["using_bow"])),
]
rows = []
for (a, b) in pairs:
    a_name, a_df = a
    b_name, b_df = b
    p_val, diff = permutation_test(a_df[metric], b_df[metric])
    row = {
        'comparison': f'{a_name} vs {b_name}',
        'mean_difference': diff,
        'p_value': p_val
    }
    rows.append(row)
results = pd.DataFrame(rows)
table = results.to_latex(
    index=False,
    float_format=simple_format
)
table = process_latex_table(table)
savepath = OUTPUT_DIR / dataset / "tables" / f"{config_str}_compare_dl_w_wout_ft_and_bow.tex"
if plot_version == 5:
    savepath = OUTPUT_DIR / dataset / "tables" / f"{config_str}_compare_dl_w_wout_ft_and_bow_v5.tex"
with open(savepath, "w") as f:
    f.write(table)
display(results)
print(f"Saved to {savepath}")

Unnamed: 0,comparison,mean_difference,p_value
0,ft vs no_ft,-52351.657714,1e-05
1,ft vs bow,-49932.133591,1e-05
2,no_ft vs bow,2419.524123,0.526265


Saved to notebook_output/online_boat_listings/tables/rmse_only_4_replicates_compare_dl_w_wout_ft_and_bow.tex


## Do this blocked pairwise on the seeds
Most powerful approach - see if this gives substantially different results

In [275]:
# Create a dictionary similar to subsampled but with seed filtering
subsampled_by_seed = {}
seeds = subsampled["using_ft"]["seed"].unique()
for seed in seeds:
    if seed not in subsampled_by_seed:
        subsampled_by_seed[seed] = {}
    # Group by key and get mean RMSE for each seed
    for key, df in subsampled.items():
        df_seed = df[df["seed"] == seed]
        subsampled_by_seed[seed][key] = df_seed[metric].values


In [276]:
subsampled_by_seed.keys()

dict_keys([1, 2, 3, 97])

In [277]:
def blocked_permutation_test(nested_dict, key1, key2, n_permutations=10000):
    """
    Perform a blocked permutation test comparing two groups within blocks.
    
    Args:
        nested_dict: Dictionary of dictionaries where first level is blocks and second level is groups, final level should be numpy arrays
        key1: First group to compare
        key2: Second group to compare 
        n_permutations: Number of permutations to perform
        
    Returns:
        p_value: P-value from permutation test
        observed_diff: Observed difference in means between groups
    """
    # Calculate observed difference in means
    group_1 = []
    group_2 = []
    blocks = nested_dict.keys()
    for block in blocks:
        group_1.append(nested_dict[block][key1])
        group_2.append(nested_dict[block][key2])
    observed_diff = np.mean(np.concatenate(group_1)) - np.mean(np.concatenate(group_2))
    
    # Perform permutations within each block
    count = 0
    for _ in range(n_permutations):
        perm_group_1 = []
        perm_group_2 = []
        for block in blocks:
            # Get values for both groups in this block
            vals1 = nested_dict[block][key1]
            vals2 = nested_dict[block][key2]
            all_vals = np.concatenate([vals1, vals2])
            
            # Randomly permute and split back into groups
            np.random.shuffle(all_vals)
            perm1 = all_vals[:len(vals1)]
            perm2 = all_vals[len(vals1):]
            perm_group_1.append(perm1)
            perm_group_2.append(perm2)
            
        # Calculate overall permuted difference
        perm_mean_diff = np.mean(np.concatenate(perm_group_1)) - np.mean(np.concatenate(perm_group_2))
        if abs(perm_mean_diff) >= abs(observed_diff):
            count += 1
            
    p_value = (count + 1) / (n_permutations + 1)
    
    return p_value, observed_diff

In [278]:

# Create synthetic data from normal distribution to demonstrate importance of blocked testing
# Create two blocks with different means but same treatment effect
n_per_group = 20
blocked_vals = {
    "block1": {},
    "block2": {}
}
blocked_vals["block1"]["control"] = np.random.normal(loc=100, scale=10, size=n_per_group)
blocked_vals["block2"]["control"] = np.random.normal(loc=150, scale=10, size=n_per_group)

blocked_vals["block1"]["treatment"] = np.random.normal(loc=105, scale=10, size=n_per_group) # Effect size = 5
blocked_vals["block2"]["treatment"] = np.random.normal(loc=155, scale=10, size=n_per_group)  

# Naive t-test ignoring blocks
all_control = np.concatenate([blocked_vals["block1"]["control"], blocked_vals["block2"]["control"]])
all_treatment = np.concatenate([blocked_vals["block1"]["treatment"], blocked_vals["block2"]["treatment"]])

p_val, observed_diff = permutation_test(all_treatment, all_control)
print(f"Naive permutation test p-value: {p_val:.3f}")
print(f"Observed difference: {observed_diff:.3f}")


p_val, observed_diff = blocked_permutation_test(blocked_vals, "treatment", "control")
print(f"Blocked permutation test p-value: {p_val:.3f}")
print(f"Observed difference: {observed_diff:.3f}")


Naive permutation test p-value: 0.362
Observed difference: 5.971
Blocked permutation test p-value: 0.006
Observed difference: 5.971


In [279]:
name_to_col_map = {
    "ft": "using_ft",
    "no_ft": "using_dl_no_ft",
    "bow": "using_bow",
}
p_vals = []
for a,b in pairs:
    a_name, a_df = a
    b_name, b_df = b    
    p_val, observed_diff = blocked_permutation_test(subsampled_by_seed, name_to_col_map[a_name], name_to_col_map[b_name])
    print(f"Blocked permutation test p-value for {a_name} vs {b_name}: {p_val:.4f}")
    print(f"Observed difference: {observed_diff:.4f}")
    p_vals.append(p_val)


Blocked permutation test p-value for ft vs no_ft: 0.0001
Observed difference: -52351.6577
Blocked permutation test p-value for ft vs bow: 0.0001
Observed difference: -49932.1336
Blocked permutation test p-value for no_ft vs bow: 0.5201
Observed difference: 2419.5241


In [280]:
# Apply Benjamini-Hochberg correction to p-values
p_vals = np.array(p_vals)
n_tests = len(p_vals)
sorted_p_idx = np.argsort(p_vals)
sorted_p_vals = p_vals[sorted_p_idx]

# Calculate BH critical values
bh_values = np.arange(1, n_tests + 1) * alpha / n_tests

# Find largest p-value that is less than corresponding BH critical value
is_significant = sorted_p_vals <= bh_values
if np.any(is_significant):
    max_sig_idx = np.where(is_significant)[0][-1]
    adjusted_alpha = bh_values[max_sig_idx]
    logger.info(f"\nAfter Benjamini-Hochberg correction (alpha={alpha}):")
    logger.info(f"Adjusted significance threshold: {adjusted_alpha:.4f}")
    logger.info(f"First {max_sig_idx+1} comparisons are significant")
else:
    logger.info("\nNo comparisons significant after Benjamini-Hochberg correction")

# Plot p-values against their rank with BH line
fig = go.Figure()

# Plot points
rank = np.arange(1, len(sorted_p_vals) + 1)
fig.add_trace(
    go.Scatter(
        x=rank,
        y=sorted_p_vals,
        mode='markers',
        name='P-values',
        marker=dict(color='blue')
    )
)

# Plot BH line
fig.add_trace(
    go.Scatter(
        x=rank,
        y=bh_values,
        mode='lines',
        name=f'BH line (α={alpha})',
        line=dict(color='red', dash='dash')
    )
)

# Update layout
fig.update_layout(
    title='P-values vs Benjamini-Hochberg Critical Values',
    xaxis_title='Rank',
    yaxis_title='P-value',
    showlegend=True,
    template='plotly_white'
)

fig.show()




2025-03-13 16:59:02,762 - analysis - INFO - 
After Benjamini-Hochberg correction (alpha=0.05):
2025-03-13 16:59:02,763 - analysis - INFO - Adjusted significance threshold: 0.0333
2025-03-13 16:59:02,764 - analysis - INFO - First 2 comparisons are significant


# bert_cls vs glove: Pairwise comparison

In [281]:
bert_v_glove_version = 2

In [282]:
cols_of_interest = latex_cols + ["seed", "preprocessers"]

bert_performances = pd.concat([conditioned_on["bert_cls"], conditioned_on["bert_mean"]])[cols_of_interest]
if bert_v_glove_version == 1:
    glove_performances = conditioned_on["glove_mean"][cols_of_interest]
elif bert_v_glove_version == 2:
    glove_performances = condition_on(
        equals_conditions=[("featuriser", "glove_mean")],
        not_equals_conditions=[("model", "ft")],
        df=filtered_results
    )[cols_of_interest]

# def filter_out_stemming(df):
#     return df[df["preprocessers"].apply(lambda x: "stem" not in x)]

# glove_performances = filter_out_stemming(glove_performances)
# bert_performances = filter_out_stemming(bert_performances)

shared_pipelines = glove_performances.merge(
    bert_performances,
    on=[
        "preprocesser(s)",  # Unsure if we should also merge on preprocesser(s) as it fragments results, but it strengthens test
        "model",
        "seed"
    ],
    suffixes=["_glove", "_bert"],
    how="inner"
)
shared_pipelines[metric+"_diff"] = shared_pipelines[metric+"_bert"] - shared_pipelines[metric+"_glove"]
shared_pipelines = shared_pipelines.sort_values(metric+"_diff")
print(shared_pipelines.shape)

(36, 12)


In [283]:
from itertools import product

def binary_counter_gen(len):
    for mask in product([True, False], repeat=len):
        yield np.array(mask)


In [284]:
def paired_permutation_test(a, b, n_permutations=10000, random=True):
    """
    Tests that the mean difference between two paired samples is significantly different from 0.

    Do so by shuffling membership of a and b and checking how often the mean difference is greater 
    than the observed difference.
    """

    if len(a) != len(b):
        raise ValueError("a and b must have the same length")
    max_num_perms = 2**len(a)
    if n_permutations > max_num_perms:
        if random:
            logger.warning(f"n_permutations ({n_permutations:.0e}) is greater than the maximum number of permutations ({max_num_perms:.0e}). This may give misleading results.")
        else:
            logger.warning(f"n_permutations ({n_permutations:.0e}) is greater than the maximum number of permutations ({max_num_perms:.0e}). Flooring to {max_num_perms}.")
            n_permutations = max_num_perms
    elif n_permutations < max_num_perms and not random:
        logger.warning(f"n_permutations ({n_permutations:.0e}) is less than the maximum number of permutations ({max_num_perms:.0e}). This may give misleading results. As such we will use all permutations.")
        n_permutations = max_num_perms
    a_copy = a.copy()
    b_copy = b.copy()
    observed_diff = (a_copy - b_copy).mean()
    more_extreme_than_observed = 0.
    if not random:
        binary_counter = iter(binary_counter_gen(len(a)))
    for i in range(n_permutations):
        new_a = a_copy.copy()
        new_b = b_copy.copy()
        if random:
            mask = np.random.choice([True, False], size=len(a), replace=True)
            new_a[mask] = b_copy[mask]
            new_b[mask] = a_copy[mask]
            mean_diff = (new_a - new_b).mean()
            if abs(mean_diff) >= abs(observed_diff):
                more_extreme_than_observed += 1
        else:
            mask = next(binary_counter)
            new_a[mask] = b_copy[mask]
            new_b[mask] = a_copy[mask]
            mean_diff = (new_a - new_b).mean()
            if abs(mean_diff) >= abs(observed_diff):
                more_extreme_than_observed += 1
    return (more_extreme_than_observed+1) / (n_permutations+1), observed_diff

In [285]:
# Test that paired permutation test works correctly on identical data
normal_sample = np.random.normal(size=100)
same_normal_sample = np.random.normal(size=100)
p_val, observed_diff = paired_permutation_test(normal_sample, same_normal_sample, random=True)
logger.info(f"\nPaired permutation test on identical data:")
logger.info(f"Observed difference: {observed_diff:.2f}")
logger.info(f"P-value: {p_val}")
logger.info(f"Statistically {'different' if p_val < alpha else 'not different'} from 0 at {alpha} significance")


2025-03-13 16:59:03,282 - analysis - INFO - 
Paired permutation test on identical data:
2025-03-13 16:59:03,282 - analysis - INFO - Observed difference: -0.09
2025-03-13 16:59:03,284 - analysis - INFO - P-value: 0.45695430456954306
2025-03-13 16:59:03,285 - analysis - INFO - Statistically not different from 0 at 0.05 significance


In [286]:
p_val, observed_diff = paired_permutation_test(
    shared_pipelines[metric+"_bert"].values, 
    shared_pipelines[metric+"_glove"].values, 
    random=True,
    n_permutations=100000
)
row = {
    "dataset": dataset_name_map[dataset],
    "mean_difference (BERT - GloVE)": observed_diff,
    "p_value": p_val
}
results = pd.DataFrame([row])
latex_table = results.to_latex(
    index=False,
    float_format=simple_format
)
latex_table = process_latex_table(latex_table)

savepath = OUTPUT_DIR / dataset / "tables" / f"{config_str}_glove_vs_bert.tex"
with open(savepath, "w") as f:
    f.write(latex_table)
print(f"Saved to {savepath}")
display(results)

Saved to notebook_output/online_boat_listings/tables/rmse_only_4_replicates_glove_vs_bert.tex


Unnamed: 0,dataset,mean_difference (BERT - GloVE),p_value
0,obl,-32909.403372,1e-05


Sanity check with a validated t test implementation

In [287]:
# Compare BERT and GloVE with paired t-test - sanity check on permutation test
t_stat, t_pval = stats.ttest_rel(shared_pipelines[metric+"_bert"].values, shared_pipelines[metric+"_glove"].values)

logger.info(f"\nPaired t-test results:")
logger.info(f"T-statistic: {t_stat:.3f}")
logger.info(f"P-value: {t_pval:.3f}")
logger.info(f"Statistically {'different' if t_pval < alpha else 'not different'} at {alpha} significance")


2025-03-13 16:59:06,639 - analysis - INFO - 
Paired t-test results:
2025-03-13 16:59:06,641 - analysis - INFO - T-statistic: -11.841
2025-03-13 16:59:06,642 - analysis - INFO - P-value: 0.000
2025-03-13 16:59:06,643 - analysis - INFO - Statistically different at 0.05 significance


# Do preprocessers matter?

## Preprocessers for traditional NLP

In [None]:
featurisers = ["bow_count", "bow_binary", "tf_idf"]
trad_nlp_results = filtered_results[
    (filtered_results["featuriser"].isin(featurisers))
]
unique_preprocessers = sorted(set(chain.from_iterable(trad_nlp_results["preprocessers"].values)))
uses_preprocessing = trad_nlp_results["preprocessers"].apply(lambda x: x[0] != "empty")
uses_specifically = {}
for preprocesser in unique_preprocessers:
    uses_specifically[preprocesser] = trad_nlp_results["preprocessers"].apply(lambda x: preprocesser in x)

trad_nlp_results["uses_preprocessing"] = uses_preprocessing

fig = px.box(
    trad_nlp_results,
    y=metric,
    color="uses_preprocessing",
    points="outliers"
)
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Create subplots, one for each preprocesser
fig = make_subplots(cols=len(unique_preprocessers), rows=1, 
                    subplot_titles=unique_preprocessers,
                    shared_yaxes=True,
                    vertical_spacing=0.05)

p_values = []

for i, preprocesser in enumerate(unique_preprocessers, 1):
    _with = trad_nlp_results[uses_specifically[preprocesser]][metric]
    _without = trad_nlp_results[~uses_specifically[preprocesser]][metric]
    # Test normality of each distribution
    _, p_with = stats.normaltest(_with)
    _, p_without = stats.normaltest(_without)
    
    # Perform appropriate statistical test based on normality
    if p_with > 0.05 and p_without > 0.05:
        # Both normal - use t-test
        _, p_val = stats.ttest_ind(_with, _without)
        test_type = "t-test"
    else:
        # At least one non-normal - use Mann-Whitney U test
        _, p_val = stats.mannwhitneyu(_with, _without)
        test_type = "Mann-Whitney U"
    p_values.append(p_val)
    annotation = f"{test_type} p-value: {p_val:.3f}"
    logger.info(f"\n{preprocesser}:")
    logger.info(f"With normality p-value: {p_with:.3f}")
    logger.info(f"Without normality p-value: {p_without:.3f}")
    logger.info(annotation)

    # Add box plot for when preprocesser is used
    fig.add_trace(
        go.Box(
            y=_with,
            name="with",
            boxpoints='outliers',
            marker_color="green"
        ),
        row=1, col=i
    )
    
    # Add box plot for when preprocesser is not used
    fig.add_trace(
        go.Box(
            y=_without,
            name="without",
            boxpoints='outliers',
            marker_color="red"
        ),
        row=1, col=i
    )

    fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="gray")

fig.update_layout(
    width=200*len(unique_preprocessers),
    # showlegend=True,
    title_text="Impact of Different Preprocessors",
    yaxis_title=metric
)


fig.show()

2025-03-13 16:59:06,759 - analysis - INFO - 
empty:
2025-03-13 16:59:06,760 - analysis - INFO - With normality p-value: 0.000
2025-03-13 16:59:06,760 - analysis - INFO - Without normality p-value: 0.000
2025-03-13 16:59:06,761 - analysis - INFO - Mann-Whitney U p-value: 0.259
2025-03-13 16:59:06,793 - analysis - INFO - 
lemmatize:
2025-03-13 16:59:06,794 - analysis - INFO - With normality p-value: 0.000
2025-03-13 16:59:06,794 - analysis - INFO - Without normality p-value: 0.652
2025-03-13 16:59:06,795 - analysis - INFO - Mann-Whitney U p-value: 0.002
2025-03-13 16:59:06,831 - analysis - INFO - 
lowercase:
2025-03-13 16:59:06,832 - analysis - INFO - With normality p-value: 0.000
2025-03-13 16:59:06,833 - analysis - INFO - Without normality p-value: 0.785
2025-03-13 16:59:06,834 - analysis - INFO - Mann-Whitney U p-value: 0.010
2025-03-13 16:59:06,880 - analysis - INFO - 
no_punctuation:
2025-03-13 16:59:06,881 - analysis - INFO - With normality p-value: 0.007
2025-03-13 16:59:06,882 - 

Only takeaway: stemming seems to be universally bad, lemmatizing could be good. Generally this is not meaningful as not stratified by other choices

If something passes then we know at least something even FWER corrected is meaningful to remove/keep

## Specifically for 'modern' dl approaches?

In [290]:
deep_learning_models = ["mlp", "resnet", "deep_linear"]
deep_learning_models = deep_learning_models + [f"{x}_ft" for x in deep_learning_models]
deep_learning_featurizers = ["bert_cls", "bert_mean"]
deep_learning_tokenizers = ["bert"]

deep_learning_results = filtered_results[
    (filtered_results["model"].isin(deep_learning_models)) &
    (filtered_results["featuriser"].isin(deep_learning_featurizers)) &
    (filtered_results["tokeniser"].isin(deep_learning_tokenizers))
]

z = deep_learning_results.sort_values(by=metric).reset_index(drop=True)
logger.info(deep_learning_results.shape)
fig = px.box(deep_learning_results, y=metric, color="model")
fig.add_hline(y=deep_learning_results[metric].mean(), line_dash="dash", line_color="gray")
fig.show()

2025-03-13 16:59:07,336 - analysis - INFO - (112, 24)


In [291]:
logger.info(f"Best deep learners varied over preprocessers:")
best_subset = deep_learning_results.head(5)
display(best_subset[["preprocessers", metric]])

logger.info(f"Worst deep learners varied over preprocessers:")
worst_subset = deep_learning_results.tail(5)
display(worst_subset[["preprocessers", metric]])

2025-03-13 16:59:07,401 - analysis - INFO - Best deep learners varied over preprocessers:


Unnamed: 0,preprocessers,rmse
0,[no_stopwords_nltk],334672.924494
1,[empty],337746.866655
2,"[lemmatize, lowercase]",339429.741674
3,"[lemmatize, lowercase]",343361.329935
4,[no_punctuation],346313.463233


2025-03-13 16:59:07,414 - analysis - INFO - Worst deep learners varied over preprocessers:


Unnamed: 0,preprocessers,rmse
456,[lowercase],447711.804419
462,"[lowercase, no_punctuation, stem]",448440.049344
474,"[lowercase, no_punctuation, stem]",448858.488372
506,"[no_stopwords_spacy, stem]",453542.935731
507,"[no_stopwords_spacy, stem]",453551.932735


### Look at with and without preprocesser

In [None]:
no_preprocesser_idx = deep_learning_results["preprocessers"].apply(lambda x: x==["empty"])
no_preprocessers_subset = deep_learning_results[no_preprocesser_idx]
with_preprocessers_subset = deep_learning_results[~no_preprocesser_idx]
no_preprocessers_subset["model_type"] = "w/o Preprocessers"
with_preprocessers_subset["model_type"] = "w/ Preprocessers"
joint = pd.concat([no_preprocessers_subset, with_preprocessers_subset])

color_order = sorted(joint["model"].unique())

fig = px.box(
    joint,
    x="model_type",
    y=metric,
    # color="model",
    points="outliers",
    category_orders={"model": color_order}
)
fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="gray")

fig.write_image(
    OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_deep_learners_w_wout_preprocessers.{image_ext}",
    height=SAVE_HEIGHT,
    width=SAVE_WIDTH,
    scale=2
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
p_value, observed_diff = permutation_test(with_preprocessers_subset[metric], no_preprocessers_subset[metric])

logger.info(f"\nPermutation Test Results:")
logger.info(f"Observed difference in means (with - without): {observed_diff:.2f}; n = {len(with_preprocessers_subset)}, m = {len(no_preprocessers_subset)}")
logger.info(f"P-value: {p_value:.4f}")
logger.info(f"Statistically {'different' if p_value < alpha else 'not different'} at {alpha} significance")


2025-03-13 16:59:09,116 - analysis - INFO - 
Permutation Test Results:
2025-03-13 16:59:09,118 - analysis - INFO - Observed difference in means (with - without): 18753.18; n = 72, m = 40
2025-03-13 16:59:09,119 - analysis - INFO - P-value: 0.0031
2025-03-13 16:59:09,120 - analysis - INFO - Statistically different at 0.05 significance


Too difficult to draw meaningful conclusions from these results

## 'Safe' vs 'Unsafe' preprocessers

What about specifically preprocessers that do and dont change the token distribution?

In [294]:
# Remind of all preprocessers
all_preproc = set()
for preproc in filtered_results["preprocessers"].values:
    all_preproc.update(preproc)
all_preproc = list(all_preproc)
all_preproc

['lowercase',
 'no_punctuation',
 'no_stopwords_spacy',
 'stem',
 'no_stopwords_nltk',
 'empty',
 'lemmatize']

In [295]:
safe_preproc = ["empty", "no_stopwords_spacy", "lowercase", "no_stopwords_nltk", "no_punctuation", "lemmatize"]
unsafe_preproc = [x for x in all_preproc if x not in safe_preproc]
safe_preproc, unsafe_preproc

(['empty',
  'no_stopwords_spacy',
  'lowercase',
  'no_stopwords_nltk',
  'no_punctuation',
  'lemmatize'],
 ['stem'])

In [None]:

# Condition on unsafe preprocessers
unsafe_results = filtered_results[filtered_results["preprocessers"].apply(
    lambda x: any(y in unsafe_preproc for y in x)
)]

# Condition on safe preprocessers
safe_results = filtered_results[filtered_results["preprocessers"].apply(
    lambda x: all(y in safe_preproc for y in x)
)]

# Create boxplot comparing safe vs unsafe preprocessers
fig = px.box(
    pd.concat([
        safe_results.assign(preproc_type=f"Safe"),
        unsafe_results.assign(preproc_type=f"Unsafe ({unsafe_preproc})")
    ]),
    x="preproc_type",
    y=metric,
    points="outliers",
    color="model",
    title="Performance with Safe vs Unsafe Preprocessors"
)

fig.update_layout(
    xaxis_title="Preprocessor Type",
    yaxis_title=metric.replace("_", " ").title(),
    showlegend=True,
    boxmode='group'
)

In [297]:

fig.write_image(
    OUTPUT_DIR / dataset / "1d_boxplots" / f"{config_str}_safe_vs_unsafe_preprocessers.{image_ext}",
    height=SAVE_HEIGHT,
    width=SAVE_WIDTH,
    scale=2
)

In [None]:
# Run statistical test
p_value, observed_diff = permutation_test(
    safe_results[metric], 
    unsafe_results[metric]
)

logger.info(f"\nPermutation Test Results for Safe vs Unsafe Preprocessors:")
logger.info(f"Observed difference in means (safe - unsafe): {observed_diff:.2f}; n = {len(safe_results)}, m = {len(unsafe_results)}")
logger.info(f"P-value: {p_value:.4f}")
logger.info(f"Statistically {'different' if p_value < alpha else 'not different'} at {alpha} significance")


2025-03-13 16:59:11,544 - analysis - INFO - 
Permutation Test Results for Safe vs Unsafe Preprocessors:
2025-03-13 16:59:11,545 - analysis - INFO - Observed difference in means (safe - unsafe): -9065.66; n = 424, m = 136
2025-03-13 16:59:11,546 - analysis - INFO - P-value: 0.0270
2025-03-13 16:59:11,546 - analysis - INFO - Statistically different at 0.05 significance


## Systematically, each preprocesser

In [None]:
# Create plots for each preprocesser showing performance with/without

for preprocesser in all_preproc:
    # Create mask for results that include this preprocesser
    uses_specifically = filtered_results['preprocessers'].apply(lambda x: preprocesser in x)
    
    # Split data
    with_prep = filtered_results[uses_specifically].copy()
    without_prep = filtered_results[~uses_specifically].copy()
    
    # Add labels
    with_prep['preprocesser_usage'] = f'With {preprocesser}'
    without_prep['preprocesser_usage'] = f'Without {preprocesser}'
    
    # Combine data
    plot_data = pd.concat([with_prep, without_prep])
    
    # Create plot
    fig = px.box(
        plot_data,
        x='preprocesser_usage',
        y=metric,
        color='model',
        points='outliers',
        title=f'Performance With vs Without {preprocesser}'
    )
    
    fig.update_layout(
        xaxis_title='Preprocesser Usage',
        yaxis_title=metric.replace('_', ' ').title(),
        showlegend=True,
        boxmode='group'
    )
    
    # Save plot
    plot_dir = OUTPUT_DIR / dataset / '1d_boxplots' / 'preprocessers'
    plot_dir.mkdir(parents=True, exist_ok=True)
    fig.write_image(
        plot_dir / f'{config_str}_with_without_{preprocesser}.{image_ext}',
        height=SAVE_HEIGHT,
        width=SAVE_WIDTH,
        scale=2
    )
    
    # Run statistical test
    p_value, observed_diff = permutation_test(
        with_prep[metric],
        without_prep[metric]
    )
    
    logger.info(f"\nPermutation Test Results for {preprocesser}:")
    logger.info(f"Observed difference in means (with - without): {observed_diff:.2f}; n = {len(with_prep)}, m = {len(without_prep)}")
    logger.info(f"P-value: {p_value:.4f}")
    logger.info(f"Statistically {'different' if p_value < alpha else 'not different'} at {alpha} significance")



2025-03-13 16:59:13,825 - analysis - INFO - 
Permutation Test Results for lowercase:
2025-03-13 16:59:13,826 - analysis - INFO - Observed difference in means (with - without): 167.54; n = 196, m = 364
2025-03-13 16:59:13,827 - analysis - INFO - P-value: 0.9642
2025-03-13 16:59:13,827 - analysis - INFO - Statistically not different at 0.05 significance
2025-03-13 16:59:16,072 - analysis - INFO - 
Permutation Test Results for no_punctuation:
2025-03-13 16:59:16,073 - analysis - INFO - Observed difference in means (with - without): -4662.55; n = 168, m = 392
2025-03-13 16:59:16,073 - analysis - INFO - P-value: 0.2292
2025-03-13 16:59:16,074 - analysis - INFO - Statistically not different at 0.05 significance
2025-03-13 16:59:18,331 - analysis - INFO - 
Permutation Test Results for no_stopwords_spacy:
2025-03-13 16:59:18,331 - analysis - INFO - Observed difference in means (with - without): 9235.40; n = 104, m = 456
2025-03-13 16:59:18,332 - analysis - INFO - P-value: 0.0408
2025-03-13 16:

Conclusion seems to be: don't use stemming; at least when looking at the model partitioned plots.

Stopword removal also works irrespective of pipeline?

# Vector embeddings for GBDTs

In [None]:
vector_embeddings = ["bert_cls", "bert_mean", "glove_mean"]
dl_models = ["mlp", "resnet", "deep_linear"]
dl_models = dl_models + [f"{x}_ft" for x in dl_models]
trad_nlp_with_dl_feat = filtered_results[
    (~filtered_results["model"].isin(dl_models)) &
    (filtered_results["featuriser"].isin(vector_embeddings))
]
trad_nlp_without_dl_feat = filtered_results[
    (~filtered_results["model"].isin(dl_models)) &
    (~filtered_results["featuriser"].isin(vector_embeddings))
]
trad_nlp_with_dl_feat = trad_nlp_with_dl_feat.sort_values(by=metric).reset_index(drop=True)
trad_nlp_without_dl_feat = trad_nlp_without_dl_feat.sort_values(by=metric).reset_index(drop=True)

w_embedding_label = "w/ Vector Embeddings"
wout_embedding_label = "w/o Vector Embeddings"
trad_nlp_with_dl_feat["model_type"] = w_embedding_label
trad_nlp_without_dl_feat["model_type"] = wout_embedding_label
joint = pd.concat([trad_nlp_with_dl_feat, trad_nlp_without_dl_feat])
# Get unique models from both dataframes
all_models = pd.concat([
    trad_nlp_with_dl_feat['model'], 
    trad_nlp_without_dl_feat['model']
]).unique()

# Create color mapping using plotly's default color sequence
from plotly.colors import qualitative
def iqr_threshold(series):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    return q3 + 1.5 * (q3 - q1) 
model_colors = {
    model: qualitative.Set1[i] for i, model in enumerate(sorted(all_models))
}
model_order = sorted(all_models)

save_name = f"{config_str}_gbdt_w_wout_vec_embeddings"

to_plot = joint
fig = px.box(
    to_plot,
    x="model_type",
    y=metric,
    color="model",
    points="outliers",
    color_discrete_map=model_colors,  # Add this line,
    category_orders={"model": model_order, "model_type": [w_embedding_label, wout_embedding_label]}
)
# fig.update_layout(title="Traditional NLP w/ and w/out vector embeddings")
fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="gray")

fig.write_image(
    OUTPUT_DIR / dataset / "1d_boxplots" / f"{save_name}.{image_ext}",
    height=SAVE_HEIGHT,
    width=SAVE_WIDTH,
    scale=2
)

# Replicate figure without outliers for easier reading

save_name += "_no_outliers"
threshold = iqr_threshold(joint[metric])
k = 10
without_worst_k = joint.sort_values(by=metric).head(len(joint)-k)
to_plot = without_worst_k
fig = px.box(
    to_plot,
    x="model_type",
    y=metric,
    color="model",
    points="outliers",
    color_discrete_map=model_colors,  # Add this line,
    category_orders={"model": model_order, "model_type": [w_embedding_label, wout_embedding_label]}
)
# fig.update_layout(title="Traditional NLP w/ and w/out vector embeddings")
fig.add_hline(y=perf_of_mean, line_dash="dash", line_color="gray")

fig.write_image(
    OUTPUT_DIR / dataset / "1d_boxplots" / f"{save_name}.{image_ext}",
    height=SAVE_HEIGHT,
    width=SAVE_WIDTH,
    scale=2
)
print(f"Saved to {OUTPUT_DIR / dataset / '1d_boxplots' / f'{save_name}.{image_ext}'}")
print(f"Model order: {model_order}")

Saved to notebook_output/online_boat_listings/1d_boxplots/rmse_only_4_replicates_gbdt_w_wout_vec_embeddings_no_outliers.png
Model order: ['catboost', 'xgboost']


In [301]:
# Get scores for each group
with_vec = trad_nlp_with_dl_feat[metric].values
without_vec = trad_nlp_without_dl_feat[metric].values

# Perform permutation test
p_value, diff = permutation_test(without_vec, with_vec)
comparison_df = pd.DataFrame({
    'dataset': [dataset_name_map[dataset]],
    'mean_difference (BoW - Vector)': [diff],
    'p_value': [p_value]
})

latex_table = comparison_df.to_latex(
    index=False,
    float_format=lambda x: f"{x:.3f}"
)
latex_table = process_latex_table(latex_table)
with open(table_dir / f"{config_str}_compare_gbdt_w_wout_vec_embeddings.tex", "w") as f:
    f.write(latex_table)


# Should we choose CLS or Mean?

In [302]:
# Extract RMSE values for CLS and MEAN
cls_rmse = filtered_results[filtered_results['featuriser'] == 'bert_cls'][metric]
mean_rmse = filtered_results[filtered_results['featuriser'] == 'bert_mean'][metric]

# Perform Mann-Whitney U test since we can't assume normality
statistic, p_value = stats.mannwhitneyu(cls_rmse, mean_rmse, alternative='two-sided')

logger.info(f"Mann-Whitney U test results:")
logger.info(f"Statistic: {statistic}")
logger.info(f"p-value: {p_value}")
if p_value < alpha:
    logger.info(f"There is a difference between BERT CLS and BERT MEAN at {alpha} significance")
else:
    logger.info(f"No difference between BERT CLS and BERT MEAN at {alpha} significance")

# Create box plot comparing distributions
fig = go.Figure()

fig.add_trace(go.Box(
    y=cls_rmse,
    name='BERT CLS',
    boxpoints='all',
    jitter=0.3,
    pointpos=-1.8
))

fig.add_trace(go.Box(
    y=mean_rmse,
    name='BERT MEAN', 
    boxpoints='all',
    jitter=0.3,
    pointpos=-1.8
))

fig.update_layout(
    title='RMSE Distribution: BERT CLS vs MEAN',
    yaxis_title='RMSE',
    showlegend=True
)

fig.show()


2025-03-13 16:59:30,600 - analysis - INFO - Mann-Whitney U test results:
2025-03-13 16:59:30,602 - analysis - INFO - Statistic: 3860.0
2025-03-13 16:59:30,603 - analysis - INFO - p-value: 0.9917286754498297
2025-03-13 16:59:30,605 - analysis - INFO - No difference between BERT CLS and BERT MEAN at 0.05 significance


# Can we look at spread associated with choosing a model?

In [303]:
to_analyze = filtered_results.copy()
nlp_steps = ["preprocesser(s)", "tokeniser", "featuriser"]
to_analyze["nlp_hash"] = to_analyze[nlp_steps].apply(lambda x: "_".join(x), axis=1)

## Expected Variance/Standard deviation

In [349]:
from itertools import chain
def expected_agg_over_model(aggregate, df, verbose=False):
    """ 
    Gets V(Model)
    """
    sub_pipeline_cols = ["preprocesser(s)", "tokeniser", "featuriser"]
    # First remove variance from choosing replicate
    hashable_replicates = df.copy()
    hashable_replicates = hashable_replicates.groupby(sub_pipeline_cols+["model"]).agg(
        {metric: "mean", "seed": "count"}
    ).reset_index()
    hashable_replicates["upstream_hash"] = hashable_replicates[sub_pipeline_cols].apply(lambda x: "_".join(x), axis=1)
    # Find every pipeline that has just the model different at least once
    replicate_idx = hashable_replicates.duplicated(subset="upstream_hash", keep=False)
    aggregate_dict = {
        metric: aggregate
    }
    if verbose:
        aggregate_dict["model"] = lambda row: ",".join(row.values)
    variance_from_choosing_model = hashable_replicates[replicate_idx].groupby("upstream_hash").agg(
        aggregate_dict
    )
    result = variance_from_choosing_model.reset_index()
    expected_variance_from_choosing_model = variance_from_choosing_model[metric].mean()
    logger.info(f"Expected {aggregate} from choosing model: {expected_variance_from_choosing_model:.2f}")
    return result, expected_variance_from_choosing_model

In [364]:
def expected_agg_over_step(aggregate, df, step="model"):
    # Find every pipeline that has a fixed model but varying pipeline
    # We guarantee that each model has duplicates in the dataset
    to_analyze = df.copy()
    to_analyze = to_analyze.groupby(["preprocesser(s)", "tokeniser", "featuriser", "model"]).agg(
        {metric: "mean", "seed": "count"}
    ).reset_index()
    variance_per_model = to_analyze.groupby(step).agg(
        {metric: aggregate, "seed": "count"}
    )
    result = variance_per_model.reset_index()
    variance_marginalised_over_model = variance_per_model[metric].mean()
    logger.info(f"Expected {aggregate} from choosing everything else marginalised over {step}: {variance_marginalised_over_model:.2f}")
    return result, variance_marginalised_over_model

In [365]:
def expected_agg_over_replicate(aggregate, df):
    pipeline_cols = ["preprocesser(s)", "tokeniser", "featuriser", "model"]
    hashable_replicates = df.copy()
    hashable_replicates["hash"] = hashable_replicates[pipeline_cols].apply(lambda x: "_".join(x), axis=1)
    replicate_idx = hashable_replicates.duplicated(subset="hash", keep=False)
    agg_per_replicate = hashable_replicates[replicate_idx].groupby("hash").agg(
        {metric: aggregate}
    ).reset_index()

    result = agg_per_replicate
    agg_marginalised_over_replicate = agg_per_replicate[metric].mean()
    logger.info(f"Expected {aggregate} from choosing replicate: {agg_marginalised_over_replicate:.2f}")
    return result, agg_marginalised_over_replicate


In [None]:
agg = "var"

to_analyze = filtered_results
per_model, overall_model = expected_agg_over_model(agg, to_analyze)
per_upstream, overall_upstream = expected_agg_over_step(agg, to_analyze)
# _ = expected_agg_over_step("std", step="featuriser")
# _ = expected_agg_over_step("std", step="tokeniser")
per_replicate, overall_replicate = expected_agg_over_replicate(agg, to_analyze)
if agg == "var":
    logger.info(f"Overall variance: {filtered_results[metric].var():.2f}")
elif agg == "std":
    logger.info(f"Overall standard deviation: {filtered_results[metric].std():.2f}")

# Create LaTeX table comparing variances
agg_map = {
    "var": "V",
    "std": "S"
}

latex_table = (
    "\\begin{tabular}{lrr}\n"
    "\\toprule\n"
    f"Dataset & $ \\mathbb{{E}}_\\text{{model}}[{agg_map[agg]}]$ & $ \\mathbb{{E}}_\\text{{upstream}}[{agg_map[agg]}]$ \\\\\n"
    "\\midrule\n"
    f"{dataset_name_map[dataset]} & {overall_model:.1e} & {overall_upstream:.1e} \\\\\n"
    "\\bottomrule\n"
    "\\end{tabular}\n"
)
with open(table_dir / f"{config_str}_variance_comparison.tex", "w") as f:
    f.write(latex_table)

print(latex_table)



2025-03-13 18:02:51,871 - analysis - INFO - Expected var from choosing model: 1576752499.94
2025-03-13 18:02:51,881 - analysis - INFO - Expected var from choosing everything else marginalised over model: 433746406.96
2025-03-13 18:02:51,890 - analysis - INFO - Expected var from choosing replicate: 792789611.18
2025-03-13 18:02:51,892 - analysis - INFO - Overall variance: 1742233901.41


\begin{tabular}{lrr}
\toprule
Dataset & $ \mathbb{E}_\text{model}[V]$ & $ \mathbb{E}_\text{upstream}[V]$ \\
\midrule
obl & 1.6e+09 & 4.3e+08 \\
\bottomrule
\end{tabular}



In [367]:
# Create DataFrame comparing variances
variance_df = pd.DataFrame({
    'Dataset': [dataset_name_map[dataset]],
    'var_from_model': [overall_model],
    'var_from_upstream': [overall_upstream], 
    'var_from_replicates': [overall_replicate]
})
print(variance_df.to_latex(
    index=False,
    float_format=lambda x: f"{x:.0e}"
))

\begin{tabular}{lrrr}
\toprule
Dataset & var_from_model & var_from_upstream & var_from_replicates \\
\midrule
obl & 2e+09 & 4e+08 & 8e+08 \\
\bottomrule
\end{tabular}



In [379]:
print(f"Pipeline groups for V(Model), n = {per_model.shape[0]}")
print(f"Model groups for V(Upstream), n = {per_upstream.shape[0]}")
print(f"Replicate groups for V(Monte Carlo), n = {per_replicate.shape[0]}")

Pipeline groups for V(Model), n = 20
Model groups for V(Upstream), n = 8
Replicate groups for V(Monte Carlo), n = 140


## Partitioning on finetuning

In [369]:
ft_selector = to_analyze["model"].str.contains("_ft")
using_ft, not_using_ft = to_analyze[ft_selector], to_analyze[~ft_selector]

print(f"Sample size using ft {using_ft.shape}")
print(f"Sample size not using ft {not_using_ft.shape}")

Sample size using ft (52, 24)
Sample size not using ft (508, 24)


In [None]:

using_ft_V_per_model, using_ft_V_model = expected_agg_over_model(agg, using_ft, verbose=True)
not_using_ft_V_per_model, not_using_ft_V_model = expected_agg_over_model(agg, not_using_ft, verbose=True)

display(using_ft_V_per_model)
print(f"Not finetuned has {len(not_using_ft_V_per_model)} samples compared to the above {len(using_ft_V_per_model)}")

2025-03-13 18:13:59,153 - analysis - INFO - Expected var from choosing model: 221107158.31
2025-03-13 18:13:59,167 - analysis - INFO - Expected var from choosing model: 1065677282.93


Unnamed: 0,upstream_hash,rmse,model
0,empty_bert_bert_cls,269978400.0,"deep_linear_ft,mlp_ft,resnet_ft"
1,empty_bert_bert_mean,172236000.0,"deep_linear_ft,mlp_ft,resnet_ft"


Not finetuned has 17 samples compared to the above 2


In [381]:
using_ft_V_per_upstream, using_ft_V_upstream = expected_agg_over_step(agg, using_ft)
not_using_ft_V_per_upstream, not_using_ft_V_upstream = expected_agg_over_step(agg, not_using_ft)

print("Showing the number of upstream pipelines used per modelling approach")
print("With finetuning")
display(using_ft_V_per_upstream)
print("Without finetuning")
display(not_using_ft_V_per_upstream)

2025-03-13 18:17:10,584 - analysis - INFO - Expected var from choosing everything else marginalised over model: 90476285.40
2025-03-13 18:17:10,593 - analysis - INFO - Expected var from choosing everything else marginalised over model: 639708479.90


Showing the number of upstream pipelines used per modelling approach
With finetuning


Unnamed: 0,model,rmse,seed
0,deep_linear_ft,43775170.0,5
1,mlp_ft,71916580.0,3
2,resnet_ft,155737100.0,5


Without finetuning


Unnamed: 0,model,rmse,seed
0,catboost,12282540.0,26
1,deep_linear,511683400.0,28
2,mlp,996954700.0,31
3,resnet,114071800.0,18
4,xgboost,1563550000.0,24


In [382]:

using_ft_V_per_replicate, using_ft_V_replicate = expected_agg_over_replicate(agg, using_ft)
not_using_ft_V_per_replicate, not_using_ft_V_replicate = expected_agg_over_replicate(agg, not_using_ft)

ft_result = f"""
Using finetuning:
V(model): {using_ft_V_model:.1e}
V(upstream): {using_ft_V_upstream:.1e}
V(replicate): {using_ft_V_replicate:.1e}
"""
not_ft_result = f"""
Not using finetuning:
V(model): {not_using_ft_V_model:.1e}
V(upstream): {not_using_ft_V_upstream:.1e}
V(replicate): {not_using_ft_V_replicate:.1e}
"""

print(ft_result)
print(not_ft_result)

with open(table_dir / f"{config_str}_ft_variance_comparison.txt", "w") as f:
    f.write(ft_result)
    f.write(not_ft_result)




2025-03-13 18:17:20,297 - analysis - INFO - Expected var from choosing replicate: 323495859.45
2025-03-13 18:17:20,315 - analysis - INFO - Expected var from choosing replicate: 840827554.27



Using finetuning:
V(model): 2.2e+08
V(upstream): 9.0e+07
V(replicate): 3.2e+08


Not using finetuning:
V(model): 1.1e+09
V(upstream): 6.4e+08
V(replicate): 8.4e+08



# Error improvement?

In [None]:
class CustomReprFunc:

    def __init__(self, f, custom_repr):
        self.f = f
        self.custom_repr = custom_repr

    def __call__(self, *args, **kwargs):
        return self.f(*args, **kwargs)

    def __repr__(self):
        return self.custom_repr(self.f)


def set_repr(custom_repr):
    def set_repr_decorator(f):
        return CustomReprFunc(f, custom_repr)
    return set_repr_decorator

@set_repr(lambda f: f.__name__)
def maximum_improvement(series):
    lowest_metric = series.min()
    improvement = series - lowest_metric
    return improvement.mean() # Mean == expected value

_ = expected_agg_over_model(maximum_improvement, df=to_analyze)
_ = expected_agg_over_step(maximum_improvement, df=to_analyze)
_ = expected_agg_over_replicate(maximum_improvement, df=to_analyze)


2025-03-13 18:04:35,865 - analysis - INFO - Expected maximum_improvement from choosing model: 22960.54


2025-03-13 18:04:35,877 - analysis - INFO - Expected maximum_improvement from choosing everything else marginalised over model: 24559.40
2025-03-13 18:04:35,919 - analysis - INFO - Expected maximum_improvement from choosing replicate: 11243.97
