In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import illoominate
import os
import random

In [14]:
# Recommender configuration shared by every evaluation in this notebook.
model = "vmis"
metric = "mrr@20"
params = {"m": 200, "k": 200}

# Directory holding the train/valid/test splits and earlier experiment outputs.
location='../data/24410_private'

# The raw files use CamelCase headers; map them to snake_case column names.
_COLUMN_RENAMES = {'SessionId':'session_id','ItemId':'item_id','Time':'timestamp'}


def _load_split(split_name):
    """Read the tab-separated `<split_name>.csv` under `location` and rename its columns."""
    raw_split = pd.read_csv(f"{location}/{split_name}.csv", sep='\t')
    return raw_split.rename(columns=_COLUMN_RENAMES)


train_df = _load_split("train")
val_df = _load_split("valid")
test_df = _load_split("test")

# Results of earlier removal experiments; only its `seed` column is reused later.
# NOTE(review): the file name says mrr@21 while `metric` above is "mrr@20" --
# confirm this is the intended results file.
# NOTE(review): passing `names=` makes pandas treat the first line as data; if the
# file carries its own header row it will show up as a record -- confirm.
_removal_columns = ["experiment_name", "seed", "valid_metric_score", "test_metric_score", "num_sessions_removed"]
existing_removal_data = pd.read_csv(
    f"{location}/__removal_impact_results_importance_mrr@21_eval_mrr@21.csv",
    names=_removal_columns,
)
existing_removal_data

Unnamed: 0,experiment_name,seed,valid_metric_score,test_metric_score,num_sessions_removed
0,important_first_loo,1313,0.2634,0.2634,0
1,important_first_loo,1313,0.2628,0.2631,251
2,important_first_loo,1313,0.2623,0.2629,502
3,important_first_loo,1313,0.2617,0.2628,753
4,important_first_loo,1313,0.2617,0.2631,1004
...,...,...,...,...,...
3760,random,12345,0.2617,0.2620,189666
3761,random,12345,0.2612,0.2615,190437
3762,random,12345,0.2616,0.2618,191208
3763,random,12345,0.2611,0.2613,191979


In [15]:
# NOTE: despite its name, `n_seeds` holds the distinct seed values themselves
# (it is iterated over by the experiment loop), not a count.
n_seeds = existing_removal_data["seed"].unique()
print(f"Number of seeds: {len(n_seeds)}")

# === Constants ===
session_ids = train_df["session_id"].unique()
total_sessions = len(session_ids)
# Remove up to 40% of the training sessions, spread evenly over `n_steps` steps.
target_sessions_removed = int(0.40 * total_sessions)
n_steps = 100
# Integer division: n_steps * sessions_per_step can fall slightly short of the
# target, hence the "~" in the message below.
sessions_per_step = target_sessions_removed // n_steps

print(f"Will remove {sessions_per_step} sessions per step for {n_steps} steps (total ~{target_sessions_removed})")

# === Helper function ===
def evaluate(filtered_train):
    """Score a model trained on `filtered_train` against the test and validation sets.

    Calls `illoominate.train_and_evaluate_for_sbr` once per evaluation frame
    (test first, then validation) using the notebook-level `model`, `metric`
    and `params`, and takes the first entry of the result's 'score' field.

    Returns:
        A `(test_score, val_score)` tuple -- note that the test score comes first.
    """
    def _score_against(eval_df):
        outcome = illoominate.train_and_evaluate_for_sbr(
            filtered_train, eval_df, model, metric, params
        )
        return outcome['score'][0]

    test_score = _score_against(test_df)
    val_score = _score_against(val_df)
    return test_score, val_score

# === Run experiments ===
# One record (dict) per successfully evaluated (seed, method, step) combination.
all_results = []

# Session lengths and both removal orders depend only on `train_df`, not on the
# seed, so they are computed once instead of once per seed.
session_lengths = train_df.groupby("session_id")["item_id"].count()
sorted_sessions_longest = session_lengths.sort_values(ascending=False).index.tolist()
sorted_sessions_shortest = session_lengths.sort_values(ascending=True).index.tolist()

removal_orders = [
    ("remove_longest_sessions", sorted_sessions_longest),
    ("remove_shortest_sessions", sorted_sessions_shortest),
]

for seed in n_seeds:
    # NOTE(review): the removal order above is deterministic, so the seed only
    # matters if the evaluation itself draws from NumPy's global RNG -- confirm.
    np.random.seed(seed)

    for method_name, sorted_sessions in removal_orders:
        for step in range(n_steps + 1):  # Include step 0 => no removal
            # Removal is cumulative: step k drops the first k * sessions_per_step
            # sessions of the chosen ordering.
            total_to_remove = step * sessions_per_step
            removed_sessions = sorted_sessions[:total_to_remove]
            filtered_train = train_df[~train_df["session_id"].isin(removed_sessions)]

            try:
                test_mrr, val_mrr = evaluate(filtered_train)

                all_results.append({
                    "experiment_name": method_name,
                    "seed": seed,
                    "valid_metric_score": val_mrr,
                    "test_metric_score": test_mrr,
                    "num_sessions_removed": len(removed_sessions)
                })
            except Exception as e:
                # Best effort: report the failed step and continue, so a single
                # failure does not discard the rest of the sweep.
                print(f"Error in seed {seed}, step {step}, method {method_name}: {e}")

Number of seeds: 3
Will remove 1574 sessions per step for 100 steps (total ~157462)


In [16]:
# Gather the per-step records into a single frame and write it next to the
# input data, without the positional index column.
results_csv_path = f"{location}/session_length_removal_experiments.csv"
results_df = pd.DataFrame(all_results)
results_df.to_csv(results_csv_path, index=False)