In [1]:
import os
import tqdm
import wandb
import warnings
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import concurrent.futures

# Quieten noisy dependencies before they are imported/used:
# - TF_CPP_MIN_LOG_LEVEL='3' suppresses TensorFlow logging;
# - UserWarnings raised from google.protobuf modules are ignored.
os.environ.update(TF_CPP_MIN_LOG_LEVEL='3')
warnings.filterwarnings(action='ignore', category=UserWarning, module='google.protobuf')

from matplotlib.axes import Axes
from wandb.apis.public import Run

from typing import Union, List, Dict
from src.visualization import set_themes

# Weights & Biases API client, used for fetching run data.
api = wandb.Api()

# Apply the project's custom plot themes.
set_themes()

# Display at most 20 rows when a Polars table is rendered.
pl.Config.set_tbl_rows(20)

# Opt in to the pandas 'future.no_silent_downcasting' behaviour.
pd.set_option('future.no_silent_downcasting', True)

failed to send, dropping 5 traces to intake at http://localhost:8126/v0.5/traces after 3 retries
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: [wandb.Api()] Loaded credentials for https://api.wandb.ai from /home/hafidh_rendyanto/.netrc.


In [2]:
# Notebook-level settings:
#   model                    -- value matched against `config.model` when listing runs,
#                               and the sub-directory of ./models/ searched for local files.
#   ensure_available_locally -- when True, runs without local model files are filtered out.
config = dict(
    model="matrix_factorization",
    ensure_available_locally=False,
)

# Metric name -> weight. The per-epoch score used to pick each run's best
# epoch is the weighted sum of these history columns.
sorting_criterion = {"epoch/test_hitrate@50": 0.5, "epoch/test_ndcg@50": 0.25}

def fetch_run_metadata(run: "Run", considered_metrics: Union[str, Dict[str, float]] = "epoch/epoch") -> Dict:
    """Collect the config, logged history and best-epoch metrics of one W&B run.

    A per-row score is computed over ``run.history()`` and the row with the
    highest score is reported under ``best:<column>`` keys.

    Args:
        run: The W&B run to summarise.
        considered_metrics: Either the name of a history column used directly
            as the score, or a mapping ``{column: weight}`` whose weighted sum
            is the score.

    Returns:
        A flat dict containing ``run_id``, ``run_name``, ``sweep_id``,
        ``model``, every config entry (lists/dicts stringified), one list per
        history column (including the computed ``score``), the ``best:*``
        entries, ``gpu_type`` and ``cpu_count``.

        The ``best:*`` entries are omitted when the run has no row with a
        non-NaN score (for example an empty history). ``gpu_type`` and
        ``cpu_count`` are ``None`` when the run has no metadata.

    Raises:
        ValueError: If ``considered_metrics`` is neither a string nor a dict.
    """
    # Validate before doing any network work.
    if not isinstance(considered_metrics, (str, dict)):
        raise ValueError("considered_metrics must be either a string or a dictionary")

    # Convert lists and dicts to strings so every config value is a scalar.
    run_config = {
        key: str(value) if isinstance(value, (list, dict)) else value
        for key, value in run.config.items()
    }

    run_history = run.history()
    # Non-finite values arrive as the strings "Infinity" / "NaN".
    run_history = run_history.replace({"Infinity": np.inf, "NaN": np.nan})

    def metric_values(metric: str) -> pd.Series:
        """Return one history column as floats; all-NaN if the run never logged it."""
        if metric not in run_history.columns:
            warnings.warn(
                f"Run {run.id} has no history column {metric!r}; treating it as NaN.",
                stacklevel=2,
            )
            return pd.Series(np.nan, index=run_history.index, dtype="float64")
        # Columns that held "Infinity"/"NaN" strings may still be object dtype,
        # which pandas refuses to argmax; force a numeric dtype.
        return pd.to_numeric(run_history[metric], errors="coerce")

    if isinstance(considered_metrics, str):
        run_history["score"] = metric_values(considered_metrics)
    else:
        run_history["score"] = sum(
            metric_values(metric) * weight for metric, weight in considered_metrics.items()
        )

    # argmax() skips NaN but fails when there is nothing left to compare,
    # so only look for a best row when at least one score is usable.
    best_summary = {}
    scores = run_history["score"]
    if scores.notna().any():
        best_row = run_history.iloc[int(scores.argmax())]
        best_summary = {f"best:{key}": val for key, val in best_row.items()}

    # wandb returns None for `metadata` when the metadata file is unavailable.
    run_metadata = run.metadata or {}

    return {
        "run_id": run.id,
        "run_name": run.name,
        "sweep_id": run.sweep.id if run.sweep else None,
        "model": run.config.get("model"),
        **run_config,
        **{metric: run_history[metric].to_list() for metric in run_history},
        **best_summary,
        "gpu_type": run_metadata.get("gpu"),
        "cpu_count": run_metadata.get("cpu_count"),
    }

# Fetch the metadata of every matching run concurrently.
batch_size = 8  # number of worker threads, i.e. runs fetched in parallel
records = []
runs: List[Run] = api.runs("feedr/peppermint-matrix", per_page=15, filters={"config.model": config["model"]})

# The context manager shuts the pool down (joining its threads) even when a
# fetch fails. `max_workers` already bounds the concurrency, so all tasks can
# be submitted up front and consumed as they finish instead of polling.
with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
    futures = {
        executor.submit(fetch_run_metadata, run, sorting_criterion): run
        for run in runs
    }
    try:
        # Size the bar from the submitted tasks rather than len(runs), so the
        # loop ends correctly even if the reported run count is off.
        with tqdm.tqdm(total=len(futures), ncols=128) as pbar:
            for finished_future in concurrent.futures.as_completed(futures):
                # .result() re-raises any exception from the worker thread.
                records.append(finished_future.result())
                pbar.update(1)
    except BaseException:
        # Drop the tasks that have not started yet so that leaving the
        # `with` block does not wait for the whole remaining queue.
        for pending_future in futures:
            pending_future.cancel()
        raise

# Create a Polars DataFrame from the records
# (infer_schema_length=None scans every record to infer the schema).
experiment_runs = pl.DataFrame(records, infer_schema_length=None)
    
# Tag a run as available locally if its model files exist on disk.
# Expected layout: ./models/<model>/<sweep_id>/<run_id>/
models_root = f"./models/{config['model']}/"
local_run_ids = []
# A missing models directory simply means that no run is available locally.
local_sweep_ids = os.listdir(models_root) if os.path.isdir(models_root) else []
for sweep_id in local_sweep_ids:
    sweep_dir = os.path.join(models_root, sweep_id)
    # Skip stray files (e.g. .DS_Store) that are not sweep directories.
    if os.path.isdir(sweep_dir):
        local_run_ids.extend(os.listdir(sweep_dir))

experiment_runs = experiment_runs.with_columns(
    available_locally=pl.col("run_id").is_in(local_run_ids)
)

if config["ensure_available_locally"]:
    experiment_runs = experiment_runs.filter(pl.col("available_locally"))

# Order the runs by `_timestamp` (ascending) and derive each run's duration
# from the largest `_runtime` value in its history, in seconds and minutes.
longest_runtime = pl.col("_runtime").list.max()
experiment_runs = (
    experiment_runs
    .sort("_timestamp", descending=False)
    .with_columns(
        run_duration_second=longest_runtime,
        run_duration_minute=(longest_runtime / 60),
    )
)

# Compact overview: identifiers, key hyper-parameters and best-epoch metrics.
overview_columns = [
    "run_id",
    "run_name",
    "sweep_id",
    "model",
    "embedding_dimension",
    "shuffle",
    "best:epoch/epoch",
    "best:epoch/train_loss",
    "best:epoch/test_loss",
    "best:epoch/test_recall@10",
    "best:epoch/test_ndcg@10",
]
experiment_runs.select(overview_columns)

failed to send, dropping 1 traces to intake at http://localhost:8126/v0.5/traces after 3 retries, 2 additional messages skipped
 37%|█████████████████████████████████▏                                                       | 109/292 [00:44<00:59,  3.06it/s]failed to send, dropping 5 traces to intake at http://localhost:8126/v0.5/traces after 3 retries, 46 additional messages skipped
 90%|████████████████████████████████████████████████████████████████████████████████▏        | 263/292 [01:46<00:09,  3.11it/s]failed to send, dropping 15 traces to intake at http://localhost:8126/v0.5/traces after 3 retries, 52 additional messages skipped
100%|█████████████████████████████████████████████████████████████████████████████████████████| 292/292 [01:55<00:00,  2.52it/s]


run_id,run_name,sweep_id,model,embedding_dimension,shuffle,best:epoch/epoch,best:epoch/train_loss,best:epoch/test_loss,best:epoch/test_recall@10,best:epoch/test_ndcg@10
str,str,str,str,i64,bool,f64,f64,f64,f64,f64
"""sm4faeql""","""electric-sweep-1""","""pjxpg4bb""","""matrix_factorization""",16,false,51.0,0.044853,0.223118,0.022528,0.087484
"""mhh2vww3""","""solar-sweep-3""","""pjxpg4bb""","""matrix_factorization""",4,true,46.0,0.092834,0.20148,0.019505,0.076231
"""im0wjtep""","""lively-sweep-2""","""pjxpg4bb""","""matrix_factorization""",32,true,26.0,0.039394,0.196034,0.022696,0.088301
"""yszchdsw""","""laced-sweep-4""","""pjxpg4bb""","""matrix_factorization""",32,true,33.0,0.035967,0.219474,0.022535,0.086142
"""gkt242cc""","""clean-sweep-5""","""pjxpg4bb""","""matrix_factorization""",256,true,11.0,0.027117,0.208644,0.020824,0.082289
"""yvji2dcx""","""rose-sweep-6""","""pjxpg4bb""","""matrix_factorization""",1024,false,1.0,0.2362,0.242782,0.023828,0.091777
"""n9r3cmud""","""warm-sweep-7""","""pjxpg4bb""","""matrix_factorization""",128,false,7.0,0.040676,0.183099,0.022164,0.085856
"""eg5qej8g""","""twilight-sweep-8""","""pjxpg4bb""","""matrix_factorization""",256,true,14.0,0.022493,0.22761,0.02086,0.082151
"""tw2l0zlb""","""fallen-sweep-9""","""pjxpg4bb""","""matrix_factorization""",1024,true,0.0,0.54582,0.310808,0.021982,0.084718
"""5upp90iz""","""visionary-sweep-10""","""pjxpg4bb""","""matrix_factorization""",512,false,1.0,0.334225,0.280716,0.024727,0.0927


# Parameter Comparison

## Embedding Dimension vs Regularization

In [20]:
# Per (embedding_dimension, l2_regularization) combination: number of runs
# and the mean of each best-epoch metric.
regularization_metrics = [
    "best:epoch/epoch",
    "best:epoch/test_recall@10",
    "best:epoch/test_ndcg@10",
    "best:epoch/test_recall@50",
    "best:epoch/test_ndcg@50",
]
experiment_summary = (
    experiment_runs
    .group_by("embedding_dimension", "l2_regularization")
    .agg(
        pl.col("run_id").count().alias("num_runs"),
        *[pl.col(metric).mean() for metric in regularization_metrics],
    )
    .sort("embedding_dimension", "l2_regularization")
)
experiment_summary

embedding_dimension,l2_regularization,num_runs,best:epoch/epoch,best:epoch/test_recall@10,best:epoch/test_ndcg@10,best:epoch/test_recall@50,best:epoch/test_ndcg@50
i64,f64,u32,f64,f64,f64,f64,f64
2,0.0,16,57.9375,0.014982,0.058917,0.054826,0.106436
2,0.00001,1,54.0,0.005822,0.024608,0.020867,0.042713
2,0.0001,1,9.0,0.002774,0.012851,0.008106,0.022531
2,0.001,3,61.333333,0.000812,0.004035,0.003367,0.008748
2,0.01,2,9.5,0.000507,0.002262,0.002856,0.006967
2,0.1,1,11.0,0.000992,0.004458,0.004238,0.010557
2,1.0,1,12.0,0.001274,0.004595,0.004151,0.010002
4,0.0,14,55.5,0.019447,0.075638,0.070553,0.131094
4,0.00001,1,63.0,0.012488,0.050913,0.045238,0.090212
4,0.001,2,61.5,0.000668,0.003475,0.002886,0.007831


In [None]:
# Number of runs for each embedding_dimension (rows) x l2_regularization (columns).
# `on=` replaces the deprecated `columns=` argument of DataFrame.pivot; the
# explicit index/values make a prior column sub-selection unnecessary.
experiment_summary.pivot(
    on="l2_regularization",
    index="embedding_dimension",
    values="num_runs",
)

  experiment_summary[["embedding_dimension", "l2_regularization", "num_runs"]].pivot(


embedding_dimension,0.0,0.00001,0.0001,0.001,0.01,0.1,1.0
i64,u32,u32,u32,u32,u32,u32,u32
2,16,1.0,1.0,3,2.0,1.0,1.0
4,14,1.0,,2,1.0,2.0,3.0
8,21,1.0,2.0,4,1.0,,1.0
16,21,2.0,4.0,1,2.0,2.0,
32,23,3.0,2.0,1,,1.0,4.0
64,16,,1.0,2,2.0,2.0,1.0
128,17,3.0,5.0,1,2.0,,1.0
256,21,,1.0,2,4.0,1.0,4.0
512,20,1.0,4.0,1,2.0,1.0,2.0
1024,17,1.0,4.0,2,2.0,1.0,4.0


failed to send, dropping 1 traces to intake at http://localhost:8126/v0.5/traces after 3 retries, 5 additional messages skipped


In [22]:
# Mean best epoch for each embedding_dimension (rows) x l2_regularization (columns).
# `on=` replaces the deprecated `columns=` argument of DataFrame.pivot; the
# explicit index/values make a prior column sub-selection unnecessary.
experiment_summary.pivot(
    on="l2_regularization",
    index="embedding_dimension",
    values="best:epoch/epoch",
)

  experiment_summary[["embedding_dimension", "l2_regularization", "best:epoch/epoch"]].pivot(


embedding_dimension,0.0,0.00001,0.0001,0.001,0.01,0.1,1.0
i64,f64,f64,f64,f64,f64,f64,f64
2,57.9375,54.0,9.0,61.333333,9.5,11.0,12.0
4,55.5,63.0,,61.5,10.0,10.5,11.0
8,54.47619,43.0,8.0,57.5,8.0,,10.0
16,46.0,43.5,6.0,63.0,9.0,10.0,
32,20.913043,45.0,9.0,61.0,,11.0,12.25
64,14.9375,,6.0,61.0,9.5,10.0,11.0
128,12.352941,61.0,7.2,62.0,9.0,,13.0
256,10.428571,,9.0,62.5,8.5,11.0,12.5
512,6.1,0.0,9.0,63.0,7.0,10.0,11.5
1024,2.529412,0.0,7.75,60.5,8.5,11.0,11.0


In [23]:
# Mean best-epoch test recall@50 for each embedding_dimension (rows) x
# l2_regularization (columns).
# `on=` replaces the deprecated `columns=` argument of DataFrame.pivot; the
# explicit index/values make a prior column sub-selection unnecessary.
experiment_summary.pivot(
    on="l2_regularization",
    index="embedding_dimension",
    values="best:epoch/test_recall@50",
)

  experiment_summary[["embedding_dimension", "l2_regularization", "best:epoch/test_recall@50"]].pivot(


embedding_dimension,0.0,0.00001,0.0001,0.001,0.01,0.1,1.0
i64,f64,f64,f64,f64,f64,f64,f64
2,0.054826,0.020867,0.008106,0.003367,0.002856,0.004238,0.004151
4,0.070553,0.045238,,0.002886,0.002421,0.004219,0.003879
8,0.079099,0.04832,0.008414,0.002207,0.003977,,0.004109
16,0.080918,0.03676,0.008352,0.003039,0.003094,0.004109,
32,0.080722,0.039426,0.009178,0.002863,,0.004344,0.004471
64,0.079383,,0.008537,0.003359,0.003974,0.003645,0.004336
128,0.077684,0.026922,0.008872,0.003315,0.004032,,0.005491
256,0.074986,,0.009208,0.002735,0.003894,0.00429,0.004465
512,0.075177,0.036303,0.009268,0.004035,0.003424,0.004574,0.00388
1024,0.073647,0.027489,0.009001,0.003141,0.003816,0.003391,0.004085


In [24]:
# Mean best-epoch test NDCG@50 for each embedding_dimension (rows) x
# l2_regularization (columns).
# `on=` replaces the deprecated `columns=` argument of DataFrame.pivot; the
# explicit index/values make a prior column sub-selection unnecessary.
experiment_summary.pivot(
    on="l2_regularization",
    index="embedding_dimension",
    values="best:epoch/test_ndcg@50",
)

  experiment_summary[["embedding_dimension", "l2_regularization", "best:epoch/test_ndcg@50"]].pivot(


embedding_dimension,0.0,0.00001,0.0001,0.001,0.01,0.1,1.0
i64,f64,f64,f64,f64,f64,f64,f64
2,0.106436,0.042713,0.022531,0.008748,0.006967,0.010557,0.010002
4,0.131094,0.090212,,0.007831,0.005739,0.009427,0.009827
8,0.144633,0.096512,0.021229,0.00593,0.008969,,0.009946
16,0.147691,0.074384,0.021692,0.008105,0.007887,0.009946,
32,0.147881,0.079614,0.023749,0.007688,,0.009647,0.010074
64,0.14602,,0.022169,0.008924,0.008209,0.008655,0.009003
128,0.144031,0.056749,0.022935,0.009317,0.008758,,0.010247
256,0.140562,,0.023848,0.007283,0.008645,0.010235,0.009814
512,0.139682,0.071196,0.023688,0.009883,0.008287,0.009475,0.008763
1024,0.138018,0.059109,0.023294,0.008187,0.008453,0.007566,0.009626


# Cross-GPU Training

In [10]:
# Mean run duration (minutes) per embedding dimension.
(
    experiment_runs
    .group_by("embedding_dimension")
    .agg(pl.mean("run_duration_minute"))
    .sort("embedding_dimension")
)

embedding_dimension,run_duration_minute
i64,f64
2,39.190645
4,40.23737
8,39.740251
16,38.22859
32,38.584058
64,37.962726
128,37.906583
256,40.633227
512,39.869674
1024,39.675106


In [None]:
# Mean run duration (minutes) per GPU type.
(
    experiment_runs
    .group_by("gpu_type")
    .agg(pl.mean("run_duration_minute"))
    .sort("gpu_type")
)

gpu_type,run_duration_minute
str,f64
"""NVIDIA A100-SXM4-40GB""",47.189427
"""NVIDIA A10G""",38.394305
"""NVIDIA L4""",46.713581


failed to send, dropping 1 traces to intake at http://localhost:8126/v0.5/traces after 3 retries, 2 additional messages skipped


In [None]:
# Restrict to runs without L2 regularization, then summarise per
# (embedding_dimension, gpu_type): number of runs and mean best-epoch metrics.
# NOTE: this rebinds `experiment_summary`, replacing any earlier summary table.
gpu_metrics = [
    "best:epoch/epoch",
    "best:epoch/test_recall@10",
    "best:epoch/test_ndcg@10",
    "best:epoch/test_recall@50",
    "best:epoch/test_ndcg@50",
]
unregularized_runs = experiment_runs.filter(pl.col("l2_regularization") == 0.)
experiment_summary = (
    unregularized_runs
    .group_by("embedding_dimension", "gpu_type")
    .agg(
        pl.col("run_id").count().alias("num_runs"),
        *[pl.col(metric).mean() for metric in gpu_metrics],
    )
    .sort("embedding_dimension", "gpu_type")
)
experiment_summary

embedding_dimension,gpu_type,num_runs,best:epoch/epoch,best:epoch/test_recall@10,best:epoch/test_ndcg@10,best:epoch/test_recall@50,best:epoch/test_ndcg@50
i64,str,u32,f64,f64,f64,f64,f64
2,"""NVIDIA A100-SXM4-40GB""",1,63.0,0.014696,0.059235,0.05268,0.105463
2,"""NVIDIA A10G""",14,57.214286,0.015012,0.058911,0.054863,0.106399
2,"""NVIDIA L4""",1,63.0,0.014849,0.058681,0.056454,0.10793
4,"""NVIDIA A100-SXM4-40GB""",1,54.0,0.01967,0.076773,0.071414,0.132074
4,"""NVIDIA A10G""",10,54.4,0.019497,0.075881,0.070607,0.131341
4,"""NVIDIA L4""",3,59.666667,0.019205,0.074449,0.070085,0.129945
8,"""NVIDIA A100-SXM4-40GB""",2,56.0,0.021582,0.082868,0.078556,0.142345
8,"""NVIDIA A10G""",16,54.3125,0.022172,0.086039,0.079129,0.14499
8,"""NVIDIA L4""",3,54.333333,0.021985,0.084684,0.0793,0.144255
16,"""NVIDIA A100-SXM4-40GB""",1,52.0,0.023087,0.089038,0.080538,0.148108


failed to send, dropping 1 traces to intake at http://localhost:8126/v0.5/traces after 3 retries, 4 additional messages skipped


In [16]:
# Number of runs for each embedding_dimension (rows) x gpu_type (columns).
# `on=` replaces the deprecated `columns=` argument of DataFrame.pivot; the
# explicit index/values make a prior column sub-selection unnecessary.
experiment_summary.pivot(
    on="gpu_type",
    index="embedding_dimension",
    values="num_runs",
)

  experiment_summary[["embedding_dimension", "gpu_type", "num_runs"]].pivot(


embedding_dimension,NVIDIA A100-SXM4-40GB,NVIDIA A10G,NVIDIA L4
i64,u32,u32,u32
2,1.0,14,1.0
4,1.0,10,3.0
8,2.0,16,3.0
16,1.0,20,
32,,20,3.0
64,1.0,15,
128,,17,
256,3.0,14,4.0
512,2.0,16,2.0
1024,1.0,16,


In [17]:
# Mean best-epoch test recall@50 for each embedding_dimension (rows) x
# gpu_type (columns).
# `on=` replaces the deprecated `columns=` argument of DataFrame.pivot; the
# explicit index/values make a prior column sub-selection unnecessary.
experiment_summary.pivot(
    on="gpu_type",
    index="embedding_dimension",
    values="best:epoch/test_recall@50",
)

  experiment_summary[["embedding_dimension", "gpu_type", "best:epoch/test_recall@50"]].pivot(


embedding_dimension,NVIDIA A100-SXM4-40GB,NVIDIA A10G,NVIDIA L4
i64,f64,f64,f64
2,0.05268,0.054863,0.056454
4,0.071414,0.070607,0.070085
8,0.078556,0.079129,0.0793
16,0.080538,0.080937,
32,,0.080756,0.0805
64,0.079803,0.079356,
128,,0.077684,
256,0.074252,0.075102,0.075132
512,0.07872,0.075183,0.071587
1024,0.081884,0.073133,


In [None]:
# Mean best-epoch test NDCG@50 for each embedding_dimension (rows) x
# gpu_type (columns).
# `on=` replaces the deprecated `columns=` argument of DataFrame.pivot; the
# explicit index/values make a prior column sub-selection unnecessary.
experiment_summary.pivot(
    on="gpu_type",
    index="embedding_dimension",
    values="best:epoch/test_ndcg@50",
)

  experiment_summary[["embedding_dimension", "gpu_type", "best:epoch/test_ndcg@50"]].pivot(


embedding_dimension,NVIDIA A100-SXM4-40GB,NVIDIA A10G,NVIDIA L4
i64,f64,f64,f64
2,0.105463,0.106399,0.10793
4,0.132074,0.131341,0.129945
8,0.142345,0.14499,0.144255
16,0.148108,0.14767,
32,,0.147998,0.147097
64,0.147484,0.145922,
128,,0.144031,
256,0.139429,0.140763,0.140708
512,0.144037,0.139711,0.135093
1024,0.150235,0.137255,


failed to send, dropping 1 traces to intake at http://localhost:8126/v0.5/traces after 3 retries, 2 additional messages skipped
