3_meta_hgb.ipynb

Final stacking layer combining base models (LightGBM + CamemBERTav2).

  1. Aggregates OOF predictions & engineers consensus features (vote sum, disagreement).
  2. Computes user-level stability stats.
  3. Trains a HistGradientBoosting meta-model using StratifiedGroupKFold to prevent leakage.


In [2]:
import os
import gc
import numpy as np
import pandas as pd
from pandas import json_normalize
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedGroupKFold

# configuration
# random seed and number of cross-validation folds used further down
SEED, N_FOLDS = 42, 5

# setup directories
# ROOT_DIR = "/content/drive/MyDrive/Colab Notebooks/code" # for google colab
ROOT_DIR = "."
OUT_DIR = os.path.join(ROOT_DIR, "intermediate")
os.makedirs(OUT_DIR, exist_ok=True)  # create the artefact folder if it is not there yet


def _in_root(filename):
    """Return the path of `filename` directly under ROOT_DIR."""
    return os.path.join(ROOT_DIR, filename)


def _in_out_dir(filename):
    """Return the path of `filename` inside OUT_DIR."""
    return os.path.join(OUT_DIR, filename)


# raw competition data (JSON lines)
TRAIN_JSONL = _in_root("train.jsonl")
TEST_JSONL = _in_root("kaggle_test.jsonl")

# predictions / features written by the previous notebooks
# --> they are the inputs of the stacking meta-model
OOF_LGBM_USER_FILE = _in_out_dir("oof_lgbm.csv")
TEST_LGBM_USER_FILE = _in_out_dir("test_lgbm.csv")
LGBM_FEATS_TRAIN_USER_FILE = _in_out_dir("lgbm_features_train.csv")
LGBM_FEATS_TEST_USER_FILE = _in_out_dir("lgbm_features_test.csv")

OOF_CAMEMBERT_FILE = _in_out_dir("oof_camembert.csv")
TEST_CAMEMBERT_FILE = _in_out_dir("test_camembert.csv")

# where the final predictions are written
SUB_OUT = _in_root("submission.csv")

# column names of the row identifier and of the target
ID_COL = "ID"
LABEL_COL = "label"


# load data and build user key
print("Loading JSONL files and creating user keys")


def _read_flat_jsonl(path):
    """Read a JSON-lines file; return the raw frame and its flattened version.

    Nested objects are expanded by json_normalize into dotted column names
    (the code below relies on the resulting 'user.created_at' column).
    """
    raw = pd.read_json(path, lines=True)
    return raw, json_normalize(raw.to_dict(orient="records"))


def _challenge_to_user(flat_df):
    """Build the challenge -> user lookup table.

    Keeps 'challenge_id' (renamed to ID_COL) and 'user.created_at', and adds
    'user_key', the string form of 'user.created_at'. No explicit user id is
    available, so the account creation value stands in for one; casting to
    str keeps later joins on this key consistent.
    """
    mapping = flat_df[["challenge_id", "user.created_at"]].rename(
        columns={"challenge_id": ID_COL}
    )
    return mapping.assign(user_key=lambda frame: frame["user.created_at"].astype(str))


raw_train, train_df_full = _read_flat_jsonl(TRAIN_JSONL)
raw_test, test_df_full = _read_flat_jsonl(TEST_JSONL)

user_train_map = _challenge_to_user(train_df_full)
user_test_map = _challenge_to_user(test_df_full)

print(f"Unique users in train: {user_train_map['user_key'].nunique()}")
print(f"Unique users in test:  {user_test_map['user_key'].nunique()}")


# load base model outputs (stacking layer 1)
# we load predictions made by base models (lightgbm, camembertav2)
# these probabilities become the input features (x) for our meta-model
print("Loading LightGBM user-level predictions")
oof_lgbm_user = pd.read_csv(OOF_LGBM_USER_FILE)
test_lgbm_user = pd.read_csv(TEST_LGBM_USER_FILE)

# create working copies
oof_train = oof_lgbm_user.copy()
test_train = test_lgbm_user.copy()

print("Loading CamemBERTa predictions")
camembert_trn_map, camembert_tst_map = None, None

# optional inputs
# only a *missing file* is tolerated: the code then continues without that model.
# any other problem (missing column, incompatible key dtype, ...) is a real error
# and is allowed to surface instead of being reported as "file not found"
try:
    camembert_trn = pd.read_csv(OOF_CAMEMBERT_FILE)
except FileNotFoundError:
    print("CamemBERTa OOF file not found - proceeding without it")
else:
    camembert_trn = camembert_trn.drop(columns=["user_label"], errors="ignore")
    # rename column for clarity and to avoid conflicts with lgbm cols
    camembert_trn = camembert_trn.rename(columns={"oof_proba": "camembert_proba"})
    # merge camemberta probs onto the user map to align data
    # validate: a user_key repeated in the prediction file would silently
    # multiply the rows of the map, so pandas is asked to raise instead
    camembert_trn_map = user_train_map.merge(
        camembert_trn[["user_key", "camembert_proba"]],
        on="user_key", how="left", validate="many_to_one",
    )

try:
    camembert_tst = pd.read_csv(TEST_CAMEMBERT_FILE)
except FileNotFoundError:
    print("CamemBERTa test file not found - proceeding without it")
else:
    camembert_tst_map = user_test_map.merge(
        camembert_tst[["user_key", "camembert_proba"]],
        on="user_key", how="left", validate="many_to_one",
    )


print("Loading LightGBM user features")
# the two feature files are used together: if either one is missing, both are dropped
try:
    lgbm_feats_trn = pd.read_csv(LGBM_FEATS_TRAIN_USER_FILE)
    lgbm_feats_tst = pd.read_csv(LGBM_FEATS_TEST_USER_FILE)
except FileNotFoundError:
    lgbm_feats_trn = lgbm_feats_tst = None
    print("LightGBM feature files not found")

# merge all features
# consolidate all signals (model predictions + engineered features) into one dataframe per split
print("Merging features")

# target vector, taken in the row order of the LightGBM OOF file
y = oof_train[LABEL_COL].values


def _left_join_on_id(left, right, columns=None):
    """Left-join `right` onto `left` by ID_COL; a missing (None) table is skipped.

    `columns` optionally restricts `right` to a subset of its columns first.
    validate="many_to_one" makes pandas raise a MergeError when `right`
    repeats an ID, instead of silently duplicating rows of `left`. The row
    count therefore stays fixed, which `y` above and the submission IDs
    depend on.
    """
    if right is None:
        return left
    if columns is not None:
        right = right[columns]
    return left.merge(right, on=ID_COL, how="left", validate="many_to_one")


# merge pre-calculated lightgbm features if available
train_merged = _left_join_on_id(oof_train.copy(), lgbm_feats_trn)
test_merged = _left_join_on_id(test_train.copy(), lgbm_feats_tst)

# merge camemberta probabilities if available
train_merged = _left_join_on_id(train_merged, camembert_trn_map, [ID_COL, "camembert_proba"])
test_merged = _left_join_on_id(test_merged, camembert_tst_map, [ID_COL, "camembert_proba"])

# finally, add the user keys back to the merged data for grouping
train_merged = _left_join_on_id(train_merged, user_train_map, [ID_COL, "user_key"])
test_merged = _left_join_on_id(test_merged, user_test_map, [ID_COL, "user_key"])


# meta feature engineering
# create new features based on the relationship between model predictions
# e.g. do the models agree? what is the mean confidence?

print("Creating meta-features")

# a base-model probability can only feed the meta-model when it exists for BOTH
# splits: the loaders tolerate a missing file per split, so one side may lack it
proba_cols = ["lightgbm_user_proba"]
camembert_in_train = "camembert_proba" in train_merged.columns
camembert_in_test = "camembert_proba" in test_merged.columns
if camembert_in_train and camembert_in_test:
    proba_cols.append("camembert_proba")
elif camembert_in_train or camembert_in_test:
    print("CamemBERTa probabilities available for one split only - ignoring them")


def _add_consensus_features(df, cols):
    """Add row-wise agreement features computed across the probability columns.

    Adds, in place, the mean and standard deviation over `cols`, one
    thresholded `hard_<col>` vote per column, the sum of those votes and,
    when 'camembert_proba' is among `cols`, the absolute gap between the
    LightGBM and CamemBERTa probabilities. Returns `df`.
    """
    df["models_mean_proba"] = df[cols].mean(axis=1)
    df["models_std_proba"] = df[cols].std(axis=1)  # high std = models disagree

    # convert probabilities to hard votes (0 or 1)
    for col in cols:
        # a missing probability compares False here, i.e. it counts as a 0 vote
        df[f"hard_{col}"] = (df[col] > 0.5).astype(int)

    # number of models voting for class 1 (between 0 and len(cols))
    df["models_vote_sum"] = df[[f"hard_{c}" for c in cols]].sum(axis=1)

    if "camembert_proba" in cols:
        df["delta_proba_lgbm_camembert"] = (
            df["lightgbm_user_proba"] - df["camembert_proba"]
        ).abs()
    return df


def _add_user_stats(df, cols):
    """Attach per-user mean / max / std of each probability column to every row.

    Is a user generally "easy" or "hard" to classify? The statistics are
    computed per 'user_key' within `df` and joined back on that key.
    """
    stats = df.groupby("user_key")[cols].agg(["mean", "max", "std"])
    # flatten the (column, statistic) pairs,
    # e.g. ('camembert_proba', 'mean') -> 'user_camembert_proba_mean'
    stats.columns = [f"user_{c}_{s}" for c, s in stats.columns.to_flat_index()]
    # NaN statistics (e.g. the std of a user with a single row) become 0
    stats = stats.reset_index().fillna(0)
    return df.merge(stats, on="user_key", how="left")


train_merged = _add_user_stats(_add_consensus_features(train_merged, proba_cols), proba_cols)
test_merged = _add_user_stats(_add_consensus_features(test_merged, proba_cols), proba_cols)

# feature selection
print("Selecting numeric features")

# define what not to train on (identifiers, strings, target label)
exclude_cols = {ID_COL, LABEL_COL, "user_key", "user.created_at"}

# automatically select all numeric columns available in train ...
numeric_cols = set(train_merged.select_dtypes(include=[np.number]).columns)
candidate_cols = numeric_cols - exclude_cols

# ... but keep only those the test set can also provide, otherwise indexing
# test_merged below would fail with a KeyError
missing_in_test = sorted(candidate_cols - set(test_merged.columns))
if missing_in_test:
    print(f"Dropping features absent from the test set: {missing_in_test}")
feature_cols = sorted(candidate_cols - set(missing_in_test))

# prepare numpy arrays for training (same column order for both splits)
X_train = train_merged[feature_cols].values
X_test = test_merged[feature_cols].values

# meta-model training
print("Training meta-model")

# HistGradientBoosting copes with NaNs on its own, so the features are passed as they are
hgb_params = dict(
    learning_rate=0.03,        # small steps: slower but more robust
    max_iter=800,              # upper bound on the number of trees
    max_depth=6,               # caps tree complexity to limit overfitting
    min_samples_leaf=60,       # avoids leaves fitted on noise/outliers
    l2_regularization=0.5,     # shrinks leaf values to smooth the model
    max_bins=255,              # resolution of the feature discretization
    random_state=SEED,         # reproducibility
    early_stopping=True,       # stop when the validation score stagnates
    validation_fraction=0.10,  # share of the fit data held out for early stopping
    n_iter_no_change=30,       # patience before stopping
)

# one group label per row; NaN keys (not expected) get a shared placeholder
# so the splitter does not choke on them
groups = train_merged["user_key"].fillna("NA_USER").values

# StratifiedGroupKFold
# - stratified: class proportions are kept similar across folds
# - group: all rows of a user land on the same side of a split (train OR val)
cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_proba = np.zeros(len(y))  # out-of-fold probability of class 1, filled fold by fold
test_proba_folds = []         # one test prediction vector per fold

# main training loop
for fold, (fit_rows, val_rows) in enumerate(cv.split(X_train, y, groups), start=1):
    clf = HistGradientBoostingClassifier(**hgb_params)
    clf.fit(X_train[fit_rows], y[fit_rows])

    # probability of class 1 on the held-out rows
    val_pred = clf.predict_proba(X_train[val_rows])[:, 1]
    oof_proba[val_rows] = val_pred

    # fold accuracy at the 0.5 threshold, for monitoring only
    val_labels = (val_pred >= 0.5).astype(int)
    acc = accuracy_score(y[val_rows], val_labels)
    print(f"Fold {fold}: accuracy = {acc:.4f}")

    # this fold's model also scores the whole test set
    test_fold_pred = clf.predict_proba(X_test)[:, 1]
    test_proba_folds.append(test_fold_pred)

# accuracy over all out-of-fold predictions
oof_labels = (oof_proba >= 0.5).astype(int)
total_acc = accuracy_score(y, oof_labels)
print(f"> Overall OOF Accuracy: {total_acc:.4f} <")

# generate submission
print("Exporting submission")

# the per-fold test predictions are averaged (bagging) for a steadier estimate
test_proba_mean = np.stack(test_proba_folds).mean(axis=0)

# binary decision at the 0.5 threshold, one row per test ID
predicted_labels = (test_proba_mean >= 0.5).astype(int)
submission = pd.DataFrame({"ID": test_merged[ID_COL].values})
submission["Predicted"] = predicted_labels
submission.to_csv(SUB_OUT, index=False)

print(f"Submission written to {SUB_OUT}")

Loading JSONL files and creating user keys
Unique users in train: 30696
Unique users in test:  20465
Loading LightGBM user-level predictions
Loading CamemBERTa predictions
Loading LightGBM user features
Merging features
Creating meta-features
Selecting numeric features
Training meta-model
Fold 1: accuracy = 0.8716
Fold 2: accuracy = 0.8723
Fold 3: accuracy = 0.8837
Fold 4: accuracy = 0.8809
Fold 5: accuracy = 0.8777
> Overall OOF Accuracy: 0.8772 <
Exporting submission
Submission written to ./submission.csv


```text
Loading JSONL files and creating user keys
Unique users in train: 30696
Unique users in test:  20465

Loading LightGBM user-level predictions
Loading CamemBERTa predictions
Loading LightGBM user features

Merging features
Creating meta-features
Selecting numeric features

Training meta-model
Fold 1: accuracy = 0.8716
Fold 2: accuracy = 0.8723
Fold 3: accuracy = 0.8837
Fold 4: accuracy = 0.8809
Fold 5: accuracy = 0.8777

> Overall OOF Accuracy: 0.8772 <

Exporting submission
Submission written to ./submission.csv
```
