#### **Imports**

In [1]:
import os, re, math, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt


from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

import xgboost as xgb
from packaging import version
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


warnings.filterwarnings("ignore")

INPUT = Path("/kaggle/input/linking-writing-processes-to-writing-quality")
if not INPUT.exists():
    INPUT = Path("linking-writing-processes-to-writing-quality")

TRAIN_LOGS = INPUT / "train_logs.csv"
TRAIN_SCORES = INPUT / "train_scores.csv"
TEST_LOGS  = INPUT / "test_logs.csv"
SAMPLE_SUB = INPUT / "sample_submission.csv"

WORK   = Path("/kaggle/working") if Path("/kaggle/working").exists() else Path(".")
SUBMIT = WORK / "submission.csv"

SEED = 42
N_FOLDS = 10
np.random.seed(SEED)

def rmse(y_true, y_pred):
    return math.sqrt(mean_squared_error(y_true, y_pred))


#### **Load data**

In [2]:
train_logs = pd.read_csv(TRAIN_LOGS)
train_scores = pd.read_csv(TRAIN_SCORES)
test_logs = pd.read_csv(TEST_LOGS)
sample_sub = pd.read_csv(SAMPLE_SUB)

for c in ["down_time","up_time","action_time","event_id","cursor_position","word_count"]:
    for df in (train_logs, test_logs):
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

print(train_logs.shape, train_scores.shape, test_logs.shape)


(8405898, 11) (2471, 2) (6, 11)


#### **Essay reconstruction**

In [3]:
class EssayBuilder:
    MOVE_RX = re.compile(r"Move From \[(\d+), (\d+)\] To \[(\d+), (\d+)\]")

    def reconstruct_all(self, df: pd.DataFrame) -> pd.DataFrame:
        cols = ["id","event_id","activity","cursor_position","text_change"]
        df = df[cols].sort_values(["id","event_id"]).reset_index(drop=True)
        out = []
        for eid, g in df.groupby("id", sort=False):
            out.append({"id": eid, "essay": self._rebuild_one(g)})
        return pd.DataFrame(out)

    def _rebuild_one(self, g: pd.DataFrame) -> str:
        text = ""
        for act, cur, chg in g[["activity","cursor_position","text_change"]].itertuples(index=False):
            cur = int(cur)
            chg = chg if isinstance(chg, str) else ""
            if act == "Nonproduction":
                continue
            elif act in ("Input", "Paste"):
                L = len(chg)
                start = max(cur - L, 0)
                text = text[:start] + chg + text[start:]
            elif act == "Remove/Cut":
                L = len(chg)
                text = text[:cur] + text[cur + L:]
            elif act == "Replace" and "=>" in chg:
                old, new = [p.strip() for p in chg.split("=>", 1)]
                start = max(cur - len(new), 0)
                end   = start + len(old)
                if end < start: end = start
                text = text[:start] + new + text[end:]
            elif isinstance(act, str) and act.startswith("Move From"):
                m = self.MOVE_RX.fullmatch(act.strip())
                if not m:
                    continue
                x1, y1, x2, y2 = map(int, m.groups())
                if x1 == x2:
                    continue
                if x1 < x2:
                    text = text[:x1] + text[y1:y2] + text[x1:y1] + text[y2:]
                else:
                    text = text[:x2] + text[x1:y1] + text[x2:x1] + text[y1:]
        return text

builder = EssayBuilder()
train_essays = builder.reconstruct_all(train_logs)
test_essays  = builder.reconstruct_all(test_logs)

train_essays.head()


Unnamed: 0,id,essay
0,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...
1,0022f953,"qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq..."
2,0042269b,qqqqqqqqqqq qq qqqqq qqqqqqqqq qq qqqqqqqqqqq ...
3,0059420b,qq qqqqqqq qqqqqq qqqqqqqqqqqqq qqqq q qqqq qq...
4,0075873a,"qqqqqqqqqqq qq qqq qqqqq qq qqqqqqqqqq, qqq qq..."


#### **Feature engineering**

In [4]:
WORD_RX = re.compile(r"[A-Za-z']+")

def q_words(text: str):
    return WORD_RX.findall(text or "")

def word_lengths(text: str):
    return [len(w) for w in q_words(text)]

def paragraph_word_lengths(text: str):
    paras = [p for p in (text or "").split("\n")]
    pls = []
    for p in paras:
        if p.strip() == "":
            continue
        pls.append(len(WORD_RX.findall(p)))
    return pls

def count_chars_typed_from_logs(g: pd.DataFrame) -> int:
    total = 0
    for act, chg in g[["activity","text_change"]].itertuples(index=False):
        if not isinstance(chg, str):
            continue
        if act in ("Input", "Paste"):
            total += len(chg)
        elif act == "Replace" and "=>" in chg:
            old, new = [p.strip() for p in chg.split("=>", 1)]
            total += len(new)
    return total

def median_iki_with_lag(g: pd.DataFrame, lag: int) -> float:
    sub = g[g["activity"].isin(["Input","Remove/Cut"])].copy()
    sub["up_time_lag"] = sub.groupby("id")["up_time"].shift(lag)
    val = (sub["down_time"] - sub["up_time_lag"]).dropna()
    val = val[val >= 0]
    return float(val.median()) if len(val) else 0.0

def input_bursts_count(g: pd.DataFrame) -> int:
    sub = g[g["activity"].isin(["Input","Remove/Cut"])].copy()
    sub["up_time_lag"] = sub.groupby("id")["up_time"].shift(1)
    sub["gap"] = (sub["down_time"] - sub["up_time_lag"])
    sub = sub.dropna()
    sub = sub[sub["gap"] >= 0]
    is_fast = (sub["gap"] < 2000).astype(int).values
    if len(is_fast) == 0:
        return 0
    runs = 0
    prev = 0
    for z in is_fast:
        if z == 1 and prev == 0:
            runs += 1
        prev = z
    return int(runs)

def punctuation_counts(text: str):
    t = text or ""
    return dict(
        n_dot=t.count("."),
        n_comma=t.count(","),
        n_hyphen=t.count("-"),
        n_space=t.count(" "),
        n_line_breaks=t.count("\n"),
        n_spec_char=sum(1 for ch in t if (not ch.isalnum()) and (not ch.isspace()))
    )

def typo_like_counts(text: str):
    t = text or ""
    return dict(n_typos_dot_comma = len(re.findall(r"\s[.,]", t)))

def build_features(df_logs: pd.DataFrame, df_essays: pd.DataFrame) -> pd.DataFrame:
    df = df_logs.sort_values(["id","event_id"]).copy()
    df["next_down_time"] = df.groupby("id")["down_time"].shift(-1)
    df["gap_ms"] = df["next_down_time"] - df["up_time"]
    df["gap_ms"] = df["gap_ms"].where(df["gap_ms"] >= 0)

    def is_backspace(s):
        s = str(s).lower()
        return ("backspace" in s) or ("back space" in s) or (s == "back")
    def is_shift(s):
        return str(s).lower() == "shift"
    def is_space(s):
        return str(s).lower() == "space"

    df["is_backspace"] = df["down_event"].map(is_backspace) | df["up_event"].map(is_backspace)
    df["is_shift"]     = df["down_event"].map(is_shift)     | df["up_event"].map(is_shift)
    df["is_space"]     = df["down_event"].map(is_space)     | df["up_event"].map(is_space)

    rows = []
    essay_map = dict(zip(df_essays["id"].values, df_essays["essay"].values))

    for eid, g in df.groupby("id"):
        t0, t1 = g["down_time"].min(), g["up_time"].max()
        event_count = int(len(g))
        vc = g["activity"].astype(str).value_counts()
        c_input   = int(vc.get("Input", 0))
        c_remove  = int(vc.get("Remove/Cut", 0))
        c_paste   = int(vc.get("Paste", 0))
        c_replace = int(vc.get("Replace", 0))
        c_move    = sum(int(v) for k, v in vc.items() if isinstance(k, str) and k.startswith("Move From"))
        edit_cnt  = c_remove + c_paste + c_replace + c_move

        p_input  = (c_input / event_count) if event_count else 0.0
        p_shift  = float(g["is_shift"].mean()) if event_count else 0.0
        p_space  = float(g["is_space"].mean()) if event_count else 0.0

        gaps = g["gap_ms"].dropna()
        gaps = gaps[gaps >= 0]
        med_pause = float(gaps.median()) if len(gaps) else 0.0
        n_pause_3s = int((gaps > 3000).sum())

        med_IKI      = median_iki_with_lag(g, 1)
        med_IKI_lag2 = median_iki_with_lag(g, 2)
        med_IKI_lag3 = median_iki_with_lag(g, 3)
        med_IKI_lag4 = median_iki_with_lag(g, 4)

        s_action_time = float(g["action_time"].std()) if g["action_time"].notna().sum() else 0.0
        total_typed_chars = count_chars_typed_from_logs(g)

        essay_text = essay_map.get(eid, "")
        prod_total_char = len(essay_text)
        punct = punctuation_counts(essay_text)
        typo_like = typo_like_counts(essay_text)

        wl = word_lengths(essay_text)
        sd_length_word = float(np.std(wl)) if wl else 0.0

        def count_len(k): 
            return int(np.sum(np.array(wl) == k)) if wl else 0
        n_q2  = count_len(2)
        n_q6  = count_len(6)
        n_q7  = count_len(7)
        n_q8  = count_len(8)
        n_q10 = count_len(10)

        rng = [w for w in wl if 8 <= w <= 12]
        seq_max_min_8_12 = (max(rng) - min(rng)) if rng else 0

        pls = paragraph_word_lengths(essay_text)
        mean_paragraph = float(np.mean(pls)) if pls else 0.0

        total_words = max(1, len(wl))
        p_dot   = punct["n_dot"]   / total_words
        p_comma = punct["n_comma"] / total_words

        n_backspace = int(g["is_backspace"].sum())
        n_shift     = int(g["is_shift"].sum())
        burst_input = input_bursts_count(g)

        t20 = (t0 + 20*60*1000) if pd.notnull(t0) else None
        if t20 is not None and "word_count" in g.columns and g["word_count"].notna().any():
            before_20 = g.loc[g["down_time"] <= t20, "word_count"].max() if (g["down_time"] <= t20).any() else 0
            max_wc = int(g["word_count"].max()) if g["word_count"].notna().any() else len(wl)
            n_words_after_20m = int(max_wc - (before_20 if pd.notnull(before_20) else 0))
        else:
            n_words_after_20m = 0

        if t0 is not None and pd.notnull(t0):
            t30 = t0 + 30*60*1000
            if "word_count" in g.columns and g["word_count"].notna().any():
                wc_0 = g.loc[g["down_time"] >= t0, "word_count"].min()
                wc_30 = g.loc[g["up_time"] <= t30, "word_count"].max() if (g["up_time"] <= t30).any() else wc_0
                dt = 30.0
                mean_input_30m = float((wc_30 - wc_0) / max(dt, 1e-9)) if wc_0 is not None and wc_30 is not None else 0.0
            else:
                mean_input_30m = 0.0
        else:
            mean_input_30m = 0.0

        prod_lbreak_shift = punct["n_line_breaks"] * (p_shift if not np.isnan(p_shift) else 0.0)

        # production curve deciles (time-normalized)
        if pd.notnull(t0) and pd.notnull(t1) and t1 > t0:
            T = float(t1 - t0)
            g2 = g.copy()
            g2["t_norm"] = (g2["down_time"] - t0) / T
            if "word_count" in g2.columns and g2["word_count"].notna().any():
                series = g2[["t_norm","word_count"]].dropna().sort_values("t_norm")
                wc_times = series["t_norm"].values
                wc_vals  = series["word_count"].values
            else:
                g2["ins_len"] = 0
                m_in = g2["activity"].isin(["Input","Paste"])
                g2.loc[m_in, "ins_len"] = g2.loc[m_in, "text_change"].fillna("").map(len)
                series = g2[["t_norm","ins_len"]].sort_values("t_norm")
                wc_times = series["t_norm"].values
                wc_vals  = np.cumsum(series["ins_len"].values)
            deciles = np.linspace(0.1, 1.0, 10)
            prod_share = []
            if len(wc_vals) >= 2 and wc_vals[-1] > 0:
                total = float(wc_vals[-1])
                for d in deciles:
                    idx = np.searchsorted(wc_times, d, side="right") - 1
                    idx = np.clip(idx, 0, len(wc_vals) - 1)
                    prod_share.append(float(wc_vals[idx] / total))
            else:
                prod_share = [0.0] * 10
            early_20 = prod_share[1]
            mid_50   = prod_share[4]
            late_80  = prod_share[7]
            early_late_ratio = (early_20 + 1e-9) / (1.0 - late_80 + 1e-9)
        else:
            prod_share = [0.0] * 10
            early_20 = mid_50 = late_80 = early_late_ratio = 0.0

        # pause distribution shape
        p50 = float(gaps.quantile(0.50)) if len(gaps) else 0.0
        p75 = float(gaps.quantile(0.75)) if len(gaps) else 0.0
        p90 = float(gaps.quantile(0.90)) if len(gaps) else 0.0
        p95 = float(gaps.quantile(0.95)) if len(gaps) else 0.0
        p99 = float(gaps.quantile(0.99)) if len(gaps) else 0.0

        # edit intensity + timing & backspace streaks
        edit_total = float(edit_cnt)
        edit_per_100_words = (edit_total / max(1, len(wl))) * 100.0
        backspace_per_100_words = (n_backspace / max(1, len(wl))) * 100.0
        if pd.notnull(t0) and pd.notnull(t1) and t1 > t0:
            cutoff = t0 + 0.8 * (t1 - t0)
            late_edits = g[(g["down_time"] >= cutoff) & g["activity"].isin(["Remove/Cut","Replace","Paste"])].shape[0]
            frac_late_edits = float(late_edits / max(1, edit_cnt))
        else:
            frac_late_edits = 0.0

        m_bs = g["is_backspace"].values.astype(int)
        streaks, cur = [], 0
        for z in m_bs:
            if z: cur += 1
            elif cur > 0:
                streaks.append(cur); cur = 0
        if cur > 0: streaks.append(cur)
        bs_streak_max = int(max(streaks)) if streaks else 0
        bs_streak_mean = float(np.mean(streaks)) if streaks else 0.0

        act_probs = g["activity"].astype(str).value_counts(normalize=True).values
        act_entropy = float(-(act_probs * np.log(act_probs + 1e-12)).sum())

        session_min = ((t1 - t0) / 60000.0) if pd.notnull(t0) and pd.notnull(t1) and t1 > t0 else 0.0
        events_per_min = (event_count / max(session_min, 1e-9)) if session_min > 0 else 0.0

        move_ratio    = c_move    / max(1, event_count)
        replace_ratio = c_replace / max(1, event_count)
        paste_ratio   = c_paste   / max(1, event_count)
        edit_share    = edit_cnt  / max(1, event_count)

        sub_ir = g[g["activity"].isin(["Input","Remove/Cut"])].copy()
        sub_ir["up_time_lag"] = sub_ir["up_time"].shift(1)
        iki = (sub_ir["down_time"] - sub_ir["up_time_lag"]).dropna()
        iki = iki[iki >= 0]
        iki_std = float(np.std(iki)) if len(iki) else 0.0
        iki_cv  = float(iki_std / (np.median(iki)+1e-9)) if len(iki) else 0.0

        rows.append({
            "id": eid,
            "prod_total_char": prod_total_char,
            "_total_char": total_typed_chars,
            **punct, **typo_like,
            "p_dot": p_dot, "p_comma": p_comma, "p_space": p_space,
            "sd_length_word": sd_length_word,
            "n_q2": n_q2, "n_q6": n_q6, "n_q7": n_q7, "n_q8": n_q8, "n_q10": n_q10,
            "seq_max_min_8_12": seq_max_min_8_12,
            "mean_paragraph": mean_paragraph,
            "med_IKI": med_IKI, "med_IKI_lag2": med_IKI_lag2, "med_IKI_lag3": med_IKI_lag3, "med_IKI_lag4": med_IKI_lag4,
            "med_pause": med_pause, "n_pause_3s": n_pause_3s, "s_action_time": s_action_time,
            "burst_input": burst_input, "n_backspace": n_backspace, "n_shift": n_shift,
            "p_shift": p_shift, "p_input": p_input,
            "n_words_after_20m": n_words_after_20m, "mean_input_30m": mean_input_30m,
            "prod_lbreak_shift": prod_lbreak_shift,
            **{f"prod_d{int((i+1)*10)}": prod_share[i] for i in range(10)},
            "prod_early20": early_20, "prod_mid50": mid_50, "prod_late80": late_80, "early_late_ratio": early_late_ratio,
            "pause_p50": p50, "pause_p75": p75, "pause_p90": p90, "pause_p95": p95, "pause_p99": p99,
            "edit_per_100_words": edit_per_100_words, "backspace_per_100_words": backspace_per_100_words,
            "frac_late_edits": frac_late_edits,
            "bs_streak_max": bs_streak_max, "bs_streak_mean": bs_streak_mean,
            "act_entropy": act_entropy,
            "session_min": session_min, "events_per_min": events_per_min,
            "move_ratio": move_ratio, "replace_ratio": replace_ratio, "paste_ratio": paste_ratio, "edit_share": edit_share,
            "iki_std": iki_std, "iki_cv": iki_cv,
        })
    return pd.DataFrame(rows)

train_feats = build_features(train_logs, train_essays)
test_feats  = build_features(test_logs,  test_essays)

df_train = train_feats.merge(train_scores, on="id", how="left")
feature_cols = [c for c in train_feats.columns if c != "id"]
print("Train features shape:", train_feats.shape, "| Test features shape:", test_feats.shape)


Train features shape: (2471, 69) | Test features shape: (3, 69)


#### **Params**

In [5]:
X = df_train[feature_cols].fillna(0.0)
y = df_train["score"].astype(float).values
X_test = test_feats[feature_cols].fillna(0.0)

TREE_METHOD = "gpu_hist" if os.environ.get("CUDA_VISIBLE_DEVICES") not in (None, "") else "hist"
USE_GPU_CAT = (os.environ.get("CUDA_VISIBLE_DEVICES") not in (None, ""))

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# XGB
xgb_best_params = {
    'n_estimators': 3136,
    'learning_rate': 0.010121181166477724,
    'max_depth': 5,
    'min_child_weight': 5.111879961449728,
    'gamma': 0.8033271268511519,
    'subsample': 0.6752734231760704,
    'colsample_bytree': 0.5573188522513011,
    'reg_alpha': 0.8909929789441696,
    'reg_lambda': 0.9941415870261027,
    'max_bin': 452,
    'grow_policy': 'depthwise',
    'objective': 'reg:squarederror',
    'tree_method': TREE_METHOD,
    'random_state': SEED,
    'n_jobs': -1,
}

# CatBoost
cat_best_params = {
    'iterations': 5915,
    'learning_rate': 0.015600644582174339,
    'depth': 5,
    'l2_leaf_reg': 1.7350901516859896,
    'random_strength': 0.07440670993932308,
    'bagging_temperature': 1.4205571202737526,
    'subsample': 0.5653366003208794,
    'leaf_estimation_iterations': 8,
    'loss_function': 'RMSE',
}

# LightGBM
lgb_best_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.03554388601967273,
    'num_leaves': 105,
    'max_depth': 4,
    'lambda_l1': 2.0306366179907154,
    'lambda_l2': 3.502509429630488,
    'min_data_in_leaf': 18,
    'min_sum_hessian_in_leaf': 0.003416106525862324,
    'min_gain_to_split': 0.39791230620879725,
    'feature_fraction': 0.6164560326658721,
    'bagging_fraction': 0.9835406089972294,
    'bagging_freq': 2,
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
}
lgb_num_boost_round = 6815

# Extra Trees
extra_trees_best_params = {
  'n_estimators': 1160, 
  'max_depth': 40, 
  'min_samples_split': 8, 
  'min_samples_leaf': 1, 
  'max_features': 0.49666738375873354, 
  'bootstrap': False
}

# kNN
knn_best_params = {
  'n_neighbors': 15,
  'weights': 'distance', 
  'p': 1, 
  'leaf_size': 18
}

# Ridge
ridge_best_params = {
  'alpha': 99.99512488296149
}


#### **Helpers**

In [6]:
def rmse_np(a, b):
    return float(np.mean((a - b) ** 2)) ** 0.5

def best_iter_predict_xgb(model, X_):
    if hasattr(model, "best_iteration") and model.best_iteration is not None:
        try:
            return model.predict(X_, iteration_range=(0, model.best_iteration))
        except TypeError:
            return model.predict(X_, ntree_limit=getattr(model, "best_ntree_limit", 0))
    return model.predict(X_)

def train_xgb_with_es(params, xtr, ytr, xva, yva):
    model = xgb.XGBRegressor(**params)
    if version.parse(xgb.__version__) >= version.parse("2.0.0"):
        callbacks = [xgb.callback.EarlyStopping(rounds=300, save_best=True, data_name="validation_0", metric_name="rmse")]
        model.fit(xtr, ytr, eval_set=[(xva, yva)], eval_metric="rmse", verbose=False, callbacks=callbacks)
    else:
        model.fit(xtr, ytr, eval_set=[(xva, yva)], eval_metric="rmse", verbose=False, early_stopping_rounds=300)
    return model

def train_cat_with_es(params, xtr, ytr, xva, yva, use_gpu):
    model = CatBoostRegressor(
        **params,
        task_type=("GPU" if use_gpu else "CPU"),
        random_state=SEED,
        verbose=False
    )
    train_pool = Pool(xtr, ytr)
    valid_pool = Pool(xva, yva)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=300, use_best_model=True, verbose=False)
    return model


#### **5-fold CV (XGB + CAT + LGB)**

In [7]:
# OOF holders
oof_xgb = np.zeros(len(X)); oof_cat = np.zeros(len(X)); oof_lgb = np.zeros(len(X))
oof_et  = np.zeros(len(X)); oof_rg  = np.zeros(len(X)); oof_knn = np.zeros(len(X))

# Test fold predictions
test_preds_xgb_folds, test_preds_cat_folds, test_preds_lgb_folds = [], [], []
test_preds_et_folds,  test_preds_rg_folds,  test_preds_knn_folds  = [], [], []

for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    xtr, xva = X.iloc[trn_idx], X.iloc[val_idx]           # F2
    ytr, yva = y[trn_idx], y[val_idx]

    # XGBoost (F2)
    xgb_model = train_xgb_with_es(xgb_best_params, xtr, ytr, xva, yva)
    pv_xgb = best_iter_predict_xgb(xgb_model, xva)
    oof_xgb[val_idx] = pv_xgb
    test_preds_xgb_folds.append(best_iter_predict_xgb(xgb_model, X_test))

    # CatBoost (F2)
    cat_model = train_cat_with_es(cat_best_params, xtr, ytr, xva, yva, USE_GPU_CAT)
    pv_cat = cat_model.predict(xva)
    oof_cat[val_idx] = pv_cat
    test_preds_cat_folds.append(cat_model.predict(X_test))

    # LightGBM (F2; train API)
    dtr = lgb.Dataset(xtr, ytr); dva = lgb.Dataset(xva, yva, reference=dtr)
    lgbm = lgb.train(
        lgb_best_params,
        dtr,
        num_boost_round=lgb_num_boost_round,
        valid_sets=[dva],
        callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=False),
                   lgb.log_evaluation(period=0)]
    )
    pv_lgb = lgbm.predict(xva, num_iteration=lgbm.best_iteration)
    oof_lgb[val_idx] = pv_lgb
    test_preds_lgb_folds.append(lgbm.predict(X_test, num_iteration=lgbm.best_iteration))

    # ExtraTrees (F2)
    et_model = ExtraTreesRegressor(
        **extra_trees_best_params,
        random_state=SEED,
        n_jobs=-1
    )
    et_model.fit(xtr, ytr)
    pv_et = et_model.predict(xva)
    oof_et[val_idx] = pv_et
    test_preds_et_folds.append(et_model.predict(X_test))

    # KNN (F2) with scaling
    knn_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsRegressor(**knn_best_params))
    ])
    knn_pipe.fit(xtr, ytr)
    pv_knn = knn_pipe.predict(xva)
    oof_knn[val_idx] = pv_knn
    test_preds_knn_folds.append(knn_pipe.predict(X_test))

    # Ridge (F2) with scaling
    ridge_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(**ridge_best_params, random_state=SEED))
    ])
    ridge_pipe.fit(xtr, ytr)
    pv_rg = ridge_pipe.predict(xva)
    oof_rg[val_idx] = pv_rg
    test_preds_rg_folds.append(ridge_pipe.predict(X_test))

    print(
        f"Fold {fold} | "
        f"XGB {rmse_np(yva, pv_xgb):.5f} | "
        f"CAT {rmse_np(yva, pv_cat):.5f} | "
        f"LGB {rmse_np(yva, pv_lgb):.5f} | "
        f"ET {rmse_np(yva, pv_et):.5f} | "
        f"RIDGE {rmse_np(yva, pv_rg):.5f} | "
        f"KNN {rmse_np(yva, pv_knn):.5f}"
    )

# Single-model OOF scores
rmse_xgb = rmse_np(y, oof_xgb)
rmse_cat = rmse_np(y, oof_cat)
rmse_lgb = rmse_np(y, oof_lgb)
rmse_et  = rmse_np(y, oof_et)
rmse_rg  = rmse_np(y, oof_rg)
rmse_knn = rmse_np(y, oof_knn)
print(
    f"OOF RMSE — XGB: {rmse_xgb:.5f} | CAT: {rmse_cat:.5f} | LGB: {rmse_lgb:.5f} | "
    f"ET: {rmse_et:.5f} | RIDGE: {rmse_rg:.5f} | KNN: {rmse_knn:.5f}"
)


Fold 0 | XGB 0.57919 | CAT 0.57404 | LGB 0.57941 | ET 0.59355 | RIDGE 0.61927 | KNN 0.63610
Fold 1 | XGB 0.51020 | CAT 0.51273 | LGB 0.50603 | ET 0.51546 | RIDGE 0.55003 | KNN 0.58027
Fold 2 | XGB 0.65643 | CAT 0.65754 | LGB 0.65543 | ET 0.66359 | RIDGE 0.74116 | KNN 0.70498
Fold 3 | XGB 0.60569 | CAT 0.60776 | LGB 0.61446 | ET 0.64173 | RIDGE 0.67011 | KNN 0.73048
Fold 4 | XGB 0.58983 | CAT 0.58413 | LGB 0.59611 | ET 0.59606 | RIDGE 0.61831 | KNN 0.66668
Fold 5 | XGB 0.60040 | CAT 0.60450 | LGB 0.61438 | ET 0.60569 | RIDGE 0.63963 | KNN 0.69435
Fold 6 | XGB 0.61656 | CAT 0.61968 | LGB 0.63311 | ET 0.65492 | RIDGE 0.69295 | KNN 0.71438
Fold 7 | XGB 0.62079 | CAT 0.61664 | LGB 0.62525 | ET 0.63684 | RIDGE 0.65332 | KNN 0.67658
Fold 8 | XGB 0.60543 | CAT 0.60394 | LGB 0.61701 | ET 0.64228 | RIDGE 0.66207 | KNN 0.71348
Fold 9 | XGB 0.58544 | CAT 0.58593 | LGB 0.58406 | ET 0.58405 | RIDGE 0.65741 | KNN 0.60914
OOF RMSE — XGB: 0.59805 | CAT: 0.59774 | LGB: 0.60376 | ET: 0.61484 | RIDGE: 0.6

#### **Ridge stack & submission**

In [8]:

# Ridge stacking over 6 models
Z = np.column_stack([oof_xgb, oof_cat, oof_lgb, oof_et, oof_rg, oof_knn])
meta = Ridge(alpha=0.01, fit_intercept=True, random_state=SEED)
meta.fit(Z, y)
oof_blend = meta.predict(Z)
rmse_blend = rmse_np(y, oof_blend)
print(f"OOF RMSE — Ridge blend (XGB+CAT+LGB+ET+RIDGE+KNN): {rmse_blend:.5f}")
print("Blend coefficients:", meta.coef_, "intercept:", meta.intercept_)


# Build test predictions
pred_test_xgb = np.mean(np.column_stack(test_preds_xgb_folds), axis=1)
pred_test_cat = np.mean(np.column_stack(test_preds_cat_folds), axis=1)
pred_test_lgb = np.mean(np.column_stack(test_preds_lgb_folds), axis=1)
pred_test_et  = np.mean(np.column_stack(test_preds_et_folds),  axis=1)
pred_test_rg  = np.mean(np.column_stack(test_preds_rg_folds),  axis=1)
pred_test_knn = np.mean(np.column_stack(test_preds_knn_folds), axis=1)

Z_test = np.column_stack([pred_test_xgb, pred_test_cat, pred_test_lgb,
                          pred_test_et, pred_test_rg, pred_test_knn])
pred_test = np.clip(meta.predict(Z_test), 0.0, 6.0)

# Save submission + summary
sub = sample_sub.copy()
sub = sub[["id"]].merge(pd.DataFrame({"id": test_feats["id"], "score": pred_test}), on="id", how="left")
sub.to_csv(SUBMIT, index=False)


print("Saved submission to:", SUBMIT)
print("Done.")


OOF RMSE — Ridge blend (XGB+CAT+LGB+ET+RIDGE+KNN): 0.59473
Blend coefficients: [ 0.55891823  0.50401011 -0.07178017 -0.00623127  0.11177279 -0.08529851] intercept: -0.02718735481870338
Saved submission to: submission.csv
Done.
