# Data Exploration

Testing the data processing utilities with base table and static_0.

In [None]:
import sys
sys.path.insert(0, "..")

import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_processing import (
    load_table_group,
    downcast_dtypes,
    drop_high_missing_cols,
    drop_high_cardinality_string_cols,
    preprocess_table,
    get_table_info,
)
from src.features import (
    handle_dates, create_domain_ratios,
    aggregate_depth1, aggregate_depth2,
    drop_correlated_columns, collapse_rare_categories, remove_drift_features,
)
from src.metrics import gini_stability

# Give every figure in the notebook the same look: seaborn "whitegrid" style
# with the "muted" palette, rendered at 120 dpi on a white figure background.
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({"figure.dpi": 120, "figure.facecolor": "white"})

In [None]:
# Data directory passed to every load_table_group call below (relative path).
DATA_PATH = "../data/"

## Load Base Table

In [None]:
# Load the train split of the base table, report its shape, and show the first
# rows as the cell output.
base = load_table_group(DATA_PATH, "base", split="train")
print(f"Base table shape: {base.shape}")
base.head()

In [None]:
# Summarise the raw base table; the returned info is displayed as cell output.
get_table_info(base)

In [None]:
# Run the preprocessing pipeline on the base table and summarise the result so
# it can be compared with the raw summary from the previous cell.
base_processed = preprocess_table(base)
print(f"\nAfter preprocessing: {base_processed.shape}")
get_table_info(base_processed)

## Load Static_0 Table

This table has multiple chunks (static_0_0, static_0_1, etc.) that need to be concatenated.

In [None]:
# Load the train split of static_0 (expected to concatenate all of its chunks),
# report the shape, and show the first rows as the cell output.
static_0 = load_table_group(DATA_PATH, "static_0", split="train")
print(f"Static_0 table shape: {static_0.shape}")
static_0.head()

In [None]:
# Snapshot static_0's summary before any preprocessing. `info_before` is reused
# by the downcast cell below as the memory baseline.
info_before = get_table_info(static_0)
print(f"Shape: {info_before['shape']}")
print(f"Memory: {info_before['estimated_memory_mb']:.2f} MB")
print(f"Dtype counts: {info_before['dtype_counts']}")
print(f"Columns with >50% missing: {len(info_before['columns_with_high_missing'])}")

In [None]:
# Exercise downcast_dtypes on static_0 and report how much estimated memory it
# saves relative to the raw table summarised in `info_before`.
static_0_downcasted = downcast_dtypes(static_0)
info_downcasted = get_table_info(static_0_downcasted)

mem_before_mb = info_before["estimated_memory_mb"]
mem_after_mb = info_downcasted["estimated_memory_mb"]
reduction_pct = (1 - mem_after_mb / mem_before_mb) * 100

print(f"Memory before downcast: {mem_before_mb:.2f} MB")
print(f"Memory after downcast: {mem_after_mb:.2f} MB")
print(f"Memory reduction: {reduction_pct:.1f}%")

In [None]:
# Exercise drop_high_missing_cols on the raw static_0 table with a 0.98
# threshold and compare the column count before and after.
print(f"Columns before: {static_0.shape[1]}")
static_0_no_missing = drop_high_missing_cols(static_0, threshold=0.98)
print(f"Columns after (threshold=0.98): {static_0_no_missing.shape[1]}")

In [None]:
# Exercise drop_high_cardinality_string_cols on the raw static_0 table with a
# 10,000 unique-value limit and report the remaining column count.
static_0_no_high_card = drop_high_cardinality_string_cols(static_0, max_unique=10_000)
print(f"Columns after dropping high-cardinality strings: {static_0_no_high_card.shape[1]}")

In [None]:
# Apply the full preprocessing pipeline to static_0. `static_0_processed` is
# the version joined into the training frame in the merge section further down.
static_0_processed = preprocess_table(static_0)
print(f"\nFinal shape after full preprocessing: {static_0_processed.shape}")
get_table_info(static_0_processed)

---

# Exploratory Data Analysis

## (a) Target Distribution & Temporal Drift

In [None]:
# Class balance: one row per target value with its frequency.
target_counts = base["target"].value_counts().sort("target").to_pandas()
total = target_counts["count"].sum()
# Sum the matching rows instead of indexing `.values[0]`, so a frame without
# any positive rows yields a 0% default rate rather than an IndexError.
n_defaults = target_counts.loc[target_counts["target"] == 1, "count"].sum()
default_rate = n_defaults / total

fig, axes = plt.subplots(1, 2, figsize=(13, 4))

# Left panel: bar per target class, annotated with the count and its share.
ax = axes[0]
bars = ax.bar(
    target_counts["target"].astype(str),
    target_counts["count"],
    color=["#4C72B0", "#DD8452"],
    edgecolor="black",
    linewidth=0.5,
)
for bar, count in zip(bars, target_counts["count"]):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
            f"{count:,}\n({count/total:.1%})", ha="center", va="bottom", fontsize=9)
ax.set_xlabel("Target")
ax.set_ylabel("Count")
ax.set_title(f"Target Distribution (default rate = {default_rate:.2%})")
ax.ticklabel_format(axis="y", style="plain")

# Per-week default rate and case count; `weekly` is reused by the case-volume
# cell that follows.
weekly = (
    base.group_by("WEEK_NUM")
    .agg([
        pl.col("target").mean().alias("default_rate"),
        pl.col("target").count().alias("n_cases"),
    ])
    .sort("WEEK_NUM")
    .to_pandas()
)

# Right panel: weekly default rate with a degree-1 least-squares trend line.
ax = axes[1]
ax.plot(weekly["WEEK_NUM"], weekly["default_rate"], color="#4C72B0", linewidth=1.2)
z = np.polyfit(weekly["WEEK_NUM"], weekly["default_rate"], 1)
ax.plot(weekly["WEEK_NUM"], np.polyval(z, weekly["WEEK_NUM"]),
        "--", color="#DD8452", linewidth=1.5, label=f"trend (slope={z[0]:.5f})")
ax.set_xlabel("WEEK_NUM")
ax.set_ylabel("Default Rate")
ax.set_title("Default Rate by Week (temporal drift)")
ax.legend(fontsize=9)

fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the number of cases per week, using the `weekly` frame built in
# the target-distribution cell. width=1.0 makes adjacent weekly bars touch.
fig, ax = plt.subplots(figsize=(13, 3))
ax.bar(weekly["WEEK_NUM"], weekly["n_cases"], color="#4C72B0", edgecolor="none", width=1.0)
ax.set_xlabel("WEEK_NUM")
ax.set_ylabel("Number of Cases")
ax.set_title("Case Volume by Week")
# Show plain integers on the y axis instead of scientific notation.
ax.ticklabel_format(axis="y", style="plain")
fig.tight_layout()
plt.show()

## (b) Missing Rates Across Table Groups

In [None]:
# Every table group inspected for missing values.
TABLE_GROUPS = [
    "base", "static_0", "static_cb_0",
    "person_1", "person_2",
    "applprev_1", "applprev_2",
    "credit_bureau_a_1", "credit_bureau_a_2",
    "credit_bureau_b_1", "credit_bureau_b_2",
    "debitcard_1", "deposit_1", "other_1",
    "tax_registry_a_1", "tax_registry_b_1", "tax_registry_c_1",
]

# One record per (table group, column) holding that column's null fraction.
missing_summary = []
for tg in TABLE_GROUPS:
    try:
        df = load_table_group(DATA_PATH, tg, split="train")
    except FileNotFoundError:
        # Say which groups were left out instead of skipping them silently.
        print(f"  {tg}: not found — skipped")
        continue
    n = df.height
    if n == 0:
        # A missing rate is undefined without rows; avoid dividing by zero.
        print(f"  {tg}: empty table — skipped")
        continue
    # null_count() returns a single-row frame; read it once as {column: count}.
    null_counts = df.null_count().row(0, named=True)
    missing_summary.extend(
        {"table_group": tg, "column": col, "missing_rate": count / n}
        for col, count in null_counts.items()
        if col != "case_id"
    )

# The explicit schema keeps the columns present even when nothing was loaded,
# so the filters below still work on an empty summary.
missing_df = pl.DataFrame(
    missing_summary,
    schema={
        "table_group": pl.String,
        "column": pl.String,
        "missing_rate": pl.Float64,
    },
)
print(f"Total feature columns across all tables: {missing_df.height}")
print(f"Columns with >50% missing: {missing_df.filter(pl.col('missing_rate') > 0.5).height}")
print(f"Columns with >90% missing: {missing_df.filter(pl.col('missing_rate') > 0.9).height}")
print(f"Columns with >98% missing: {missing_df.filter(pl.col('missing_rate') > 0.98).height}")

In [None]:
# Per-table-group roll-up of the column-level missing rates: mean and max rate,
# how many columns exceed 98% missing, and the number of columns considered.
table_miss = (
    missing_df.group_by("table_group")
    .agg([
        pl.col("missing_rate").mean().alias("avg_missing"),
        pl.col("missing_rate").max().alias("max_missing"),
        (pl.col("missing_rate") > 0.98).sum().alias("cols_gt_98pct"),
        pl.len().alias("n_cols"),
    ])
    .sort("avg_missing", descending=True)
    .to_pandas()
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: average missing rate per group. The y axis is inverted so the
# first (highest-average) group is drawn at the top; the dashed line marks 50%.
ax = axes[0]
ax.barh(table_miss["table_group"], table_miss["avg_missing"], color="#4C72B0", edgecolor="none")
ax.set_xlabel("Average Missing Rate")
ax.set_title("Average Missing Rate per Table Group")
ax.invert_yaxis()
ax.axvline(0.5, color="grey", linestyle="--", linewidth=0.8, alpha=0.7)

# Right panel: number of >98%-missing columns per group; groups that have at
# least one such column are drawn in the accent colour.
ax = axes[1]
colors = ["#DD8452" if v > 0 else "#4C72B0" for v in table_miss["cols_gt_98pct"]]
ax.barh(table_miss["table_group"], table_miss["cols_gt_98pct"], color=colors, edgecolor="none")
ax.set_xlabel("Number of Columns")
ax.set_title("Columns with >98% Missing per Table Group")
ax.invert_yaxis()

fig.tight_layout()
plt.show()

In [None]:
# List the columns that are more than 90% missing, worst first, and print the
# first 30 of them.
top_missing = (
    missing_df.filter(pl.col("missing_rate") > 0.90)
    .sort("missing_rate", descending=True)
)
print(f"Columns with >90% missing ({top_missing.height} total):")
print(top_missing.head(30))

## (c) Feature Drift Detection

Compare numeric feature distributions in **early weeks** (WEEK_NUM 0–30) vs **late weeks** (WEEK_NUM 61–91).
For each feature we measure:
- **Relative shift in mean**: `|mean_late - mean_early| / (std_overall + ε)`
- **Relative shift in std**: `|std_late - std_early| / (std_overall + ε)`
- **Shift in missing rate**: `|miss_late - miss_early|`

Features with large shifts are candidates for dropping to improve model stability.

In [None]:
# Attach WEEK_NUM to every static_0 row so the table can be split into time
# windows. Rows without a match in base get a null WEEK_NUM and therefore fall
# into neither window (a null comparison never passes a filter).
static_with_week = static_0.join(
    base.select("case_id", "WEEK_NUM"), on="case_id", how="left"
)

# Accept every numeric dtype rather than only the 32/64-bit ones, so features
# stored in narrow or unsigned integer widths are not silently left out.
numeric_cols = [
    name for name, dtype in static_0.schema.items()
    if name != "case_id" and dtype.is_numeric()
]
print(f"Numeric columns to analyze: {len(numeric_cols)}")

# Window boundaries (inclusive): early = WEEK_NUM <= 30, late = WEEK_NUM >= 61.
EARLY_MAX = 30
LATE_MIN = 61

early = static_with_week.filter(pl.col("WEEK_NUM") <= EARLY_MAX)
late = static_with_week.filter(pl.col("WEEK_NUM") >= LATE_MIN)
print(f"Early weeks (0-{EARLY_MAX}): {early.height:,} rows")
print(f"Late weeks ({LATE_MIN}-91): {late.height:,} rows")

In [None]:
# Guards the normalising division against a zero overall standard deviation.
EPS = 1e-9
drift_records = []

# The missing-rate shift divides by each window's row count, so fail with a
# clear message instead of a bare ZeroDivisionError when a window is empty.
if early.height == 0 or late.height == 0:
    raise ValueError(
        "Drift analysis needs rows in both windows "
        f"(early={early.height}, late={late.height})"
    )

# Overall standard deviation of every numeric column, computed in one pass and
# used as the normaliser for the mean and std shifts.
overall_stats = static_0.select([
    pl.col(c).cast(pl.Float64).std().alias(f"{c}__std") for c in numeric_cols
])

for col in numeric_cols:
    std_all = overall_stats[f"{col}__std"][0]
    if std_all is None:
        # No spread could be computed for this column, so it cannot be normalised.
        continue
    std_all = float(std_all)

    mean_e = early[col].cast(pl.Float64).mean()
    mean_l = late[col].cast(pl.Float64).mean()
    std_e = early[col].cast(pl.Float64).std()
    std_l = late[col].cast(pl.Float64).std()
    miss_e = early[col].null_count() / early.height
    miss_l = late[col].null_count() / late.height

    if mean_e is None or mean_l is None:
        # The column is entirely null in at least one window: nothing to compare.
        continue

    mean_shift = abs(mean_l - mean_e) / (std_all + EPS)
    # A window's std is None when it cannot be computed; treat that as 0.
    std_shift = abs((std_l or 0) - (std_e or 0)) / (std_all + EPS)
    miss_shift = abs(miss_l - miss_e)

    drift_records.append({
        "column": col,
        "mean_early": round(mean_e, 4),
        "mean_late": round(mean_l, 4),
        "mean_shift": round(mean_shift, 4),
        "std_shift": round(std_shift, 4),
        "miss_early": round(miss_e, 4),
        "miss_late": round(miss_l, 4),
        "miss_shift": round(miss_shift, 4),
    })

# The explicit schema keeps the columns present when no feature produced a
# record, so the sort (and the later threshold filters) still work.
DRIFT_SCHEMA = {
    "column": pl.String,
    "mean_early": pl.Float64,
    "mean_late": pl.Float64,
    "mean_shift": pl.Float64,
    "std_shift": pl.Float64,
    "miss_early": pl.Float64,
    "miss_late": pl.Float64,
    "miss_shift": pl.Float64,
}
drift_df = pl.DataFrame(drift_records, schema=DRIFT_SCHEMA).sort("mean_shift", descending=True)
print(f"Analyzed {drift_df.height} numeric features for drift")
drift_df.head(20)

In [None]:
# Three side-by-side rankings of the most drifted features, one per drift
# measure. drift_df is already ordered by mean shift, so its head is the first
# ranking; the other two are re-sorted on their own measure.
top_n = 25
top_drift = drift_df.head(top_n).to_pandas()
top_std = drift_df.sort("std_shift", descending=True).head(top_n).to_pandas()
top_miss = drift_df.sort("miss_shift", descending=True).head(top_n).to_pandas()

# (ranking, measure column, bar colour, x label, panel title) for each panel.
panels = [
    (top_drift, "mean_shift", "#DD8452", "Normalised Mean Shift",
     f"Top {top_n} Features by Mean Drift"),
    (top_std, "std_shift", "#55A868", "Normalised Std Shift",
     f"Top {top_n} Features by Std Drift"),
    (top_miss, "miss_shift", "#8172B2", "Δ Missing Rate",
     f"Top {top_n} Features by Missing-Rate Shift"),
]

fig, axes = plt.subplots(1, 3, figsize=(17, 6))
for ax, (ranking, measure, colour, xlabel, title) in zip(axes, panels):
    ax.barh(ranking["column"], ranking[measure], color=colour, edgecolor="none")
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # Largest shift at the top of each panel.
    ax.invert_yaxis()

fig.tight_layout()
plt.show()

In [None]:
# A feature is flagged when ANY of its three shifts exceeds its threshold.
MEAN_SHIFT_THRESHOLD = 0.3
STD_SHIFT_THRESHOLD = 0.3
MISS_SHIFT_THRESHOLD = 0.1

drift_flagged = drift_df.filter(
    (pl.col("mean_shift") > MEAN_SHIFT_THRESHOLD)
    | (pl.col("std_shift") > STD_SHIFT_THRESHOLD)
    | (pl.col("miss_shift") > MISS_SHIFT_THRESHOLD)
).sort("mean_shift", descending=True)

print(f"Features flagged for drift (any criterion): {drift_flagged.height}")

# Overlay early vs late histograms for the (up to) six most mean-shifted
# flagged features on a 2x3 grid.
top_6 = drift_flagged.head(6)["column"].to_list()
if top_6:
    n_plot = len(top_6)
    fig, axes = plt.subplots(2, 3, figsize=(15, 7))
    axes = axes.flatten()
    for i, col in enumerate(top_6):
        ax = axes[i]
        vals_e = early[col].drop_nulls().cast(pl.Float64).to_numpy()
        vals_l = late[col].drop_nulls().cast(pl.Float64).to_numpy()
        # Shared bin edges over the pooled 1st-99th percentile range, so both
        # histograms are comparable and extreme outliers do not stretch the axis.
        pooled = np.concatenate([vals_e, vals_l])
        lo, hi = np.nanpercentile(pooled, [1, 99])
        if not hi > lo:
            # Heavily concentrated feature: the percentiles coincide, which
            # would give zero-width bins. Fall back to the full value range.
            lo, hi = np.nanmin(pooled), np.nanmax(pooled)
        if not hi > lo:
            # Truly constant values: open a unit-wide window around them.
            lo, hi = lo - 0.5, hi + 0.5
        bins = np.linspace(lo, hi, 50)
        ax.hist(vals_e, bins=bins, alpha=0.5, density=True, label="early", color="#4C72B0")
        ax.hist(vals_l, bins=bins, alpha=0.5, density=True, label="late", color="#DD8452")
        ax.set_title(col, fontsize=9)
        ax.legend(fontsize=7)
        ax.tick_params(labelsize=7)
    # Hide the grid cells left over when fewer than six features are flagged.
    for j in range(n_plot, len(axes)):
        axes[j].set_visible(False)
    fig.suptitle("Distribution Comparison: Early vs Late Weeks (top drifted features)", fontsize=11)
    fig.tight_layout()
    plt.show()

## (d) Candidate Drift-Prone Features to Drop

Features are flagged if **any** of these hold:
- Normalised mean shift > 0.3
- Normalised std shift > 0.3
- Missing rate shift > 0.1

In [None]:
# static_0 columns that are more than 98% missing, from the missing-rate summary.
is_static_0_high_missing = (pl.col("table_group") == "static_0") & (
    pl.col("missing_rate") > 0.98
)
high_missing_cols = missing_df.filter(is_static_0_high_missing)["column"].to_list()

# Columns flagged by any of the drift thresholds.
drift_prone_cols = drift_flagged["column"].to_list()

# Union of both lists, de-duplicated and in alphabetical order.
candidates_to_drop = sorted({*drift_prone_cols, *high_missing_cols})

print(f"Drift-prone features (static_0): {len(drift_prone_cols)}")
print(f"High-missing features (>98%, static_0): {len(high_missing_cols)}")
print(f"Combined unique candidates to drop: {len(candidates_to_drop)}")
print()
print("Candidate features to drop:")

# Set views of the two lists for the per-column reason lookup.
drift_lookup = set(drift_prone_cols)
high_missing_lookup = set(high_missing_cols)
for col in candidates_to_drop:
    reasons = [
        label
        for label, applies in (
            ("drift", col in drift_lookup),
            (">98% missing", col in high_missing_lookup),
        )
        if applies
    ]
    print(f"  {col:45s} [{', '.join(reasons)}]")

In [None]:
# Print the candidate list as a ready-to-paste Python list literal named
# DRIFT_PRONE_FEATURES, one quoted column per line.
print("DRIFT_PRONE_FEATURES = [")
for col in candidates_to_drop:
    print(f'    "{col}",')
print("]")

---

# Feature Engineering — Date Columns & Domain Ratios

## Test `handle_dates`

Join base (with `date_decision`) to static_0 (which has date columns ending in `D`), then convert all dates to numeric features relative to the decision date.

In [None]:
# Left-join the raw static_0 columns onto base so date_decision and the
# static_0 date columns sit in the same frame.
merged = base.join(static_0, on="case_id", how="left")

# Columns selected by name only: those ending in "D" (other than date_decision),
# and those containing "year" that are not already in the first group.
date_d_cols = [c for c in merged.columns if c.endswith("D") and c != "date_decision"]
year_cols = [c for c in merged.columns if "year" in c.lower() and c not in date_d_cols and c != "date_decision"]

print(f"Columns ending in 'D' (excl. date_decision): {len(date_d_cols)}")
print(f"  Examples: {date_d_cols[:5]}")
print(f"Columns containing 'year': {len(year_cols)}")
if year_cols:
    print(f"  Examples: {year_cols[:5]}")
# Show raw values so they can be compared with the output of handle_dates.
print(f"\nSample date_decision values:\n{merged.select('date_decision').head(3)}")
if date_d_cols:
    print(f"\nSample '{date_d_cols[0]}' values (before):\n{merged.select(date_d_cols[0]).head(3)}")

In [None]:
# Run handle_dates on the merged frame and check its visible effects: the
# shape change and whether date_decision and MONTH are gone afterwards.
merged_dates = handle_dates(merged)

print(f"Shape before: {merged.shape}")
print(f"Shape after:  {merged_dates.shape}")
print(f"\n'date_decision' dropped: {'date_decision' not in merged_dates.columns}")
print(f"'MONTH' dropped:         {'MONTH' not in merged_dates.columns}")

# Inspect one surviving "D" column after the transform.
# NOTE(review): the printed labels assume handle_dates converts these columns
# to years before the decision date — confirm against src.features.
transformed_d_cols = [c for c in date_d_cols if c in merged_dates.columns]
if transformed_d_cols:
    sample_col = transformed_d_cols[0]
    print(f"\nSample '{sample_col}' values (after — years before decision):")
    print(merged_dates.select(sample_col).head(5))
    print(f"\n'{sample_col}' dtype after: {merged_dates[sample_col].dtype}")

# Same inspection for the first "year" column, if any was found.
if year_cols:
    sample_year = year_cols[0]
    print(f"\nSample '{sample_year}' values (after — delta from decision year):")
    print(merged_dates.select(sample_year).head(5))

## Test `create_domain_ratios`

Compute loan burden, disbursement, debt, and interest-rate ratios from static_0 columns.

In [None]:
# Columns the ratio features are expected to be built from; report which of
# them are present in (or missing from) the date-processed frame.
source_cols = ["price_1097A", "annuity_780A", "disbursedcredamount_1113A",
               "credamount_770A", "totaldebt_9A", "eir_270L"]
present = [c for c in source_cols if c in merged_dates.columns]
missing = [c for c in source_cols if c not in merged_dates.columns]
print(f"Source columns present: {present}")
if missing:
    print(f"Source columns missing: {missing}")

merged_ratios = create_domain_ratios(merged_dates)

# Ratio columns expected in the output. `new_cols` keeps only those actually
# created and is reused by the histogram cell below.
ratio_cols = ["loan_burden_ratio", "disbursed_credit_ratio",
              "debt_credit_ratio", "eir_credit_ratio"]
new_cols = [c for c in ratio_cols if c in merged_ratios.columns]
print(f"\nNew ratio columns created: {new_cols}")
print(f"Shape before ratios: {merged_dates.shape}")
print(f"Shape after ratios:  {merged_ratios.shape}")

if new_cols:
    print(f"\nSample ratio values:")
    print(merged_ratios.select(new_cols).head(10))

In [None]:
# One histogram per created ratio feature, restricted to the 1st-99th
# percentile range so extreme ratios do not flatten the plot.
if new_cols:
    n_plot = len(new_cols)
    fig, axes = plt.subplots(1, n_plot, figsize=(5 * n_plot, 4))
    # plt.subplots returns a bare Axes for a single panel; make it iterable.
    axes = np.atleast_1d(axes)
    for ax, col in zip(axes, new_cols):
        vals = merged_ratios[col].drop_nulls().to_numpy()
        bounds = np.nanpercentile(vals, [1, 99])
        lo = bounds[0]
        hi = bounds[1]
        in_range = (vals >= lo) & (vals <= hi)
        clipped = vals[in_range]
        ax.hist(clipped, bins=50, color="#4C72B0", edgecolor="none", alpha=0.8)
        ax.set_title(col, fontsize=10)
        ax.set_ylabel("Count")
    fig.suptitle("Domain Ratio Feature Distributions (1st–99th pctl)", fontsize=12)
    fig.tight_layout()
    plt.show()

---

# Feature Engineering — Depth 1 Aggregations

Load depth-1 tables, preprocess, aggregate by `case_id`, and join to the base table.

In [None]:
# Depth-1 table groups to load, preprocess and aggregate. The list is reused
# when the same pipeline is run on the test split.
DEPTH1_NAMES = [
    "applprev_1",
    "credit_bureau_a_1",
    "credit_bureau_b_1",
    "person_1",
    "tax_registry_a_1",
    "tax_registry_b_1",
    "tax_registry_c_1",
]

# Preprocessed train tables keyed by group name. Groups whose files are not
# found are reported and left out of the dict.
depth1_tables = {}
for name in DEPTH1_NAMES:
    try:
        df = load_table_group(DATA_PATH, name, split="train")
        df = preprocess_table(df)
        depth1_tables[name] = df
        print(f"  {name:30s} {str(df.shape):>20s}")
    except FileNotFoundError:
        print(f"  {name:30s} {'NOT FOUND — skipped':>20s}")

### Filter `credit_bureau_a_1` to closed contracts

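Closed contracts populate closed-specific columns (e.g. `dateofcredend_353D`, `credlmt_228A`).
The cell below takes the first of these indicator columns that is still present after preprocessing and keeps only the rows where it is **not null**; those rows are treated as closed contracts.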

In [None]:
# Candidate columns whose non-null value marks a row as a closed contract,
# in order of preference. Only the first one found in the table is used.
CLOSED_INDICATORS = [
    "dateofcredend_353D",
    "dateofcredstart_739D",
    "credlmt_228A",
    "contractst_964M",
]

if "credit_bureau_a_1" in depth1_tables:
    cb_a_1 = depth1_tables["credit_bureau_a_1"]
    # Preprocessing may have dropped some indicators; keep those still present.
    available = [c for c in CLOSED_INDICATORS if c in cb_a_1.columns]

    if available:
        # NOTE(review): the filter column depends on which indicators survived
        # preprocessing, so it can differ between runs or splits — confirm that
        # every fallback column really identifies closed contracts.
        filter_col = available[0]
        before = cb_a_1.height
        cb_a_1_closed = cb_a_1.filter(pl.col(filter_col).is_not_null())
        # Replace the stored table so the aggregation cell sees the filtered rows.
        depth1_tables["credit_bureau_a_1"] = cb_a_1_closed
        print(f"Filtered credit_bureau_a_1 on '{filter_col}' is_not_null:")
        print(f"  {before:,} → {cb_a_1_closed.height:,} rows")
    else:
        # No indicator available: keep the table unfiltered and say so.
        print(f"Warning: no closed-contract indicator found in columns.")
        print(f"  Searched for: {CLOSED_INDICATORS}")
        print(f"  Using all {cb_a_1.height:,} rows without filtering.")

### Aggregate each depth-1 table and join to base

In [None]:
# Aggregate every loaded depth-1 table with aggregate_depth1, keyed by group
# name, printing a banner with the input shape before each one.
depth1_agg = {}
for name, df in depth1_tables.items():
    print(f"\n{'─'*60}")
    print(f"  {name}  (input shape: {df.shape})")
    print(f"{'─'*60}")
    depth1_agg[name] = aggregate_depth1(df)

print(f"\n{'═'*60}")
print("Summary:")
for name, agg_df in depth1_agg.items():
    # The feature count is reported as the column count minus one
    # (presumably excluding the case_id key — confirm against aggregate_depth1).
    print(f"  {name:30s} → {agg_df.shape[1] - 1:>5,} features, {agg_df.height:>8,} rows")

In [None]:
# Left-join every depth-1 aggregate onto a copy of base, one table at a time,
# printing the running shape after each join.
depth1_merged = base.clone()
for name, agg_df in depth1_agg.items():
    depth1_merged = depth1_merged.join(agg_df, on="case_id", how="left")
    print(f"  + {name:30s} → {depth1_merged.shape}")

# Number of columns the joins added on top of base.
total_d1_feats = depth1_merged.shape[1] - base.shape[1]
print(f"\nBase columns:          {base.shape[1]}")
print(f"New depth-1 features:  {total_d1_feats}")
print(f"Final merged shape:    {depth1_merged.shape}")
print(f"Memory:                {depth1_merged.estimated_size('mb'):.1f} MB")

---

# Feature Engineering — Depth 2 Aggregations

Two-pass aggregation: first by `(case_id, num_group1)`, then by `case_id`.
Skip `credit_bureau_b_2` (very high missing rate).

In [None]:
# Depth-2 table groups to load, preprocess and aggregate. The list is reused
# when the same pipeline is run on the test split.
DEPTH2_NAMES = [
    "applprev_2",
    "person_2",
    "credit_bureau_a_2",
    # credit_bureau_b_2 skipped — very high missing rate
]

# Preprocessed train tables keyed by group name. Groups whose files are not
# found are reported and left out of the dict.
depth2_tables = {}
for name in DEPTH2_NAMES:
    try:
        df = load_table_group(DATA_PATH, name, split="train")
        df = preprocess_table(df)
        depth2_tables[name] = df
        print(f"  {name:30s} {str(df.shape):>20s}")
    except FileNotFoundError:
        print(f"  {name:30s} {'NOT FOUND — skipped':>20s}")

In [None]:
# Aggregate every loaded depth-2 table with aggregate_depth2, keyed by group
# name, printing a banner with the input shape before each one.
depth2_agg = {}
for name, df in depth2_tables.items():
    print(f"\n{'─'*60}")
    print(f"  {name}  (input shape: {df.shape})")
    print(f"{'─'*60}")
    depth2_agg[name] = aggregate_depth2(df)

print(f"\n{'═'*60}")
print("Summary:")
for name, agg_df in depth2_agg.items():
    # The feature count is reported as the column count minus one
    # (presumably excluding the case_id key — confirm against aggregate_depth2).
    print(f"  {name:30s} → {agg_df.shape[1] - 1:>5,} features, {agg_df.height:>8,} rows")

---

# Merge All Features (Depth 0 + 1 + 2)

Combine depth-0 (base, static_0, static_cb_0), depth-1 aggregations, and depth-2 aggregations into a single training DataFrame.

In [None]:
# Build the training frame: depth-0 tables first, then date handling and
# domain ratios, then the depth-1 and depth-2 aggregates. Every join is a left
# join on case_id, so the row set stays that of `base`.
# ── Depth-0 tables ──────────────────────────────────────────────
train = base.join(static_0_processed, on="case_id", how="left")

# static_cb_0 is optional: if its files are not found the step is skipped.
try:
    static_cb_0 = load_table_group(DATA_PATH, "static_cb_0", split="train")
    static_cb_0 = preprocess_table(static_cb_0)
    train = train.join(static_cb_0, on="case_id", how="left")
    print(f"+ static_cb_0          → {train.shape}")
except FileNotFoundError:
    print("static_cb_0 not found — skipped")

# Date handling and ratios run before the aggregates are joined, so they only
# see the depth-0 columns.
train = handle_dates(train)
train = create_domain_ratios(train)
print(f"Depth-0 (with dates & ratios): {train.shape}")

# ── Depth-1 aggregations ───────────────────────────────────────
for name, agg_df in depth1_agg.items():
    train = train.join(agg_df, on="case_id", how="left")
print(f"+ depth-1                    → {train.shape}")

# ── Depth-2 aggregations ───────────────────────────────────────
for name, agg_df in depth2_agg.items():
    train = train.join(agg_df, on="case_id", how="left")
print(f"+ depth-2                    → {train.shape}")

In [None]:
# Summarise the merged training frame.
info = get_table_info(train)
# Count every numeric dtype. A fixed Float32/Float64/Int32/Int64 allow-list
# would miss narrower or unsigned integer columns and under-report the total.
n_numeric = sum(1 for dtype in train.dtypes if dtype.is_numeric())
n_string = sum(
    1 for dtype in train.dtypes if dtype in (pl.String, pl.Utf8, pl.Categorical)
)

print(f"Final merged DataFrame")
print(f"  Shape:           {train.shape}")
print(f"  Memory:          {info['estimated_memory_mb']:.1f} MB")
print(f"  Numeric cols:    {n_numeric}")
print(f"  String cols:     {n_string}")
print(f"  >50% missing:    {len(info['columns_with_high_missing'])}")
print(f"\nDtype breakdown: {info['dtype_counts']}")

---

# Post-Merge Feature Filtering

1. Drop columns that are >95% correlated (keep the one with lower missing rate)
2. Collapse rare categories (>200 unique → keep top 20, rest to null)
3. Remove drift-prone features identified in EDA

In [None]:
# Apply the three post-merge filters in order, then report the resulting shape,
# memory and dtype breakdown.
print(f"Before filtering: {train.shape}")

train = drop_correlated_columns(train, threshold=0.95)
train = collapse_rare_categories(train, max_unique=200, keep_top=20)
# candidates_to_drop is the drift / high-missing list assembled in the EDA section.
train = remove_drift_features(train, candidates_to_drop)

print(f"\nAfter filtering:  {train.shape}")
info = get_table_info(train)
print(f"Memory:           {info['estimated_memory_mb']:.1f} MB")
print(f"Dtype breakdown:  {info['dtype_counts']}")

---

# Build Test Features & Save

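Rebuild the depth-0, depth-1 and depth-2 features on the test split with the same steps used for train,
then align the test columns with the filtered train set (columns that train dropped are discarded, columns absent from test are added as nulls) and save both as parquet.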

In [None]:
# Rebuild the feature frame for the test split with the same steps as train.
# Each `try` covers only the load, so a table that is absent is reported and
# skipped, while any other failure surfaces instead of being swallowed.
# ── Depth-0 ─────────────────────────────────────────────────────
test_base = load_table_group(DATA_PATH, "base", split="test")
test = test_base.clone()

for tg in ["static_0", "static_cb_0"]:
    try:
        t = load_table_group(DATA_PATH, tg, split="test")
    except FileNotFoundError:
        print(f"  {tg:30s} {'NOT FOUND — skipped':>20s}")
        continue
    t = preprocess_table(t)
    test = test.join(t, on="case_id", how="left")

test = handle_dates(test)
test = create_domain_ratios(test)
print(f"Test depth-0: {test.shape}")

# ── Depth-1 ─────────────────────────────────────────────────────
for name in DEPTH1_NAMES:
    try:
        t = load_table_group(DATA_PATH, name, split="test")
    except FileNotFoundError:
        print(f"  {name:30s} {'NOT FOUND — skipped':>20s}")
        continue
    t = preprocess_table(t)
    if name == "credit_bureau_a_1":
        # Same closed-contract filter as train: first indicator still present.
        # NOTE(review): preprocessing is data-dependent, so the column chosen
        # here may differ from the one used on train — confirm they match.
        avail = [c for c in CLOSED_INDICATORS if c in t.columns]
        if avail:
            t = t.filter(pl.col(avail[0]).is_not_null())
    test = test.join(aggregate_depth1(t), on="case_id", how="left")
print(f"Test + depth-1: {test.shape}")

# ── Depth-2 ─────────────────────────────────────────────────────
for name in DEPTH2_NAMES:
    try:
        t = load_table_group(DATA_PATH, name, split="test")
    except FileNotFoundError:
        print(f"  {name:30s} {'NOT FOUND — skipped':>20s}")
        continue
    t = preprocess_table(t)
    test = test.join(aggregate_depth2(t), on="case_id", how="left")
print(f"Test + depth-2: {test.shape}")

# ── Post-merge filtering ───────────────────────────────────────
# The correlation and drift filters are not re-run here; the alignment cell
# below restricts test to the columns that survived them on train.
# NOTE(review): categories are collapsed using the test split's own
# frequencies — confirm the kept categories agree with those kept on train.
test = collapse_rare_categories(test, max_unique=200, keep_top=20)

In [None]:
# Align test columns with filtered train (minus target)
train_feature_cols = [c for c in train.columns if c != "target"]
present = [c for c in train_feature_cols if c in test.columns]
missing_in_test = [c for c in train_feature_cols if c not in test.columns]

if missing_in_test:
    print(f"Adding {len(missing_in_test)} null columns missing from test")
    test = test.with_columns([
        pl.lit(None).cast(train[c].dtype).alias(c) for c in missing_in_test
    ])

test = test.select(train_feature_cols)
print(f"Train shape: {train.shape}")
print(f"Test  shape: {test.shape}")
print(f"Column match (excl. target): {list(test.columns) == train_feature_cols}")

In [None]:
from pathlib import Path

# Write the final train and test frames as parquet under <DATA_PATH>/processed.
out_dir = Path(DATA_PATH) / "processed"
# parents=True so a fresh checkout without the data directory does not fail here.
out_dir.mkdir(parents=True, exist_ok=True)

train.write_parquet(out_dir / "train_final.parquet")
test.write_parquet(out_dir / "test_final.parquet")

print(f"Saved to {out_dir.resolve()}")
print(f"  train_final.parquet  {train.shape}  ({train.estimated_size('mb'):.1f} MB)")
print(f"  test_final.parquet   {test.shape}  ({test.estimated_size('mb'):.1f} MB)")

---

# CatBoost Baseline with StratifiedGroupKFold

Train a CatBoost classifier using 5-fold CV where complete `WEEK_NUM` groups
stay together (no week is split across folds). CatBoost handles categorical
features natively — no encoding needed.

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score

# Columns that identify or label a row and must not be used as features.
META_COLS = {"case_id", "target", "WEEK_NUM"}
feature_cols = [c for c in train.columns if c not in META_COLS]
cat_cols = [c for c in feature_cols if train[c].dtype in (pl.String, pl.Utf8, pl.Categorical)]

print(f"Features:    {len(feature_cols)}")
print(f"  numeric:   {len(feature_cols) - len(cat_cols)}")
print(f"  categorical: {len(cat_cols)}")

# CatBoost requires categorical values to be strings or integers and rejects
# NaN/None, so nulls (including rare categories collapsed to null) become an
# explicit sentinel category. `train` itself is left untouched.
# Any frame scored with these models later needs the same replacement.
CAT_MISSING = "__MISSING__"
train_pd = train.with_columns(
    [pl.col(c).cast(pl.String).fill_null(CAT_MISSING) for c in cat_cols]
).to_pandas()
X = train_pd[feature_cols]
y = train_pd["target"].values
week_num = train_pd["WEEK_NUM"].values

# Stratified on the target, grouped by week so no week is split across folds.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Out-of-fold predictions (one slot per training row), per-fold metrics, and
# the fitted model of every fold.
oof_preds = np.zeros(len(X))
fold_results = []
cb_models = []

# Read the fold count from the splitter so the banner stays correct if
# n_splits is changed where `sgkf` is defined.
n_folds = sgkf.get_n_splits()

for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y, week_num)):
    print(f"\n{'═'*60}")
    print(f"  Fold {fold + 1} / {n_folds}   "
          f"(train {len(train_idx):,}  val {len(val_idx):,}  "
          f"val weeks {np.unique(week_num[val_idx]).tolist()[:6]}…)")
    print(f"{'═'*60}")

    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        l2_leaf_reg=3.0,
        random_seed=42 + fold,  # a different seed per fold
        eval_metric="AUC",
        cat_features=cat_cols,
        allow_writing_files=False,
    )

    # NOTE(review): the validation fold is also the early-stopping eval set, so
    # the fold scores below are measured on data used to pick the iteration.
    model.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        early_stopping_rounds=100,
        verbose=200,
    )

    # Probability of the positive class, written into this fold's OOF slots.
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred

    fold_auc = roc_auc_score(y_val, val_pred)
    fold_stab = gini_stability(week_num[val_idx], y_val, val_pred)

    fold_results.append({"fold": fold + 1, "auc": fold_auc, **fold_stab})
    cb_models.append(model)

    print(f"\n  AUC:            {fold_auc:.6f}")
    print(f"  Stability:      {fold_stab['stability_score']:.6f}")
    print(f"  Mean Gini:      {fold_stab['mean_gini']:.6f}")
    print(f"  Falling rate:   {fold_stab['falling_rate']:.6f}")
    print(f"  Std residuals:  {fold_stab['std_residuals']:.6f}")

In [None]:
# Score the out-of-fold predictions over the whole training set.
oof_auc = roc_auc_score(y, oof_preds)
oof_stab = gini_stability(week_num, y, oof_preds)

print(f"{'═'*60}")
print(f"  Overall OOF Results (CatBoost)")
print(f"{'═'*60}")
print(f"  AUC:            {oof_auc:.6f}")
print(f"  Stability:      {oof_stab['stability_score']:.6f}")
print(f"  Mean Gini:      {oof_stab['mean_gini']:.6f}")
print(f"  Falling rate:   {oof_stab['falling_rate']:.6f}")
print(f"  Std residuals:  {oof_stab['std_residuals']:.6f}")
print(f"\nPer-fold summary:")
for r in fold_results:
    print(f"  Fold {r['fold']}: AUC={r['auc']:.4f}  "
          f"Stability={r['stability_score']:.4f}  "
          f"Mean Gini={r['mean_gini']:.4f}")

# Rebuild the trend line from the reported slope. The intercept is chosen so
# the line passes through the point (mean week index, mean gini).
# NOTE(review): assumes `slope` was fitted against the 0..n-1 week index used
# here — confirm against gini_stability.
ginis = oof_stab["weekly_ginis"]
x = np.arange(len(ginis))
slope = oof_stab["slope"]
intercept = np.mean(ginis) - slope * np.mean(x)

# Weekly gini with its trend line and the mean gini as a horizontal reference.
fig, ax = plt.subplots(figsize=(13, 4))
ax.plot(x, ginis, "o-", color="#4C72B0", markersize=4, linewidth=1.2, label="weekly gini")
ax.plot(x, slope * x + intercept, "--", color="#DD8452", linewidth=1.5,
        label=f"trend (slope={slope:.5f})")
ax.axhline(oof_stab["mean_gini"], color="grey", linestyle=":", linewidth=0.8,
           label=f"mean gini = {oof_stab['mean_gini']:.4f}")
ax.set_xlabel("Week Index")
ax.set_ylabel("Gini (2·AUC − 1)")
ax.set_title(f"CatBoost OOF Weekly Gini  (stability = {oof_stab['stability_score']:.4f})")
ax.legend(fontsize=9)
fig.tight_layout()
plt.show()

In [None]:
import json
# Imported here as well so this cell does not depend on the save cell having run.
from pathlib import Path


def _json_default(obj):
    """Convert numpy scalars and arrays to plain Python values for json.dump.

    Raises TypeError for anything else, which is what json expects from a
    `default` hook that cannot handle an object.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


artifacts_dir = Path("..") / "artifacts"
artifacts_dir.mkdir(parents=True, exist_ok=True)

# OOF predictions
oof_df = pl.DataFrame({
    "case_id": train["case_id"],
    "WEEK_NUM": train["WEEK_NUM"],
    "target": train["target"],
    "oof_score_catboost": oof_preds,
})
oof_df.write_parquet(artifacts_dir / "catboost_oof.parquet")

# Fold models
for i, m in enumerate(cb_models):
    m.save_model(str(artifacts_dir / f"catboost_fold_{i}.cbm"))

# Scores. The value types returned by gini_stability are not visible here, so
# the default hook guards against numpy scalars/arrays, which json rejects.
with open(artifacts_dir / "catboost_fold_scores.json", "w", encoding="utf-8") as f:
    json.dump({"fold_results": fold_results, "oof_auc": oof_auc,
               "oof_stability": oof_stab}, f, indent=2, default=_json_default)

# Report the files actually written rather than a hard-coded model count.
n_models = len(cb_models)
print(f"Artifacts saved to {artifacts_dir.resolve()}/")
print(f"  catboost_oof.parquet          ({oof_df.shape})")
print(f"  catboost_fold_0..{n_models - 1}.cbm        ({n_models} models)")
print(f"  catboost_fold_scores.json")