In [27]:
# Install exact versions (force reinstall to avoid conflicts)

#!pip install --no-cache-dir --force-reinstall \
#numpy==1.26.4 \
#scipy==1.11.4 \
#pandas==2.1.4 \
#scikit-learn==1.3.2 \
#xgboost==2.0.3 \
#lightgbm==4.6.0 \
#catboost==1.2.3 \
#torch==2.6.0 \
#torchvision==0.21.0 \
#torchaudio==2.6.0 \
#joblib==1.3.2 \
#tqdm==4.66.1 \
#pyyaml==6.0.1 \
#cloudpickle==3.0.0


In [28]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, balanced_accuracy_score
from tqdm.auto import tqdm

In [29]:
# Locations of the competition CSVs on Kaggle and the file the trained model is saved to.
TRAIN_PATH = "/kaggle/input/datasets/yassinechelly4/dataoverflow/train.csv"
TEST_PATH  = "/kaggle/input/datasets/yassinechelly4/dataoverflow/test.csv"
MODEL_PATH = "catboost_model.joblib"  # written by train_and_save(), read by load_model()
SEED = 42  # shared seed: also passed to train_test_split and CatBoost in train_and_save()

import numpy as np
import random
# Seed NumPy's legacy global RNG and the stdlib RNG so repeated runs are comparable.
np.random.seed(SEED)
random.seed(SEED)

## Preprocess function

In [30]:
def preprocess(df):
    """Convert a raw policy frame into the numeric feature frame the model consumes.

    The input is copied, so the caller's frame is left untouched. ``User_ID`` is
    retained (``predict`` needs it for the output) and, when present, so is the
    ``Purchased_Coverage_Bundle`` target column.

    Pipeline: drop low-signal columns, fill missing values, derive ratio / log /
    cyclical features, encode categoricals (ordinal map, smoothed target
    encoding, one-hot) and finally rescale the numeric columns.

    Args:
        df: Raw DataFrame as read from the train or test CSV.

    Returns:
        A new pandas DataFrame with engineered, encoded and scaled features.

    Notes:
        * NOTE(review): both scalers and the region target encoding are fitted
          on whatever frame is passed in, so train and test frames are scaled
          with different statistics. Persisting the fitted state at training
          time would remove that skew.
        * NOTE(review): the target encoding uses the labels of the same rows it
          encodes, so validation scores computed downstream are optimistic.
    """
    # Implement any preprocessing steps required for your model here.
    # Return a Pandas DataFrame of the data
    #
    # Note: Don't drop the 'User_ID' column here.
    # It will be used in the predict function to return the final predictions.

    df = df.copy()

    # ── 1. DROP LOW-SIGNAL COLUMNS (keep User_ID) ──────────────────────────
    DROP_COLS = [
        "Employer_ID",
        "Previous_Claims_Filed",
        "Existing_Policyholder",
        "Underwriting_Processing_Days",
        "Infant_Dependents",
        "Policy_Start_Day",
    ]
    df = df.drop(columns=[c for c in DROP_COLS if c in df.columns])

    # ── 2. MISSING VALUE HANDLING ───────────────────────────────────────────
    df["Child_Dependents"]  = df["Child_Dependents"].fillna(0)
    # Record broker presence before the missing IDs are replaced by the -1 sentinel.
    df["Has_Broker"]        = df["Broker_ID"].notna().astype(int)
    df["Broker_ID"]         = df["Broker_ID"].fillna(-1)
    df["Region_Code"]       = df["Region_Code"].fillna("Unknown")
    df["Deductible_Tier"]   = df["Deductible_Tier"].fillna("Unknown")
    df["Acquisition_Channel"] = df["Acquisition_Channel"].fillna("Unknown")

    # ── 3. FEATURE ENGINEERING ─────────────────────────────────────────────
    # The "+ 1" in the denominators avoids division by zero.
    df["Total_Dependents"]      = df["Adult_Dependents"] + df["Child_Dependents"]
    df["Income_Per_Dependent"]  = df["Estimated_Annual_Income"] / (df["Total_Dependents"] + 1)
    df["Grace_To_Duration_Ratio"] = df["Grace_Period_Extensions"] / (df["Previous_Policy_Duration_Months"] + 1)
    df["Log_Income"]            = np.log1p(df["Estimated_Annual_Income"])
    df["Log_Days_Since_Quote"]  = np.log1p(df["Days_Since_Quote"])

    # Cyclical month encoding
    month_order = ["January","February","March","April","May","June",
                   "July","August","September","October","November","December"]
    # NOTE(review): names not in month_order (or missing) get code -1, i.e.
    # Month_Num 0, which lands on the same sin/cos point as December (12).
    df["Month_Num"] = pd.Categorical(df["Policy_Start_Month"], categories=month_order, ordered=True).codes + 1
    df["Month_Sin"] = np.sin(2 * np.pi * df["Month_Num"] / 12)
    df["Month_Cos"] = np.cos(2 * np.pi * df["Month_Num"] / 12)
    df = df.drop(columns=["Policy_Start_Month", "Month_Num"])

    # ── 4. ENCODING ─────────────────────────────────────────────────────────
    # Ordinal: Deductible_Tier (labels outside this map become NaN)
    deductible_map = {
        "Tier_1_High_Ded": 3,
        "Tier_2_Mid_Ded":  2,
        "Tier_3_Low_Ded":  1,
        "Tier_4_Zero_Ded": 0,
        "Unknown":        -1,
    }
    df["Deductible_Tier"] = df["Deductible_Tier"].map(deductible_map)

    # Target encoding: Region_Code (high cardinality — 166 unique values)
    if "Purchased_Coverage_Bundle" in df.columns:
        # Smoothed mean: regions with few rows are pulled towards the global mean.
        global_mean = df["Purchased_Coverage_Bundle"].mean()
        smoothing   = 10
        stats = df.groupby("Region_Code")["Purchased_Coverage_Bundle"].agg(["mean", "count"])
        stats["encoded"] = (
            (stats["mean"] * stats["count"] + global_mean * smoothing) /
            (stats["count"] + smoothing)
        )
        df["Region_Code_Encoded"] = df["Region_Code"].map(stats["encoded"]).fillna(global_mean)
    else:
        # No labels available (test set): constant placeholder. The training
        # region map is not applied anywhere in this file, so the feature
        # carries no information at inference time.
        df["Region_Code_Encoded"] = 0.0
    df = df.drop(columns=["Region_Code"])

    # One-hot encoding: low-cardinality categoricals.
    # Only encode the columns that are actually present, mirroring the
    # tolerance of the drop/scaling steps; predict() re-aligns the resulting
    # columns to the training feature set.
    OHE_COLS = ["Broker_Agency_Type", "Acquisition_Channel", "Payment_Schedule", "Employment_Status"]
    ohe_present = [c for c in OHE_COLS if c in df.columns]
    if ohe_present:
        df = pd.get_dummies(df, columns=ohe_present, drop_first=False, dtype=int)

    # The scalers cannot be fitted on zero rows or on an empty column list.
    has_rows = len(df) > 0

    # ── 5. STANDARDISATION (StandardScaler — continuous/skewed features) ────
    STD_COLS = [c for c in [
        "Estimated_Annual_Income", "Log_Income", "Log_Days_Since_Quote",
        "Income_Per_Dependent", "Grace_To_Duration_Ratio", "Days_Since_Quote",
        "Previous_Policy_Duration_Months", "Policy_Start_Year",
        "Policy_Start_Week", "Broker_ID", "Region_Code_Encoded",
    ] if c in df.columns]

    if STD_COLS and has_rows:
        std_scaler = StandardScaler()
        df[STD_COLS] = std_scaler.fit_transform(df[STD_COLS])

    # ── 6. NORMALISATION (MinMaxScaler — counts/bounded features) ───────────
    MM_COLS = [c for c in [
        "Adult_Dependents", "Child_Dependents", "Total_Dependents",
        "Grace_Period_Extensions", "Years_Without_Claims",
        "Policy_Amendments_Count", "Vehicles_on_Policy",
        "Custom_Riders_Requested", "Deductible_Tier",
        "Month_Sin", "Month_Cos",
    ] if c in df.columns]

    if MM_COLS and has_rows:
        mm_scaler = MinMaxScaler()
        df[MM_COLS] = mm_scaler.fit_transform(df[MM_COLS])

    return df

## load model

In [31]:
def load_model():
    """Load the trained model that was serialised to ``MODEL_PATH``.

    Returns:
        The object stored in the joblib file.
    """
    # ------------------ MODEL LOADING LOGIC ------------------
    # joblib files are pickle-based: only load artefacts produced by this notebook.
    loaded = joblib.load(MODEL_PATH)
    # ------------------ END MODEL LOADING LOGIC ------------------
    return loaded


## Model prediction

In [32]:
def predict(df, model):
    """Predict the coverage bundle for every row of an already-preprocessed frame.

    Args:
        df: Output of ``preprocess``; must still contain ``User_ID``. A
            ``Purchased_Coverage_Bundle`` column, if present, is ignored.
        model: Fitted estimator exposing ``predict``. The training feature
            names are read from ``feature_names_`` (CatBoost) or, failing
            that, ``feature_names_in_`` (scikit-learn). When neither is
            available the columns are passed through unchanged.

    Returns:
        DataFrame with the columns ``User_ID`` and ``Purchased_Coverage_Bundle``,
        one row per input row, in input order.
    """
    predictions = None
    # ------------------ PREDICTION LOGIC ------------------
    # df has already been passed through preprocess().
    # User_ID is retained in df for the final output.

    user_ids = df["User_ID"]

    # Remove the identifier, and the target if it is present (e.g. when
    # evaluating on labelled data), so only feature columns remain.
    X = df.drop(columns=["User_ID", "Purchased_Coverage_Bundle"], errors="ignore")

    # Align columns to exactly match the training feature set
    # (adds any missing OHE columns as 0, drops any extras).
    train_features = getattr(model, "feature_names_", None)
    if train_features is None:
        train_features = getattr(model, "feature_names_in_", None)
    if train_features is not None:
        X = X.reindex(columns=list(train_features), fill_value=0)

    # Some models return an (n, 1) array; flatten to one label per row.
    preds = np.asarray(model.predict(X)).ravel()

    predictions = pd.DataFrame({
        "User_ID":                   user_ids.values,
        "Purchased_Coverage_Bundle": preds,
    })
    # ------------------ END PREDICTION LOGIC ------------------
    return predictions

In [33]:
import time
import pickle


def run(df) -> tuple[float, float, float]:
    """Run preprocessing, model loading and prediction on ``df`` and score the result.

    Only the ``predict`` call is timed; preprocessing and model loading are
    excluded from the reported duration.

    Returns:
        ``(model size in MB, macro F1, prediction time in seconds)``. The macro
        F1 is 0.0 when ``df`` has no ``Purchased_Coverage_Bundle`` column.
    """
    target_col = "Purchased_Coverage_Bundle"

    features = preprocess(df)

    # Labels are read from the raw frame, since predict() discards the target column.
    labels = df[target_col].values if target_col in df.columns else None

    fitted = load_model()
    size_mb = get_model_size(fitted)

    t0 = time.perf_counter()
    output = predict(features, fitted)  # NOTE: Don't call the `preprocess` function here.
    elapsed = time.perf_counter() - t0

    return size_mb, get_model_accuracy(output, labels), elapsed


def get_model_size(model) -> float:
    """Pickle ``model`` in memory and report the payload size in megabytes (MiB)."""
    bytes_per_mb = 1024 * 1024
    payload = pickle.dumps(model)
    return len(payload) / bytes_per_mb


def get_model_accuracy(predictions, true_labels=None) -> float:
    """Score predictions with the macro-averaged F1.

    Despite the function name, the value returned is a macro F1, not an accuracy.

    Args:
        predictions: DataFrame with a ``Purchased_Coverage_Bundle`` column.
        true_labels: Ground-truth labels aligned with ``predictions``, or None.

    Returns:
        The macro F1, or 0.0 when no labels were supplied.
    """
    from sklearn.metrics import f1_score

    if true_labels is not None:
        y_hat = predictions["Purchased_Coverage_Bundle"].values
        return f1_score(true_labels, y_hat, average="macro")
    return 0.0


def compute_final_score(size_mb: float, macro_f1: float, latency_s: float) -> float:
    """
    Replicates the hackathon scoring formula:
        final_score = Macro F1
                      × max(0.5, 1 - size_mb / 200)   # Size Penalty
                      × max(0.5, 1 - latency_s / 10)   # Latency Penalty

    Each penalty shrinks linearly with the resource used and never drops
    below 0.5.
    """
    floor = 0.5

    def _penalty(used, budget):
        # Linear decay from 1 towards 0 over `budget`, clipped at the floor.
        return max(floor, 1 - used / budget)

    return macro_f1 * _penalty(size_mb, 200) * _penalty(latency_s, 10)


## Train & Save Model

In [34]:
## ── TRAINING HELPER — CatBoost with GridSearchCV & Progress Tracking ──

def train_and_save():
    """Tune a CatBoost multiclass classifier with a small grid search and save it.

    Reads ``TRAIN_PATH``, runs ``preprocess``, holds out a stratified 15 %
    validation split, grid-searches depth / learning rate with 3-fold CV
    scored by macro F1, prints validation metrics for the refitted best
    estimator and writes it to ``MODEL_PATH`` with joblib.

    Returns:
        The best estimator found by the grid search (refitted on the training split).

    Notes:
        NOTE(review): ``preprocess`` fits its target encoding and scalers on the
        full training frame before the split, so the validation metrics printed
        here are optimistic.
    """
    from sklearn.metrics import accuracy_score, classification_report

    print('Loading data...')
    train_raw = pd.read_csv(TRAIN_PATH)
    train_processed = preprocess(train_raw)
    TARGET = 'Purchased_Coverage_Bundle'
    X = train_processed.drop(columns=['User_ID', TARGET], errors='ignore')
    y = train_processed[TARGET]

    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.15, random_state=SEED, stratify=y)

    print('Tuning CatBoost with GridSearchCV...')
    param_grid = {
        'depth': [4, 6],
        'learning_rate': [0.05, 0.1],
        'iterations': [100]
    }

    task_type = 'GPU'

    # verbose=100 makes CatBoost log its progress every 100 iterations
    base_model = CatBoostClassifier(
        loss_function='MultiClass',
        random_seed=SEED,
        task_type=task_type,
        verbose=100,
        allow_writing_files=False
    )

    # With n_jobs=-1 every parallel worker would train on the same GPU at once
    # and compete for its memory, so the search is run serially on GPU.
    search_jobs = 1 if task_type == 'GPU' else -1

    # verbose=3 in GridSearchCV provides detailed progress tracking for each fold/parameter combo
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=3,
        scoring='f1_macro',
        n_jobs=search_jobs,
        verbose=3
    )

    # Single-step progress bar around the whole search; per-fold progress
    # comes from GridSearchCV's own verbose output.
    with tqdm(total=1, desc="Overall Grid Search") as pbar:
        grid_search.fit(X_tr, y_tr)
        pbar.update(1)

    print(f'Best params: {grid_search.best_params_}')
    model = grid_search.best_estimator_

    y_pred = model.predict(X_val)
    print(f'Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}')
    print(classification_report(y_val, y_pred, zero_division=0))

    joblib.dump(model, MODEL_PATH)
    print(f'Model saved to {MODEL_PATH}')
    return model

# Train when this cell runs; train_and_save() also persists the fitted model to MODEL_PATH.
trained_model = train_and_save()


Loading training data ...
Preprocessing ...
Training Logistic Regression ...

Validation accuracy          : 0.5002
Validation balanced accuracy : 0.5318

Classification Report:
              precision    recall  f1-score   support

           0       0.08      0.77      0.14       123
           1       0.31      0.29      0.30       244
           2       0.87      0.51      0.65      5421
           3       0.23      0.51      0.32       724
           4       0.52      0.46      0.49      2094
           5       0.21      0.76      0.33        72
           6       0.16      0.54      0.24       108
           7       0.41      0.47      0.44       343
           8       0.06      1.00      0.12         1
           9       0.00      0.00      0.00         1

    accuracy                           0.50      9131
   macro avg       0.29      0.53      0.30      9131
weighted avg       0.68      0.50      0.55      9131


Model saved → logistic_model.pkl


## ── EVALUATE + SUBMIT with Progress Tracking ─────────────────────────────────

# Step 1: Evaluate the pipeline on training data.
# The stages that run() performs are executed one by one here so that each can
# advance the progress bar. The data scored is the data the model was trained
# on, so the figure is an optimistic sanity check, not a generalisation estimate.
print("=" * 50)
print("  Running full pipeline evaluation on train set")
print("=" * 50)

print("Reading training data...")
train_raw = pd.read_csv(TRAIN_PATH)

with tqdm(total=3, desc="Evaluation Progress") as pbar:
    # Using a progress bar for the three main steps in evaluation
    pbar.set_description("Preprocessing data")
    df_processed = preprocess(train_raw)
    pbar.update(1)

    pbar.set_description("Loading model")
    model = load_model()
    pbar.update(1)

    pbar.set_description("Generating predictions")
    size = get_model_size(model)
    # Only the predict() call is timed, matching run().
    start = time.perf_counter()
    predictions = predict(df_processed, model)
    duration = time.perf_counter() - start

    true_labels = train_raw["Purchased_Coverage_Bundle"].values
    # get_model_accuracy() returns the macro F1, not an accuracy.
    accuracy = get_model_accuracy(predictions, true_labels)
    pbar.update(1)

print(f"\n  Model size  : {size:.4f} MB")
print(f"  Macro F1    : {accuracy:.4f}")
print(f"  Pred time   : {duration:.4f} seconds")
print("=" * 50)

# Step 2: Generate predictions on test set & save submission
print("\nGenerating test predictions ...")
with tqdm(total=3, desc="Submission Generation") as pbar:
    pbar.set_description("Reading test data")
    test_raw = pd.read_csv(TEST_PATH)
    pbar.update(1)

    pbar.set_description("Preprocessing test data")
    test_processed = preprocess(test_raw)
    pbar.update(1)

    pbar.set_description("Predicting & Saving")
    model = load_model()
    submission = predict(test_processed, model)
    submission.to_csv("submission.csv", index=False)
    pbar.update(1)

print(f"Submission saved → submission.csv  ({len(submission):,} rows)")
print("\nSample predictions:")
print(submission.head(10).to_string(index=False))


In [35]:
## ── EVALUATE + SUBMIT ────────────────────────────────────────────────────────

# Step 1: Evaluate run() on training data.
# The model was trained on this data, so the figure is an optimistic sanity
# check, not a generalisation estimate.
print("=" * 50)
print("  Running full pipeline evaluation on train set")
print("=" * 50)

train_raw        = pd.read_csv(TRAIN_PATH)
# run() returns (size in MB, macro F1, prediction seconds); the second value
# is a macro F1 even though it is stored under the name `accuracy`.
size, accuracy, duration = run(train_raw)

print(f"\n  Model size  : {size:.4f} MB")
print(f"  Macro F1    : {accuracy:.4f}")
print(f"  Pred time   : {duration:.4f} seconds")
print("=" * 50)

# Step 2: Generate predictions on test set & save submission
print("\nGenerating test predictions ...")
test_raw       = pd.read_csv(TEST_PATH)
test_processed = preprocess(test_raw)
model          = load_model()
submission     = predict(test_processed, model)
submission.to_csv("submission.csv", index=False)

print(f"Submission saved → submission.csv  ({len(submission):,} rows)")
print("\nSample predictions:")
print(submission.head(10).to_string(index=False))


  Running full pipeline evaluation on train set

  Model size  : 0.0049 MB
  Accuracy    : 0.3614  (balanced)
  Pred time   : 0.0538 seconds

Generating test predictions ...
Submission saved → submission.csv  (15,218 rows)

Sample predictions:
   User_ID  Purchased_Coverage_Bundle
USR_060868                          2
USR_060869                          2
USR_060870                          2
USR_060871                          2
USR_060872                          2
USR_060873                          4
USR_060874                          3
USR_060875                          6
USR_060876                          3
USR_060877                          2
