## 0 - Import Modules and Data

In [89]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV


import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC

from scipy.stats import zscore

from imblearn.over_sampling import SMOTE

import os
import joblib


from catboost import CatBoostClassifier
import optuna
from optuna.integration import CatBoostPruningCallback
from sklearn.metrics import f1_score

import warnings
from optuna.exceptions import ExperimentalWarning

warnings.filterwarnings("ignore", category=ExperimentalWarning)

# Plotting style
ftsize_title = 20
ftsize_axis = 16
sns.set_style('whitegrid')
sns.set_palette("viridis")

In [90]:
def performance_display(y_test, y_pred, title):
    """Function that displays a beautiful confusion matrix

    Args:
        y_test (pd.DataFrame): True label values of testing set
        y_pred (pd.DataFrame): Predicted label values of testing set
    """
    # Compute confusion matrix
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted', fontsize=ftsize_axis)
    plt.ylabel('Actual', fontsize=ftsize_axis)
    # Invert y axis
    plt.gca().invert_yaxis()
    title = 'Confusion Matrix - ' + title
    plt.title(title, fontsize=ftsize_title, fontweight='bold')
    plt.show()
    
    # Print classification report
    report = classification_report(y_true=y_test, y_pred=y_pred)
    print('Classification Report:')
    print(report)

In [91]:
def create_submission_file(X_test, filename="submission", model_name="rf.pkl"):
    # Load trained model
    model = joblib.load(f"models/{model_name}")
    print(f"Model loaded from models/{model_name}")

    # Predict TARGET values
    y_pred = model.predict(X_test)

    # Create submission DataFrame
    submission_df = pd.DataFrame({
        "ID": X_test['ID'],
        "TARGET": y_pred.astype(bool)  # Convert to True/False
    })

    # Save to CSV
    filename = f"submissions/{filename}.csv"
    submission_df.to_csv(filename, index=False)
    print(f"Submission file saved as {filename}")

In [92]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")


# Step 1: Load original training and test data
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
df_extra = pd.read_csv("data/extra.csv")

df_train['FROM_EXTRA'] = 0
df_extra['FROM_EXTRA'] = 1

df_test['FROM_EXTRA'] = 0

df_train_full = pd.concat([df_train, df_extra], axis=0).reset_index(drop=True)

X = df_train_full.drop('TARGET', axis=1)
y = df_train_full['TARGET']


print(f"Combined X shape: {X.shape}")
print(f"Combined y shape: {y.shape}")
print(y.value_counts())

Combined X shape: (370132, 325)
Combined y shape: (370132,)
TARGET
False    337115
True      33017
Name: count, dtype: int64


In [93]:
# Compute class weights
pos_weight = (len(y) - y.sum()) / y.sum()
class_weights = [1, pos_weight]

print("Using class weights:", class_weights)

Using class weights: [1, np.float64(10.210346185298482)]


In [94]:
# Drop ID from training (keep it only for submission)
X_final = X.drop("ID", axis=1)
y_final = y
X_test_final = df_test.drop("ID", axis=1)

In [95]:
X_train, X_val, y_train, y_val = train_test_split(
    X_final, 
    y_final,
    test_size=0.2,
    stratify=y_final,
    random_state=42
)

print(X_train.shape, X_val.shape)


(296105, 324) (74027, 324)


In [96]:
def objective(trial):

    # params = {
    #     "depth": trial.suggest_int("depth", 8, 12),
    #     "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
    #     "iterations": trial.suggest_int("iterations", 1000, 5000),
    #     "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 20, log=True),
    #     "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 2, log=True),
    #     "subsample": trial.suggest_float("subsample", 0.7, 1.0),
    #     "rsm": trial.suggest_float("rsm", 0.7, 1.0),
    #     "class_weights": [1, trial.suggest_int("pos_weight", 5, 20)],  # optimize imbalance
    #     "loss_function": "Logloss",
    #     "eval_metric": "F1",
    #     "random_state": 42,
    #     "verbose": False
    # }

    params = {
        "depth": trial.suggest_int("depth", 6, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
        "iterations": trial.suggest_int("iterations", 1000, 6000),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 30, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 2, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "rsm": trial.suggest_float("rsm", 0.5, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),  # binning granularity
        # "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
        "sampling_frequency": trial.suggest_categorical("sampling_frequency", ["PerTree", "PerTreeLevel"]),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 10),  # for categorical features
        "class_weights": [1, trial.suggest_int("pos_weight", 5, 20)],
        "loss_function": "Logloss",
        "eval_metric": "F1",
        "random_state": 42,
        "verbose": False
    }

    model = CatBoostClassifier(**params)

    # pruning callback cuts bad trials early
    pruning_callback = CatBoostPruningCallback(trial, "F1")

    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=60,
        verbose=False,
        callbacks=[pruning_callback]
    )

    # Get predicted probabilities
    probs = model.predict_proba(X_val)[:, 1]

    # Sweep thresholds from 0.1 to 0.9
    thresholds = np.linspace(0.1, 0.9, 81)
    f1s = [f1_score(y_val, probs > t) for t in thresholds]

    # Use the best F1 score
    score = max(f1s)

    # Save the best threshold for this trial
    best_threshold = thresholds[np.argmax(f1s)]
    trial.set_user_attr("best_threshold", best_threshold)

    return score


In [97]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=300, show_progress_bar=True)

print("Best F1:", study.best_value)
print("Best Params:", study.best_params)
best_threshold = study.best_trial.user_attrs["best_threshold"]
print(f"Best threshold: {best_threshold:.3f}")

[I 2025-11-04 18:33:10,951] A new study created in memory with name: no-name-08884a01-27b2-40af-bcc3-6af14bfa21f3


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-11-04 18:33:40,039] Trial 0 finished with value: 0.41896972495217993 and parameters: {'depth': 8, 'learning_rate': 0.09246008011205867, 'iterations': 3953, 'l2_leaf_reg': 7.397220147476662, 'bagging_temperature': 0.05735666710836515, 'subsample': 0.8601843632598398, 'rsm': 0.8180459393178633, 'border_count': 230, 'sampling_frequency': 'PerTreeLevel', 'one_hot_max_size': 4, 'pos_weight': 9}. Best is trial 0 with value: 0.41896972495217993.
[I 2025-11-04 18:33:57,733] Trial 1 finished with value: 0.40376497432972047 and parameters: {'depth': 12, 'learning_rate': 0.19053303745145242, 'iterations': 1907, 'l2_leaf_reg': 3.1219562082146575, 'bagging_temperature': 0.057686698851045014, 'subsample': 0.8929240962808016, 'rsm': 0.7143114131433703, 'border_count': 117, 'sampling_frequency': 'PerTree', 'one_hot_max_size': 6, 'pos_weight': 16}. Best is trial 0 with value: 0.41896972495217993.
[I 2025-11-04 18:34:20,629] Trial 2 finished with value: 0.40998521551233935 and parameters: {'dept

Best Params: {'depth': 8, 'learning_rate': 0.0891298975097704, 'iterations': 2113, 'l2_leaf_reg': 15.020144260249884, 'bagging_temperature': 0.181935508142554, 'subsample': 0.7286665619626174, 'rsm': 0.7086792468056278, 'pos_weight': 6}


Best F1: 0.4205530817904851
Best Params: {'depth': 8, 'learning_rate': 0.027243341395631743, 'iterations': 1621, 'l2_leaf_reg': 11.464256887807908, 'bagging_temperature': 0.10490277228464714, 'subsample': 0.8090584457549639, 'rsm': 0.8224343572178422, 'pos_weight': 7}
Best threshold: 0.640

In [98]:
best_params = study.best_params.copy()

# Convert pos_weight to class_weights
pos_weight = best_params.pop("pos_weight")
best_params["class_weights"] = [1, pos_weight]

# Add other fixed parameters
best_params.update({
    "loss_function": "Logloss",
    "eval_metric": "F1",
    "verbose": False,
    "random_state": 42
})

import json

# Save best parameters to a JSON file
with open("parameters/best_catboost_params_v2.json", "w") as f:
    json.dump(best_params, f, indent=2)

print("Saved best parameters to best_catboost_params.json")

Saved best parameters to best_catboost_params.json


In [99]:
# Load best parameters from JSON
with open("parameters/best_catboost_params_v2.json", "r") as f:
    best_params = json.load(f)

# Train final model using loaded parameters
final_model = CatBoostClassifier(**best_params)
final_model.fit(
    X_final,
    y_final,
    early_stopping_rounds=60,
    verbose=False
)

<catboost.core.CatBoostClassifier at 0x7390cd83e5a0>

In [100]:
joblib.dump(final_model, "models/catboost_tuned_V4.pkl")
print("Model saved as catboost_tuned_V4.pkl")

Model saved as catboost_tuned_V4.pkl


In [101]:
test_probs = final_model.predict_proba(X_test_final)[:, 1]
test_preds = (test_probs > best_threshold).astype(bool)

submission = pd.DataFrame({
    "ID": df_test["ID"],
    "TARGET": test_preds
})

submission.to_csv("submissions/catboost_tuned_V4.csv", index=False)
print("Saved as catboost_tuned_V4.csv")
print(f"Saved catboost_tuned_V4.csv using threshold {best_threshold:.3f}")

Saved as catboost_tuned_V4.csv
Saved catboost_tuned_V4.csv using threshold 0.850
