In [11]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from catboost import CatBoostClassifier

In [12]:
sys.path.append(os.path.abspath("../src"))
from preprocessing import add_time_features, add_distance_features, run_preproc

DATA_PATH = os.path.abspath("../train/train.csv")
RANDOM_STATE = 42

In [13]:
target_col = "target"
categorical_cols = ["gender", "merch", "cat_id", "one_city", "us_state", "jobs"]
drop_cols = ["name_1", "name_2", "street", "post_code"]
n_cats = 50

raw = pd.read_csv(DATA_PATH)
y = raw[target_col].astype(int)

# Split before building encoders to avoid leakage
raw_train, raw_valid, y_train, y_valid = train_test_split(
    raw, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)


def build_train_ref(df):
    train = df.drop(columns=drop_cols).copy()
    train = add_time_features(train)

    for col in categorical_cols:
        new_col = f"{col}_cat"
        temp_df = (
            train.groupby(col, dropna=False)[[target_col]]
            .count()
            .sort_values(target_col, ascending=False)
            .reset_index()
            .set_axis([col, "count"], axis=1)
            .reset_index()
        )
        temp_df["index"] = temp_df.apply(
            lambda x: np.nan if pd.isna(x[col]) else x["index"], axis=1
        )
        temp_df[new_col] = [
            "cat_NAN" if pd.isna(x) else f"cat_{x}" if x < n_cats else f"cat_{n_cats}+"
            for x in temp_df["index"]
        ]
        train = train.merge(temp_df[[col, new_col]], how="left", on=col)

    train = add_distance_features(train)
    return train


train_ref = build_train_ref(raw_train)


In [14]:
X_train = run_preproc(train_ref, raw_train.drop(columns=[target_col]))
X_valid = run_preproc(train_ref, raw_valid.drop(columns=[target_col]))

cat_features = [
    col for col in [
        "hour", "year", "month", "day_of_month", "day_of_week",
        "gender_cat", "merch_cat", "cat_id_cat", "one_city_cat", "us_state_cat", "jobs_cat"
    ] if col in X_train.columns
]

for col in cat_features:
    if col in X_train.columns:
        X_train[col] = X_train[col].astype(str)
        X_valid[col] = X_valid[col].astype(str)

pos = int(y_train.sum())
neg = len(y_train) - pos
scale_pos_weight = neg / max(pos, 1)
print("scale_pos_weight:", scale_pos_weight)


scale_pos_weight: 173.61670829864002


In [15]:
model = CatBoostClassifier(
    iterations=2000,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="PRAUC",
    custom_metric=["AUC", "PRAUC"],
    random_seed=RANDOM_STATE,
    scale_pos_weight=scale_pos_weight,
    od_type="Iter",
    od_wait=100,
    verbose=200
)

model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_valid, y_valid),
    use_best_model=True
)

valid_pred = model.predict_proba(X_valid)[:, 1]
print("ROC-AUC:", roc_auc_score(y_valid, valid_pred))
print("PR-AUC:", average_precision_score(y_valid, valid_pred))

precision, recall, thresholds = precision_recall_curve(y_valid, valid_pred)
f1 = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-9)
best_idx = int(f1.argmax())
best_threshold = float(thresholds[best_idx]) if len(thresholds) else 0.5
print("Best F1 threshold:", best_threshold)
print("Best F1:", f1[best_idx])


0:	learn: 0.9769766	test: 0.9759045	best: 0.9759045 (0)	total: 379ms	remaining: 12m 38s
200:	learn: 0.9983895	test: 0.9982930	best: 0.9982930 (200)	total: 1m 7s	remaining: 10m 1s
400:	learn: 0.9993126	test: 0.9989033	best: 0.9989129 (396)	total: 2m 19s	remaining: 9m 14s
600:	learn: 0.9996077	test: 0.9990693	best: 0.9990709 (599)	total: 3m 30s	remaining: 8m 9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9991025741
bestIteration = 695

Shrink model to first 696 iterations.
ROC-AUC: 0.999138554020117
PR-AUC: 0.9321447466492158
Best F1 threshold: 0.9639767434497941
Best F1: 0.8656215000599505


In [20]:
MODEL_PATH = os.path.abspath("../models/my_model.cbm")
model.save_model(MODEL_PATH)
print("Saved:", MODEL_PATH)


Saved: /Users/alex/Desktop/КНАД_222/Год_4/ML_Sys/mts25_mlops_hw2_real_time_fraud_detection-main/fraud_detector/models/my_model.cbm
