In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_recall_curve,
    classification_report, confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

## Random Forest

In [2]:
#Load the data
df = pd.read_csv("/content/drive/MyDrive/cleaned_fraud_df.csv")
use_cols = ['age_group','gender_clean','category_clean','amount_bin','merchant','customer','fraud']
df = df[use_cols].copy()

y = df['fraud'].astype(int).values
X = df.drop(columns=['fraud'])

# Stratified 70/15/15 split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)
print("Train:", X_train.shape, y_train.mean())
print("Valid:", X_valid.shape, y_valid.mean())
print("Test :", X_test.shape,  y_test.mean())

Train: (416250, 6) 0.012108108108108109
Valid: (89196, 6) 0.012108166285483654
Test : (89197, 6) 0.012108030539143694


In [4]:
# Encoders
# TargetEncoder for IDs (identity signal, leakage-safe, smoothed)
# CountEncoder for IDs (popularity signal, log-scaled)
# OneHot for low-card categoricals
class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    Smoothed target encoding: for each category v,
        enc(v) = (sum_y + prior * m) / (count + m)
    where prior = global mean, m = smoothing.
    Unseen categories map to prior. Safe inside CV/Pipeline.
    """
    def __init__(self, cols, smoothing=200):
        self.cols = cols
        self.smoothing = smoothing
        self.maps_ = {}
        self.prior_ = None

    def fit(self, X, y):
        X = X.copy()
        y = pd.Series(y)
        self.prior_ = float(y.mean())
        self.maps_ = {}
        for c in self.cols:
            s = X[c].astype(str)
            stats = y.groupby(s).agg(['sum','count'])
            enc = (stats['sum'] + self.prior_ * self.smoothing) / (stats['count'] + self.smoothing)
            self.maps_[c] = enc
        return self

    def transform(self, X):
        X = X.copy()
        feats = []
        for c in self.cols:
            m = self.maps_.get(c, pd.Series(dtype=float))
            v = X[c].astype(str).map(m).fillna(self.prior_)
            feats.append(v.astype('float32').to_numpy().reshape(-1, 1))
        return np.hstack(feats)

class CountEncoder(BaseEstimator, TransformerMixin):
    """
    Unsupervised frequency (count) encoding with optional log scaling.
    """
    def __init__(self, cols, normalize=False, log1p=True):
        self.cols = cols
        self.normalize = normalize
        self.log1p = log1p
        self.maps_ = {}
        self.n_train_ = None

    def fit(self, X, y=None):
        X = X.copy()
        self.n_train_ = len(X)
        self.maps_ = {}
        for c in self.cols:
            vc = X[c].astype(str).value_counts()
            if self.normalize:
                vc = vc / self.n_train_
            self.maps_[c] = vc
        return self

    def transform(self, X):
        X = X.copy()
        feats = []
        for c in self.cols:
            m = self.maps_.get(c, pd.Series(dtype=float))
            enc = X[c].astype(str).map(m).fillna(0.0)
            if self.log1p and not self.normalize:
                enc = np.log1p(enc)  # log(count)
            elif self.log1p and self.normalize:
                enc = np.log1p(enc * self.n_train_)  # ~log(count)
            feats.append(enc.astype('float32').to_numpy().reshape(-1, 1))
        return np.hstack(feats)

low_card  = ['age_group','gender_clean','category_clean','amount_bin']
high_card = ['merchant','customer']

# Small feature space: dense is fine
try:
    # scikit-learn >= 1.2
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # scikit-learn < 1.2
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocess = ColumnTransformer(
    transformers=[
        ('ohe',  ohe, low_card),
        ('te',   TargetEncoder(high_card, smoothing=200), high_card),
        ('freq', CountEncoder(high_card, normalize=False, log1p=True), high_card),
    ],
    remainder='drop'
)

In [5]:
# Model definition
rf = RandomForestClassifier(
    n_estimators=600,
    max_depth=12,
    min_samples_leaf=100,          # prevent tiny pure leaves (helps against tail overfit)
    max_features='sqrt',
    class_weight='balanced_subsample',  # imbalance-aware per tree
    n_jobs=-1,
    random_state=42,
    oob_score=True
)

pipe = Pipeline([
    ('pre', preprocess),
    ('rf',  rf)
])

pipe.fit(X_train, y_train)
print("OOB score:", getattr(pipe.named_steps['rf'], 'oob_score_', None))

OOB score: 0.9599783783783784


In [6]:
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix

def evaluate_at_threshold(model, X, y, thr):
    proba = model.predict_proba(X)[:, 1]
    pred  = (proba >= thr).astype(int)
    ap  = average_precision_score(y, proba)
    auc = roc_auc_score(y, proba)
    print(f"AUC-PR: {ap:.4f} | ROC-AUC: {auc:.4f} | thr={thr:.4f} | prevalence={y.mean():.4f}")
    print(classification_report(y, pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y, pred))
    return ap, auc

proba_valid = pipe.predict_proba(X_valid)[:, 1]
prec, rec, thr = precision_recall_curve(y_valid, proba_valid)
f1  = 2*prec*rec/(prec+rec+1e-9)
best_idx = int(np.nanargmax(f1[:-1]))  # thresholds has length len(prec)-1
thr_star = float(thr[best_idx])
print("Chosen threshold (max F1 on VALID):", thr_star)

_ = evaluate_at_threshold(pipe, X_valid, y_valid, thr_star)
_ = evaluate_at_threshold(pipe, X_test,  y_test,  thr_star)

Chosen threshold (max F1 on VALID): 0.9431542501590449
AUC-PR: 0.8226 | ROC-AUC: 0.9963 | thr=0.9432 | prevalence=0.0121
              precision    recall  f1-score   support

           0      0.997     0.997     0.997     88116
           1      0.772     0.725     0.748      1080

    accuracy                          0.994     89196
   macro avg      0.884     0.861     0.872     89196
weighted avg      0.994     0.994     0.994     89196

Confusion matrix:
 [[87885   231]
 [  297   783]]
AUC-PR: 0.8297 | ROC-AUC: 0.9967 | thr=0.9432 | prevalence=0.0121
              precision    recall  f1-score   support

           0      0.997     0.997     0.997     88117
           1      0.779     0.736     0.757      1080

    accuracy                          0.994     89197
   macro avg      0.888     0.867     0.877     89197
weighted avg      0.994     0.994     0.994     89197

Confusion matrix:
 [[87892   225]
 [  285   795]]


In [7]:
from sklearn.metrics import average_precision_score

def metrics_by_bin(model, X, y, bin_series, thr):
    proba = model.predict_proba(X)[:,1]
    pred  = (proba >= thr).astype(int)
    dfm = pd.DataFrame({'bin': bin_series.values, 'y': y, 'proba': proba, 'pred': pred})
    rows = []
    for b, g in dfm.groupby('bin'):
        if len(g) == 0:
            continue
        ap  = average_precision_score(g['y'], g['proba']) if g['y'].sum()>0 else np.nan
        auc = roc_auc_score(g['y'], g['proba']) if g['y'].nunique()>1 else np.nan
        tp = ((g['pred']==1)&(g['y']==1)).sum()
        fp = ((g['pred']==1)&(g['y']==0)).sum()
        fn = ((g['pred']==0)&(g['y']==1)).sum()
        prec = tp/(tp+fp+1e-9); rec = tp/(tp+fn+1e-9)
        rows.append([b, len(g), int(g['y'].sum()), ap, auc, prec, rec])
    return pd.DataFrame(rows, columns=['amount_bin','n','frauds','AP','ROC_AUC','Precision','Recall']).sort_values('n', ascending=False)

print("\nPer-bin metrics (VALID):")
print(metrics_by_bin(pipe, X_valid, y_valid, X_valid['amount_bin'], thr_star))

print("\nPer-bin metrics (TEST):")
print(metrics_by_bin(pipe, X_test, y_test, X_test['amount_bin'], thr_star))


Per-bin metrics (VALID):
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32622      48  0.298446  0.994553   0.600000  0.062500
2      10–25  25376      32  0.228034  0.993614   0.000000  0.000000
0       0–10  16059      17  0.273962  0.993442   0.000000  0.000000
8      50–75  10186      51  0.448161  0.990454   0.541667  0.254902
9     75–150   3183     115  0.348547  0.929236   0.403509  0.400000
3    150–250    953     164  0.589594  0.893706   0.522843  0.628049
5    250–500    500     343  0.936744  0.890773   0.868074  0.959184
7   500–1000    219     214  0.983757  0.559813   0.980392  0.934579
1  1000–2500     49      47  0.959537  0.361702   0.953488  0.872340
4      2500+     49      49  1.000000       NaN   1.000000  0.959184

Per-bin metrics (TEST):
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32667      45  0.273644  0.995216   1.000000  0.022222
2      10–25  25133      26  0.284144  0.997001   0.

In [8]:
param_dist = {
    'rf__n_estimators':    [400, 600, 800],
    'rf__max_depth':       [8, 10, 12, 16, None],
    'rf__min_samples_leaf':[50, 100, 150, 200, 300],
    'rf__max_features':    ['sqrt', 0.2, 0.3, 0.4],
    'rf__class_weight':    ['balanced', 'balanced_subsample']
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring='average_precision',   # PR-AUC suits imbalance
    n_jobs=-1,
    cv=cv,
    verbose=1,
    random_state=42
)
search.fit(X_train, y_train)
print("Best PR-AUC (CV):", search.best_score_)
print("Best params:", search.best_params_)

best_model = search.best_estimator_

# Re-pick threshold on VALID using best model
proba_valid = best_model.predict_proba(X_valid)[:,1]
prec, rec, thr = precision_recall_curve(y_valid, proba_valid)
f1  = 2*prec*rec/(prec+rec+1e-9)
best_idx = int(np.nanargmax(f1[:-1]))
thr_star = float(thr[best_idx])
print("Chosen threshold (max F1, VALID):", thr_star)

_ = evaluate_at_threshold(best_model, X_valid, y_valid, thr_star)
_ = evaluate_at_threshold(best_model, X_test,  y_test,  thr_star)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best PR-AUC (CV): 0.8640920151996957
Best params: {'rf__n_estimators': 800, 'rf__min_samples_leaf': 50, 'rf__max_features': 0.3, 'rf__max_depth': 16, 'rf__class_weight': 'balanced'}
Chosen threshold (max F1, VALID): 0.9398871609573412
AUC-PR: 0.8660 | ROC-AUC: 0.9972 | thr=0.9399 | prevalence=0.0121
              precision    recall  f1-score   support

           0      0.998     0.997     0.997     88116
           1      0.769     0.797     0.783      1080

    accuracy                          0.995     89196
   macro avg      0.883     0.897     0.890     89196
weighted avg      0.995     0.995     0.995     89196

Confusion matrix:
 [[87857   259]
 [  219   861]]
AUC-PR: 0.8706 | ROC-AUC: 0.9975 | thr=0.9399 | prevalence=0.0121
              precision    recall  f1-score   support

           0      0.998     0.997     0.997     88117
           1      0.758     0.809     0.783      1080

    accuracy                   

In [9]:
print("\nPer-bin metrics (TEST):")
print(metrics_by_bin(best_model, X_test, y_test, X_test['amount_bin'], thr_star))


Per-bin metrics (TEST):
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32667      45  0.336023  0.996426   0.424242  0.311111
2      10–25  25133      26  0.343598  0.997658   0.400000  0.307692
0       0–10  16083      21  0.392997  0.995879   0.571429  0.190476
8      50–75  10214      37  0.423899  0.989592   0.583333  0.378378
9     75–150   3315     129  0.536474  0.949614   0.500000  0.426357
3    150–250    946     163  0.728006  0.929001   0.522727  0.846626
5    250–500    503     337  0.965517  0.938517   0.886792  0.976261
7   500–1000    235     229  0.989746  0.671033   0.973568  0.965066
4      2500+     51      51  1.000000       NaN   1.000000  1.000000
1  1000–2500     50      42  0.923997  0.705357   0.869565  0.952381


In [10]:
joblib.dump(best_model if 'best_model' in locals() else pipe, "/content/drive/MyDrive/fraud_model/rf_pipeline.joblib")

# Save threshold and a short model card
with open("/content/drive/MyDrive/fraud_model/rf_threshold.txt","w") as f:
    f.write(str(float(thr_star)))

print("Saved: rf_pipeline.joblib, rf_threshold.txt")


Saved: rf_pipeline.joblib, rf_threshold.txt


In [11]:
# Use the tuned model if you ran RandomizedSearch; otherwise fall back to the baseline
model_current = best_model if 'best_model' in globals() else pipe


In [12]:
import numpy as np, pandas as pd
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score

def metrics_by_bin_from_preds(y, proba, pred, bin_series):
    dfm = pd.DataFrame({'bin': bin_series.values, 'y': y, 'proba': proba, 'pred': pred})
    rows = []
    for b, g in dfm.groupby('bin'):
        if len(g) == 0:
            continue
        ap  = average_precision_score(g['y'], g['proba']) if g['y'].sum()>0 else np.nan
        auc = roc_auc_score(g['y'], g['proba']) if g['y'].nunique()>1 else np.nan
        tp = ((g['pred']==1)&(g['y']==1)).sum()
        fp = ((g['pred']==1)&(g['y']==0)).sum()
        fn = ((g['pred']==0)&(g['y']==1)).sum()
        prec = tp/(tp+fp+1e-9); rec = tp/(tp+fn+1e-9)
        rows.append([b, len(g), int(g['y'].sum()), ap, auc, prec, rec])
    return pd.DataFrame(rows, columns=['amount_bin','n','frauds','AP','ROC_AUC','Precision','Recall']).sort_values('n', ascending=False)

def metrics_by_bin(model, X, y, bin_series, thr):
    proba = model.predict_proba(X)[:,1]
    pred  = (proba >= float(thr)).astype(int)
    return metrics_by_bin_from_preds(y, proba, pred, bin_series)

def thresholds_per_bin(model, X_valid, y_valid, bin_valid, strategy="max_f1", min_recall=None):
    """
    Learn a threshold per amount_bin from VALID set.
    strategy="max_f1" or "recall_floor" (then provide min_recall).
    """
    proba = model.predict_proba(X_valid)[:,1]
    dfv = pd.DataFrame({'bin': bin_valid.values, 'y': y_valid, 'p': proba})
    thr_map = {}
    for b, g in dfv.groupby('bin'):
        if g['y'].sum() == 0:
            thr_map[b] = 0.999  # ultra high: don't trigger for a bin with no positives
            continue
        prec, rec, thr = precision_recall_curve(g['y'], g['p'])
        if strategy == "max_f1":
            f1 = 2*prec*rec/(prec+rec+1e-9)
            idx = int(np.nanargmax(f1[:-1]))
        elif strategy == "recall_floor":
            floor = 0.5 if min_recall is None else float(min_recall)
            ok = np.where(rec[:-1] >= floor)[0]
            idx = ok[-1] if len(ok) else len(thr)-1
        else:
            raise ValueError("Unknown strategy")
        thr_map[b] = float(thr[idx])
    return thr_map

def predict_with_bin_thresholds(model, X, bin_series, thr_map, default_thr):
    proba = model.predict_proba(X)[:,1]
    thrs = bin_series.map(thr_map).fillna(float(default_thr)).astype(float).values
    pred = (proba >= thrs).astype(int)
    return proba, pred


In [13]:
try:
    thr_star
except NameError:
    from sklearn.metrics import precision_recall_curve
    pv = model_current.predict_proba(X_valid)[:,1]
    P, R, T = precision_recall_curve(y_valid, pv)
    F1 = 2*P*R/(P+R+1e-9)
    thr_star = float(T[int(np.nanargmax(F1[:-1]))])
    print("Computed thr_star (VALID, max F1):", thr_star)

print("\n=== BEFORE: Global threshold per-bin metrics (VALID) ===")
valid_before = metrics_by_bin(model_current, X_valid, y_valid, X_valid['amount_bin'], thr_star)
print(valid_before)

print("\n=== BEFORE: Global threshold per-bin metrics (TEST) ===")
test_before  = metrics_by_bin(model_current, X_test,  y_test,  X_test['amount_bin'],  thr_star)
print(test_before)



=== BEFORE: Global threshold per-bin metrics (VALID) ===
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32622      48  0.408349  0.995106   0.472222  0.354167
2      10–25  25376      32  0.282457  0.994567   0.583333  0.218750
0       0–10  16059      17  0.295404  0.993647   0.428571  0.176471
8      50–75  10186      51  0.524545  0.991389   0.638889  0.450980
9     75–150   3183     115  0.431653  0.948156   0.415929  0.408696
3    150–250    953     164  0.712168  0.931010   0.570833  0.835366
5    250–500    500     343  0.968363  0.937568   0.887097  0.962099
7   500–1000    219     214  0.992332  0.722430   0.975962  0.948598
1  1000–2500     49      47  0.959224  0.351064   0.957447  0.957447
4      2500+     49      49  1.000000       NaN   1.000000  1.000000

=== BEFORE: Global threshold per-bin metrics (TEST) ===
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32667      45  0.336023  0.996426   0.42424

In [14]:
# Learn a threshold per amount_bin (on VALID) using max-F1 within each bin
thr_map = thresholds_per_bin(model_current, X_valid, y_valid, X_valid['amount_bin'], strategy="max_f1")

# Evaluate on VALID with per-bin thresholds (for reference)
proba_v, pred_v = predict_with_bin_thresholds(model_current, X_valid, X_valid['amount_bin'], thr_map, default_thr=thr_star)
valid_after_thr = metrics_by_bin_from_preds(y_valid, proba_v, pred_v, X_valid['amount_bin'])

# Evaluate on TEST with those per-bin thresholds
proba_t, pred_t = predict_with_bin_thresholds(model_current, X_test,  X_test['amount_bin'],  thr_map, default_thr=thr_star)
test_after_thr  = metrics_by_bin_from_preds(y_test,  proba_t, pred_t, X_test['amount_bin'])

print("\n=== AFTER (Per-bin thresholds): per-bin metrics (VALID) ===")
print(valid_after_thr)

print("\n=== AFTER (Per-bin thresholds): per-bin metrics (TEST) ===")
print(test_after_thr)



=== AFTER (Per-bin thresholds): per-bin metrics (VALID) ===
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32622      48  0.408349  0.995106   0.393443  0.500000
2      10–25  25376      32  0.282457  0.994567   0.450000  0.281250
0       0–10  16059      17  0.295404  0.993647   0.375000  0.352941
8      50–75  10186      51  0.524545  0.991389   0.551724  0.627451
9     75–150   3183     115  0.431653  0.948156   0.394904  0.539130
3    150–250    953     164  0.712168  0.931010   0.568548  0.859756
5    250–500    500     343  0.968363  0.937568   0.900552  0.950437
7   500–1000    219     214  0.992332  0.722430   0.977169  1.000000
1  1000–2500     49      47  0.959224  0.351064   0.959184  1.000000
4      2500+     49      49  1.000000       NaN   1.000000  0.979592

=== AFTER (Per-bin thresholds): per-bin metrics (TEST) ===
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32667      45  0.336023  0.996426   0

### Weights FIX

In [15]:
from sklearn.base import clone

# Compute positive rate per bin on TRAIN
train_stats = (X_train.assign(y=y_train)
               .groupby('amount_bin')['y']
               .agg(pos='sum', n='size'))
train_stats['pos_rate'] = train_stats['pos']/train_stats['n']

# Build positive sample weights inversely to each bin's prevalence, capped
median_rate = float(train_stats['pos_rate'].replace(0, np.nan).median())
safe_rate   = train_stats['pos_rate'].replace(0, 1e-6)
pos_w = (median_rate / safe_rate).clip(1.0, 10.0)  # cap extreme weights
bin_pos_weight = pos_w.to_dict()

def make_sample_weight(X, y, bin_weights):
    w = np.ones(len(y), dtype='float32')
    mask = (y == 1)
    w[mask] = X.loc[mask, 'amount_bin'].map(bin_weights).fillna(1.0).values
    return w

w_train = make_sample_weight(X_train, y_train, bin_pos_weight)

# Retrain a fresh clone with weights
pipe_w = clone(model_current)   # preserves your encoders and RF hyperparams
pipe_w.fit(X_train, y_train, rf__sample_weight=w_train)

# Re-pick global threshold on VALID (max F1)
pv_w = pipe_w.predict_proba(X_valid)[:,1]
P_w, R_w, T_w = precision_recall_curve(y_valid, pv_w)
F1_w = 2*P_w*R_w/(P_w+R_w+1e-9)
thr_star_w = float(T_w[int(np.nanargmax(F1_w[:-1]))])
print("\nWeighted model: chosen thr_star_w (VALID, max F1):", thr_star_w)

# BEFORE/AFTER per-bin (global threshold for the weighted model)
print("\n=== Weighted model (global thr): per-bin metrics (VALID) ===")
valid_weighted_before = metrics_by_bin(pipe_w, X_valid, y_valid, X_valid['amount_bin'], thr_star_w)
print(valid_weighted_before)

print("\n=== Weighted model (global thr): per-bin metrics (TEST) ===")
test_weighted_before  = metrics_by_bin(pipe_w, X_test,  y_test,  X_test['amount_bin'],  thr_star_w)
print(test_weighted_before)

# apply per-bin thresholds to the weighted model
thr_map_w = thresholds_per_bin(pipe_w, X_valid, y_valid, X_valid['amount_bin'], strategy="max_f1")
pv2_v, pr2_v = predict_with_bin_thresholds(pipe_w, X_valid, X_valid['amount_bin'], thr_map_w, default_thr=thr_star_w)
pv2_t, pr2_t = predict_with_bin_thresholds(pipe_w, X_test,  X_test['amount_bin'],  thr_map_w, default_thr=thr_star_w)

print("\n=== Weighted model + per-bin thresholds: per-bin metrics (VALID) ===")
print(metrics_by_bin_from_preds(y_valid, pv2_v, pr2_v, X_valid['amount_bin']))

print("\n=== Weighted model + per-bin thresholds: per-bin metrics (TEST) ===")
print(metrics_by_bin_from_preds(y_test,  pv2_t, pr2_t, X_test['amount_bin']))



Weighted model: chosen thr_star_w (VALID, max F1): 0.9787487064232085

=== Weighted model (global thr): per-bin metrics (VALID) ===
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32622      48  0.415377  0.995624   0.351351  0.541667
2      10–25  25376      32  0.295865  0.995052   0.366667  0.343750
0       0–10  16059      17  0.252438  0.994197   0.184211  0.411765
8      50–75  10186      51  0.590952  0.992341   0.326923  0.666667
9     75–150   3183     115  0.427684  0.946393   0.425926  0.400000
3    150–250    953     164  0.702087  0.929356   0.703125  0.548780
5    250–500    500     343  0.962020  0.929008   0.911672  0.842566
7   500–1000    219     214  0.989381  0.641121   0.978610  0.855140
1  1000–2500     49      47  0.977977  0.574468   0.952381  0.851064
4      2500+     49      49  1.000000       NaN   1.000000  0.938776

=== Weighted model (global thr): per-bin metrics (TEST) ===
  amount_bin      n  frauds        AP   ROC_AUC

In [16]:
import json
joblib.dump(pipe_w, "/content/drive/MyDrive/fraud_model/rf_pipeline_weighted.joblib")
# optional thresholds learned on VALID for the weighted model:
with open("/content/drive/MyDrive/fraud_model/thresholds_by_amount_bin_weighted.json","w") as f:
    json.dump({str(k): float(v) for k,v in thr_map_w.items()}, f)
with open("/content/drive/MyDrive/fraud_model/global_threshold_weighted.txt","w") as f:
    f.write(str(float(thr_star_w)))
print("Saved rf_pipeline_weighted.joblib (+ thresholds if used)")


Saved rf_pipeline_weighted.joblib (+ thresholds if used)


## Cat-Boost

In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [6]:
import numpy as np
import pandas as pd
import json
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_recall_curve, average_precision_score, roc_auc_score,
    classification_report, confusion_matrix
)
from catboost import CatBoostClassifier, Pool

In [2]:
#Load data
df = pd.read_csv("/content/drive/MyDrive/cleaned_fraud_df.csv")

FEATURES = ['age_group','gender_clean','category_clean','amount_bin','merchant','customer']
TARGET   = 'fraud'

df = df[FEATURES + [TARGET]].copy()
y  = df[TARGET].astype(int).values
X  = df[FEATURES].copy()

In [3]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print("Train:", X_train.shape, y_train.mean())
print("Valid:", X_valid.shape, y_valid.mean())
print("Test :", X_test.shape,  y_test.mean())

Train: (416250, 6) 0.012108108108108109
Valid: (89196, 6) 0.012108166285483654
Test : (89197, 6) 0.012108030539143694


In [7]:
#CatBoost Pools with categorical features
cat_cols = FEATURES  # pass names directly
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_cols)

In [8]:
# Model config (imbalance-aware + early stopping on PR-AUC)
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = (neg / max(pos, 1))  # guard against div-by-zero

cb = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=6,
    loss_function='Logloss',
    eval_metric='PRAUC',         # better for imbalance
    scale_pos_weight=scale_pos_weight,
    random_seed=42,
    od_type='Iter', od_wait=200, # early stopping patience
    verbose=200
)

cb.fit(train_pool, eval_set=valid_pool, use_best_model=True)

0:	learn: 0.9953018	test: 0.9950798	best: 0.9950798 (0)	total: 2.01s	remaining: 1h 40m 29s
200:	learn: 0.9988701	test: 0.9981351	best: 0.9981358 (162)	total: 1m 51s	remaining: 25m 53s
400:	learn: 0.9992853	test: 0.9979964	best: 0.9981394 (205)	total: 3m 44s	remaining: 24m 12s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9981394474
bestIteration = 205

Shrink model to first 206 iterations.


<catboost.core.CatBoostClassifier at 0x7a98f519b650>

In [9]:
# Global threshold selection (max F1) on VALID
def pick_threshold_max_f1(y_true, proba):
    P, R, T = precision_recall_curve(y_true, proba)
    F1 = 2*P*R/(P+R+1e-9)
    # T has length len(P)-1; drop the last F1 which has no threshold
    best_idx = int(np.nanargmax(F1[:-1]))
    return float(T[best_idx])

proba_valid = cb.predict_proba(valid_pool)[:,1]
thr_star = pick_threshold_max_f1(y_valid, proba_valid)
print("Chosen global threshold (VALID, max F1):", thr_star)

Chosen global threshold (VALID, max F1): 0.9742398392816267


In [10]:
#Evaluation helpers
def evaluate_at_threshold(model, pool, y_true, thr, label=""):
    p = model.predict_proba(pool)[:,1]
    pred = (p >= float(thr)).astype(int)
    ap  = average_precision_score(y_true, p)
    auc = roc_auc_score(y_true, p)
    print(f"\n[{label}] PRAUC={ap:.4f} | ROC-AUC={auc:.4f} | thr={thr:.4f} | prevalence={y_true.mean():.4f}")
    print(classification_report(y_true, pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y_true, pred))
    return p, pred

def metrics_by_bin_from_preds(y_true, proba, pred, bins_series):
    dfm = pd.DataFrame({'bin': bins_series.values, 'y': y_true, 'proba': proba, 'pred': pred})
    rows = []
    for b, g in dfm.groupby('bin'):
        if len(g) == 0:
            continue
        ap  = average_precision_score(g['y'], g['proba']) if g['y'].sum()>0 else np.nan
        auc = roc_auc_score(g['y'], g['proba']) if g['y'].nunique()>1 else np.nan
        tp = ((g['pred']==1)&(g['y']==1)).sum()
        fp = ((g['pred']==1)&(g['y']==0)).sum()
        fn = ((g['pred']==0)&(g['y']==1)).sum()
        prec = tp/(tp+fp+1e-9); rec = tp/(tp+fn+1e-9)
        rows.append([b, len(g), int(g['y'].sum()), ap, auc, prec, rec])
    cols = ['amount_bin','n','frauds','AP','ROC_AUC','Precision','Recall']
    return pd.DataFrame(rows, columns=cols).sort_values('n', ascending=False)

def thresholds_per_bin_from_valid(y_valid, proba_valid, bin_valid,
                                  strategy="max_f1", min_recall=0.5):
    """
    Learn a threshold per amount_bin from the VALID set.
    strategy: "max_f1" or "recall_floor" (uses min_recall).
    """
    dfv = pd.DataFrame({'bin': bin_valid.values, 'y': y_valid, 'p': proba_valid})
    thr_map = {}
    for b, g in dfv.groupby('bin'):
        if g['y'].sum() == 0:
            thr_map[b] = 0.999  # no positives in this bin: keep it strict
            continue
        P, R, T = precision_recall_curve(g['y'], g['p'])
        if strategy == "max_f1":
            F1 = 2*P*R/(P+R+1e-9)
            idx = int(np.nanargmax(F1[:-1]))
        else:  # "recall_floor"
            ok = np.where(R[:-1] >= float(min_recall))[0]
            idx = ok[-1] if len(ok) else len(T)-1
        thr_map[b] = float(T[idx])
    return thr_map

def predict_with_bin_thresholds(model, X_df, pool, thr_map, default_thr):
    p = model.predict_proba(pool)[:,1]
    thrs = X_df['amount_bin'].map(thr_map).astype(float).fillna(float(default_thr)).values
    pred = (p >= thrs).astype(int)
    return p, pred, thrs

In [11]:
#Report: Global-threshold performance + per-bin tables
p_v, pred_v = evaluate_at_threshold(cb, valid_pool, y_valid, thr_star, label="VALID (global thr)")
p_t, pred_t = evaluate_at_threshold(cb, test_pool,  y_test,  thr_star, label="TEST  (global thr)")

print("\nPer-bin (VALID) with global threshold:")
print(metrics_by_bin_from_preds(y_valid, p_v, pred_v, X_valid['amount_bin']))

print("\nPer-bin (TEST) with global threshold:")
print(metrics_by_bin_from_preds(y_test,  p_t, pred_t, X_test['amount_bin']))


[VALID (global thr)] PRAUC=0.9282 | ROC-AUC=0.9982 | thr=0.9742 | prevalence=0.0121
              precision    recall  f1-score   support

           0      0.998     0.999     0.998     88116
           1      0.874     0.845     0.859      1080

    accuracy                          0.997     89196
   macro avg      0.936     0.922     0.929     89196
weighted avg      0.997     0.997     0.997     89196

Confusion matrix:
 [[87984   132]
 [  167   913]]

[TEST  (global thr)] PRAUC=0.9298 | ROC-AUC=0.9984 | thr=0.9742 | prevalence=0.0121
              precision    recall  f1-score   support

           0      0.998     0.998     0.998     88117
           1      0.851     0.854     0.853      1080

    accuracy                          0.996     89197
   macro avg      0.925     0.926     0.925     89197
weighted avg      0.996     0.996     0.996     89197

Confusion matrix:
 [[87956   161]
 [  158   922]]

Per-bin (VALID) with global threshold:
  amount_bin      n  frauds        A

In [12]:
# Per-bin thresholds (Fix #1) learned on VALID
# You can switch to strategy="recall_floor", e.g., min_recall=0.5 for small bins.
thr_map = thresholds_per_bin_from_valid(
    y_valid, p_v, X_valid['amount_bin'], strategy="max_f1", min_recall=0.5
)

p_vb, pred_vb, thrs_vb = predict_with_bin_thresholds(cb, X_valid, valid_pool, thr_map, default_thr=thr_star)
p_tb, pred_tb, thrs_tb = predict_with_bin_thresholds(cb, X_test,  test_pool,  thr_map, default_thr=thr_star)

print("\nPer-bin (VALID) with per-bin thresholds:")
print(metrics_by_bin_from_preds(y_valid, p_vb, pred_vb, X_valid['amount_bin']))

print("\nPer-bin (TEST) with per-bin thresholds:")
print(metrics_by_bin_from_preds(y_test,  p_tb, pred_tb, X_test['amount_bin']))


Per-bin (VALID) with per-bin thresholds:
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32622      48  0.594167  0.997179   0.684211  0.541667
2      10–25  25376      32  0.449766  0.995857   0.583333  0.437500
0       0–10  16059      17  0.557836  0.996181   0.625000  0.588235
8      50–75  10186      51  0.647501  0.991543   0.607843  0.607843
9     75–150   3183     115  0.601069  0.960681   0.533784  0.686957
3    150–250    953     164  0.908099  0.977975   0.835366  0.835366
5    250–500    500     343  0.987679  0.977103   0.928767  0.988338
7   500–1000    219     214  0.996619  0.890654   0.981651  1.000000
1  1000–2500     49      47  0.990781  0.797872   0.959184  1.000000
4      2500+     49      49  1.000000       NaN   1.000000  1.000000

Per-bin (TEST) with per-bin thresholds:
  amount_bin      n  frauds        AP   ROC_AUC  Precision    Recall
6      25–50  32667      45  0.575428  0.997911   0.617647  0.466667
2      10–25  25133 

In [13]:
#Save artifacts
save_dir = Path("/content/drive/MyDrive/fraud_model")
model_path= save_dir / "catboost_fraud_model.cbm"
thresholds_path_cat= save_dir / "thresholds_by_amount_bin_cat.json"
golobal_threshold_path_cat= save_dir / "global_threshold_cat.txt"

cb.save_model(model_path)  # CatBoost native format
with open(thresholds_path_cat, "w") as f:
    json.dump({str(k): float(v) for k, v in thr_map.items()}, f)
with open(golobal_threshold_path_cat, "w") as f:
    f.write(str(float(thr_star)))

print(f"\nSaved:\n- {model_path}\n- {thresholds_path_cat}\n- {golobal_threshold_path_cat}")


Saved:
- /content/drive/MyDrive/fraud_model/catboost_fraud_model.cbm
- /content/drive/MyDrive/fraud_model/thresholds_by_amount_bin_cat.json
- /content/drive/MyDrive/fraud_model/global_threshold_cat.txt


In [14]:
# feature importance
imp = cb.get_feature_importance(type='PredictionValuesChange', data=train_pool)
fi = pd.Series(imp, index=FEATURES).sort_values(ascending=False)
print("\nFeature importance (PredictionValuesChange):")
print(fi)


Feature importance (PredictionValuesChange):
merchant          37.207447
category_clean    19.461013
amount_bin        19.263217
customer          13.410962
age_group          5.520114
gender_clean       5.137248
dtype: float64
