# Fraud‑Detection: Minimizing False Positives for Legitimate Frequent Customers

A complete, reproducible pipeline using **LightGBM** and custom metrics.

**Goal**: keep ⬆️ fraud recall (~90%) while driving ⬇️ false‑positive rate, giving *extra weight* to legitimate transactions from frequent customers.

In [18]:
import os, gc, warnings
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
warnings.filterwarnings('ignore')


In [19]:
DATA_PATH = '../data/processed/'      # <-- adjust if needed
df = pd.read_csv(os.path.join(DATA_PATH, 'transactions_processed.csv'))
print(df.shape)


(1852394, 55)


In [21]:
# December 2020 as blind test, October‑November 2020 as validation, rest train
test_mask  = (df['year']==2020) & (df['trans_month']==12)
valid_mask = (df['year']==2020) & (df['trans_month'].between(10,11))
train_mask = ~test_mask & ~valid_mask

train_df, valid_df, test_df = df[train_mask], df[valid_mask], df[test_mask]
print(train_df.shape, valid_df.shape, test_df.shape)


(1570873, 55) (141983, 55) (139538, 55)


In [22]:
target = 'is_fraud'
ignore = [target,'transaction_datetime']

features = [c for c in df.columns if c not in ignore]
cat_cols = [c for c in features if df[c].dtype=='object']

for c in cat_cols:
    for part in (train_df, valid_df, test_df):
        part[c] = part[c].astype('category')


In [23]:
# heavier weight for *legitimate* frequent customers
def make_weights(part):
    w = np.ones(len(part))
    frequent = (part['is_frequent_merchant'] == 1) & (part[target]==0)
    w[frequent] = 5.0
    return w

w_train = make_weights(train_df)
w_valid = make_weights(valid_df)


In [24]:
# ---- build Datasets exactly as before (no set_field) ----
lgb_train = lgb.Dataset(train_df[features], label=train_df[target],
                        weight=w_train, categorical_feature=cat_cols,
                        free_raw_data=False)
lgb_valid = lgb.Dataset(valid_df[features], label=valid_df[target],
                        weight=w_valid, categorical_feature=cat_cols,
                        free_raw_data=False, reference=lgb_train)

# ---- map Dataset‐id → frequent‑flag array ----
freq_map = {
    id(lgb_train): train_df['is_frequent_merchant'].values,
    id(lgb_valid): valid_df['is_frequent_merchant'].values
}

In [25]:
def fp_tp_ratio(preds, data):
    """FP/TP ratio among frequent customers (lower is better)."""
    y_true     = data.get_label()
    freq_flag  = freq_map[id(data)]
    y_pred     = preds > 0.5

    tp        = np.sum((y_true == 1) & (y_pred == 1))
    fp_freq   = np.sum((y_true == 0) & (y_pred == 1) & (freq_flag == 1))
    ratio     = (tp + fp_freq) / tp if tp else np.inf
    return 'fp_tp_ratio', ratio, False

In [43]:
from lightgbm import early_stopping, log_evaluation



params = dict(objective='binary', metric='auc',
              learning_rate=0.05, num_leaves=64,
              feature_fraction=0.8, bagging_fraction=0.8,
              bagging_freq=5, seed=42, verbosity=-1)


params.update({
    "scale_pos_weight": 50,   # instead of the 500‑ish auto value
    "min_child_weight": 0.1,  # let leaves split on fewer frauds
    "metric": "auc",          # let LightGBM optimise AUC; keep fp_tp in feval
})

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    feval=fp_tp_ratio,
    num_boost_round=500,
    callbacks=[
        early_stopping(stopping_rounds=50),   # <-- replaces early_stopping_rounds
        log_evaluation(period=50)             # nice progress print‑outs
    ]
)

Training until validation scores don't improve for 50 rounds
[50]	train's auc: 0.961389	train's fp_tp_ratio: 2.86492	valid's auc: 0.88001	valid's fp_tp_ratio: 4.53012
Early stopping, best iteration is:
[1]	train's auc: 0.972668	train's fp_tp_ratio: 2.70685	valid's auc: 0.963075	valid's fp_tp_ratio: 2.8559


In [44]:
valid_pred = model.predict(valid_df[features])
precision, recall, thresh = precision_recall_curve(valid_df[target], valid_pred)
# pick highest threshold with recall >= 0.90
thr = thresh[np.where(recall[:-1] >= 0.90)[0][-1]]
print(f'Selected threshold: {thr:.3f}')


Selected threshold: 0.004


In [45]:
test_pred = model.predict(test_df[features])
y_true = test_df[target].values
y_hat  = (test_pred >= thr).astype(int)

cm = confusion_matrix(y_true, y_hat)
print('Confusion matrix\n', cm)

report = classification_report(y_true, y_hat, digits=4)
print(report)

# FPR overall and for frequent customers
fp  = cm[0,1]; tp = cm[1,1]
freq_mask = (test_df['is_frequent_merchant']==1) & (y_true==0)
fp_freq = ((y_hat==1) & freq_mask).sum()
ratio_overall = (tp+fp)/tp
ratio_freq    = (tp+fp_freq)/tp
print(f'Overall FP/TP ratio: {ratio_overall:.3f}')
print(f'Frequent‑cust FP/TP ratio: {ratio_freq:.3f}')


Confusion matrix
 [[133381   5899]
 [    37    221]]
              precision    recall  f1-score   support

           0     0.9997    0.9576    0.9782    139280
           1     0.0361    0.8566    0.0693       258

    accuracy                         0.9575    139538
   macro avg     0.5179    0.9071    0.5238    139538
weighted avg     0.9979    0.9575    0.9766    139538

Overall FP/TP ratio: 27.692
Frequent‑cust FP/TP ratio: 27.692


In [41]:
gc.collect();