# 03 - Base Model and Custom Metrics

In [2]:
import os, gc, warnings
import pandas as pd
import numpy as np

from lightgbm import early_stopping, log_evaluation
import lightgbm as lgb

warnings.filterwarnings('ignore')

### Data Loading

In [3]:
DATA_PATH = '../data/processed/'     

df = pd.read_csv(os.path.join(DATA_PATH, 'transactions_processed.csv'))

In [4]:
test_mask  = (df['year']==2020) & (df['trans_month']==12)
valid_mask = (df['year']==2020) & (df['trans_month'].between(10,11))
train_mask = ~test_mask & ~valid_mask

train_df, valid_df, test_df = df[train_mask], df[valid_mask], df[test_mask]

### Preprocessing

In [5]:
target = 'is_fraud'
ignore = [target,'transaction_datetime']

features = [c for c in df.columns if c not in ignore]
cat_cols = [c for c in features if df[c].dtype=='object']

#  LightGBM requires categorical features instead of object dtype.
for c in cat_cols:
    for part in (train_df, valid_df, test_df):
        part[c] = part[c].astype('category')

### Base Model

In [6]:
def make_weights(part, legit_freq_w=10.0):
    """Boost loss on legitimate frequent‑customer rows."""
    w = np.ones(len(part))
    mask = (part['is_frequent_merchant'] == 1) & (part[target] == 0)
    w[mask] = legit_freq_w         
    return w

w_train = make_weights(train_df, 5.0)
w_valid = make_weights(valid_df, 5.0)

In [7]:
lgb_train = lgb.Dataset(train_df[features], label=train_df[target],
                        weight=w_train, categorical_feature=cat_cols,
                        free_raw_data=False)
lgb_valid = lgb.Dataset(valid_df[features], label=valid_df[target],
                        weight=w_valid, categorical_feature=cat_cols,
                        free_raw_data=False, reference=lgb_train)

freq_map = {
    id(lgb_train): train_df['is_frequent_merchant'].values,
    id(lgb_valid): valid_df['is_frequent_merchant'].values
}

In [17]:
def kpi_report(model, df, name, thr):
    """
        Helper function to calculate KPIs
        @param model: trained LightGBM model
        @param df: dataframe to evaluate
        @param name: name of the dataframe
        @param thr: threshold for the model

        Returns:
        - A dictionary containing:
            - Recall (fraud detection rate)
            - Overall FPR (false positive rate for overall transactions)
            - Frequent‑cust FPR (false positive rate for frequent customers)
            - Overall FP/TP ratio (overall fraud detection rate)
            - Freq‑cust FP/TP ratio (fraud detection rate for frequent customers)
    """
    predictions = model.predict(df[features])
    actuals = df[target].values
    is_frequent = df["is_frequent_merchant"].values
    predicted_fraud = predictions >= thr

    tp = ((actuals == 1) & predicted_fraud).sum()
    fn = ((actuals == 1) & ~predicted_fraud).sum()
    fp = ((actuals == 0) & predicted_fraud).sum()

    fp_freq = ((actuals == 0) & predicted_fraud & (is_frequent == 1)).sum()
    legitimate_freq = ((actuals == 0) & (is_frequent == 1)).sum()

    recall = tp / (tp + fn)
    fpr = fp / (actuals == 0).sum()
    fpr_freq = fp_freq / legitimate_freq

    fp_tp_ratio_overall = (tp + fp) / tp
    fp_tp_ratio_freq = (tp + fp_freq) / tp

    results = {
        "recall": recall,
        "fpr": fpr,
        "fpr_freq": fpr_freq,
        "fp_tp_ratio_overall": fp_tp_ratio_overall,
        "fp_tp_ratio_freq": fp_tp_ratio_freq
    }
    
    return results

In [9]:
params = dict(objective='binary', metric='auc',
              learning_rate=0.05, num_leaves=64,
              feature_fraction=0.8, bagging_fraction=0.8,
              bagging_freq=5, seed=42, verbosity=-1, scale_pos_weight=50, min_child_weight=0.1)

params_base = params.copy()
params_base['metric'] = 'auc'  

model_base = lgb.train(
    params_base,
    lgb_train,
    valid_sets=[lgb_valid],
    valid_names=['valid'],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	valid's auc: 0.963712


In [19]:
import pandas as pd

validation_results = kpi_report(model_base, valid_df, "Validation", 0.9)
test_results = kpi_report(model_base, test_df, "Test", 0.9)

df = pd.DataFrame([validation_results, test_results], index=['Validation', 'Test'])
print(df.round(4))

            recall     fpr  fpr_freq  fp_tp_ratio_overall  fp_tp_ratio_freq
Validation  0.8879  0.0076    0.0059               2.7940            1.1462
Test        0.8450  0.0084    0.0058               6.3578            1.4908


### Custom Metrics

In [11]:
def fp_tp_ratio_freq(preds, data):
    y_true = data.get_label()
    freq   = freq_map[id(data)]      # from your earlier `freq_map`
    y_pred = preds > 0.50
    tp     = ((y_true==1) &  y_pred).sum()
    fp_f   = ((y_true==0) &  y_pred & (freq==1)).sum()
    ratio  = (tp + fp_f) / tp if tp else np.inf
    return 'fp_tp_ratio_freq', ratio, False

def fp_tp_ratio(preds, data):
    y_true  = data.get_label()
    y_pred  = preds > 0.50           # fixed cut‑off inside metric
    tp      = ((y_true==1) &  y_pred).sum()
    fp      = ((y_true==0) &  y_pred).sum()
    ratio   = (tp + fp) / tp if tp else np.inf   # lower = better
    return 'fp_tp_ratio', ratio, False

def balanced_cost(preds, data, w_fp=3.0, w_fn=10.0):
    """
    Penalizes false positives on frequent customers,
    and false negatives (missed frauds).
    Higher w_fn puts more pressure on recall.
    """
    y_true = data.get_label()
    freq   = freq_map[id(data)]
    y_pred = preds > 0.50

    fp_freq = ((y_true==0) & y_pred & (freq==1)).sum()
    fn      = ((y_true==1) & ~y_pred).sum()
    cost    = w_fp * fp_freq + w_fn * fn
    return 'balanced_cost', cost, False

def f05_score(preds, data):
    """F‑beta with β=0.5: weigh precision twice recall (good for FP control)."""
    y_true = data.get_label()
    y_pred = preds > 0.50
    tp = ((y_true==1) & y_pred).sum()
    fp = ((y_true==0) & y_pred).sum()
    fn = ((y_true==1) & ~y_pred).sum()
    precision = tp / (tp + fp) if (tp + fp) else 0
    recall    = tp / (tp + fn) if (tp + fn) else 0
    beta2 = 0.25         
    score = (1 + beta2) * precision * recall / (beta2 * precision + recall) if (precision+recall) else 0
    return 'f05_score', -score, True       

def freq_fpr(preds, data):
    """ 
        Frequent customer false positive rate 
        FP_freq / Legit_freq
        
    """
    y_true = data.get_label()
    freq   = freq_map[id(data)]
    y_pred = preds > 0.50
    fp_freq  = ((y_true==0) & y_pred & (freq==1)).sum()
    legit_freq = ((y_true==0) & (freq==1)).sum()
    fpr = fp_freq / legit_freq if legit_freq else 0
    return 'freq_fpr', fpr, False


In [12]:
metrics = [fp_tp_ratio, fp_tp_ratio_freq, balanced_cost, f05_score, freq_fpr]


def run_and_log(feval_fn, legit_freq_w=10.0):
    w_train = make_weights(train_df, legit_freq_w)
    w_valid = make_weights(valid_df, legit_freq_w)
    lgb_train.set_weight(w_train)
    lgb_valid.set_weight(w_valid)

    print(f"\n -> {feval_fn.__name__}")

    params = dict(
        objective="binary",
        learning_rate=0.05,
        num_leaves=64,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        seed=42,
        verbosity=-1,
        scale_pos_weight=50,
        min_child_weight=0.1,
    )

    mdl = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_valid],
        valid_names=["valid"],
        feval=feval_fn,
        num_boost_round=500,
        callbacks=[early_stopping(50), log_evaluation(100)],
    )

    best = mdl.best_score["valid"][feval_fn.__name__]
    print(f"Best {feval_fn.__name__}: {best:.4f}")
    return mdl, best


results = {}
models = {}
for fe in metrics:
    model, score = run_and_log(fe, legit_freq_w=10)  
    results[fe.__name__] = score
    models[fe.__name__] = model


 -> fp_tp_ratio
Training until validation scores don't improve for 50 rounds
[100]	valid's binary_logloss: 0.00759189	valid's fp_tp_ratio: 1.51102
[200]	valid's binary_logloss: 0.00630076	valid's fp_tp_ratio: 1.25887
[300]	valid's binary_logloss: 0.0059466	valid's fp_tp_ratio: 1.16632
Early stopping, best iteration is:
[319]	valid's binary_logloss: 0.00590922	valid's fp_tp_ratio: 1.15546
Best fp_tp_ratio: 1.1555

 -> fp_tp_ratio_freq
Training until validation scores don't improve for 50 rounds
[100]	valid's binary_logloss: 0.00759189	valid's fp_tp_ratio_freq: 1.02605
[200]	valid's binary_logloss: 0.00630076	valid's fp_tp_ratio_freq: 1.01461
[300]	valid's binary_logloss: 0.0059466	valid's fp_tp_ratio_freq: 1.01053
Early stopping, best iteration is:
[319]	valid's binary_logloss: 0.00590922	valid's fp_tp_ratio_freq: 1.0084
Best fp_tp_ratio_freq: 1.0084

 -> balanced_cost
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[28]	valid's binary_lo

In [20]:
import pandas as pd

all_results = {}

for model_name, model in models.items():
    validation_results = kpi_report(model, valid_df, "Validation", 0.9)
    test_results = kpi_report(model, test_df, "Test", 0.9)

    df = pd.DataFrame([validation_results, test_results], index=['Validation', 'Test'])
    all_results[model_name] = df.round(4)

print("\nKPI Reports:")
for model_name, df in all_results.items():
    print(f"\nModel: {model_name}")
    print(df)


KPI Reports:

Model: fp_tp_ratio
            recall     fpr  fpr_freq  fp_tp_ratio_overall  fp_tp_ratio_freq
Validation  0.6062  0.0002    0.0001               1.0560            1.0024
Test        0.5349  0.0002    0.0000               1.2174            1.0000

Model: fp_tp_ratio_freq
            recall     fpr  fpr_freq  fp_tp_ratio_overall  fp_tp_ratio_freq
Validation  0.6062  0.0002    0.0001               1.0560            1.0024
Test        0.5349  0.0002    0.0000               1.2174            1.0000

Model: balanced_cost
            recall     fpr  fpr_freq  fp_tp_ratio_overall  fp_tp_ratio_freq
Validation  0.7847  0.0022    0.0011               1.5846            1.0320
Test        0.7287  0.0028    0.0016               3.0904            1.1543

Model: f05_score
            recall     fpr  fpr_freq  fp_tp_ratio_overall  fp_tp_ratio_freq
Validation  0.8599  0.0096     0.006               3.3208            1.1527
Test        0.7752  0.0106     0.005               8.3600        