# Fraud Detection Case Study
This notebook presents a complete workflow for proactive fraud detection in financial transactions, including data cleaning, model development, performance evaluation, and actionable recommendations.

## 1. Data Acquisition & Dictionary
- Data source: [Link to dataset]
- Data dictionary: [Link to data dictionary]

*Please download the CSV file and place it in the workspace before running the notebook.*

In [1]:
# Imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    average_precision_score,
    precision_recall_curve,
)
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Runtime check: Python and package versions
import sys
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import seaborn as sns
import statsmodels
print('Python:', sys.version)
print('pandas:', pd.__version__)
print('numpy:', np.__version__)
print('scikit-learn:', sklearn.__version__)
print('matplotlib:', matplotlib.__version__)
print('seaborn:', sns.__version__)
print('statsmodels:', statsmodels.__version__)

Python: 3.10.13 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:15:57) [MSC v.1916 64 bit (AMD64)]
pandas: 2.3.1
numpy: 1.26.4
scikit-learn: 1.7.1
matplotlib: 3.10.5
seaborn: 0.13.2
statsmodels: 0.14.5


In [3]:
# Ensure required packages in this kernel
try:
    import pandas as pd, numpy as np, matplotlib, seaborn as sns, sklearn, statsmodels
    print('Packages OK')
except ModuleNotFoundError as e:
    import sys, subprocess
    pkgs = ['pandas','numpy','matplotlib','seaborn','scikit-learn','statsmodels']
    print(f'Installing missing: {e.name} ...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *pkgs])
    print('Packages installed')

Packages OK


In [4]:
# Configuration
DATA_PATH = ''              # Set to CSV path or leave blank for autodetect
USE_SAMPLE = False          # False for full 6.36M rows
SAMPLE_SIZE = 1_000_000     # Ignored when USE_SAMPLE=False
RANDOM_STATE = 42
TEST_SIZE = 0.30
DISPLAY_EPS = 1e-12         # Floor for printing tiny probabilities

## 2. Data Cleaning

We apply scalable, robust cleaning to handle 6.36M rows:

- Missing values: Summarized globally; numeric imputed with median in pipeline; categoricals handled by OneHotEncoder with `handle_unknown='ignore'`.

- Outliers: IQR-based fences computed; extreme tails clipped in VIF step only to stabilize multi-collinearity estimation (modeling uses raw but log-transformed `amount`).

- Multicollinearity: VIF computed on numeric predictors excluding target/flags; used for diagnostics (drop if VIF >> 10). No leakage features used.

- High-cardinality IDs (`nameOrig`, `nameDest`) are excluded; instead, we derive low-cardinality signals like `dest_is_merchant`.


In [5]:
# Data load
from pathlib import Path

def _autodetect_csv():
    cands = []
    if DATA_PATH:
        cands.append(Path(DATA_PATH))
    here = Path.cwd()
    cands += [here / n for n in ['transactions.csv','fraud.csv','fraudDetection.csv','data.csv']]
    dld = Path.home() / 'Downloads'
    cands += [dld / n for n in ['transactions.csv','PS_20174392719_1491204439457_log.csv','fraud.csv','data.csv']]
    for p in cands:
        if p.exists():
            return p
    return None

csv_path = _autodetect_csv()
assert csv_path is not None, 'CSV not found. Set DATA_PATH or place the CSV in this folder or Downloads.'
print(f'Using CSV: {csv_path}')

dtype_map = {
    'step': 'int32','type': 'category','amount': 'float32','nameOrig': 'category',
    'oldbalanceOrg': 'float32','newbalanceOrig': 'float32','nameDest': 'category',
    'oldbalanceDest': 'float32','newbalanceDest': 'float32','isFraud': 'int8','isFlaggedFraud': 'int8',
}

def read_csv_scalable(path: str, use_sample=True, sample_size=1_000_000, random_state=42):
    chunks, total = [], 0
    for ch in pd.read_csv(path, dtype=dtype_map, chunksize=200_000):
        chunks.append(ch)
        total += len(ch)
        if use_sample and total >= sample_size:
            break
    df_ = pd.concat(chunks, ignore_index=True)
    if use_sample and len(df_) > sample_size:
        df_ = df_.sample(sample_size, random_state=random_state)
    df_.columns = [c.strip() for c in df_.columns]
    return df_

df = read_csv_scalable(str(csv_path), use_sample=USE_SAMPLE, sample_size=SAMPLE_SIZE, random_state=RANDOM_STATE)
print(df.shape)
df.head()

Using CSV: c:\Users\Vikra\OneDrive\Desktop\Accredian\fraud.csv
(6362620, 11)
(6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.639648,C1231006815,170136.0,160296.359375,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.280029,C1666544295,21249.0,19384.720703,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.139648,C2048537720,41554.0,29885.859375,M1230701703,0.0,0.0,0,0


In [6]:
# Feature engineering
if 'type' in df.columns:
    df['type'] = df['type'].astype('string').str.replace('-', '_').astype('category')

df['orig_error'] = (df['newbalanceOrig'] + df['amount'] - df['oldbalanceOrg']).astype('float32')
df['dest_error'] = (df['oldbalanceDest'] + df['amount'] - df['newbalanceDest']).astype('float32')

df['is_TRANSFER'] = (df['type'] == 'TRANSFER').astype('int8')
df['is_CASH_OUT'] = (df['type'] == 'CASH_OUT').astype('int8')

df['amt_log'] = np.log1p(df['amount']).astype('float32')
df['is_high_value'] = (df['amount'] >= 200_000).astype('int8')

df['hour'] = (df['step'] % 24).astype('int8')
df['day'] = (df['step'] // 24).astype('int16')
df['is_weekend'] = (df['day'] % 7 >= 5).astype('int8')

df['dest_is_merchant'] = df['nameDest'].astype('string').str.startswith('M').fillna(False).astype('int8')

df['orig_went_zero'] = ((df['newbalanceOrig'] == 0) & (df['oldbalanceOrg'] > 0)).astype('int8')
df['dest_went_zero'] = ((df['newbalanceDest'] == 0) & (df['oldbalanceDest'] > 0)).astype('int8')

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,is_TRANSFER,is_CASH_OUT,amt_log,is_high_value,hour,day,is_weekend,dest_is_merchant,orig_went_zero,dest_went_zero
0,1,PAYMENT,9839.639648,C1231006815,170136.0,160296.359375,M1979787155,0.0,0.0,0,...,0,0,9.194276,0,1,0,0,1,0,0
1,1,PAYMENT,1864.280029,C1666544295,21249.0,19384.720703,M2044282225,0.0,0.0,0,...,0,0,7.531167,0,1,0,0,1,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,...,1,0,5.204007,0,1,0,0,0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,...,0,1,5.204007,0,1,0,0,0,1,1
4,1,PAYMENT,11668.139648,C2048537720,41554.0,29885.859375,M1230701703,0.0,0.0,0,...,0,0,9.364703,0,1,0,0,1,0,0


In [7]:
# Missingness summary
display(df.isna().sum().sort_values(ascending=False).pipe(lambda s: s[s>0]))
print('\nMissing ratio (%)')
display((df.isna().mean()*100).round(3).sort_values(ascending=False))

Series([], dtype: int64)


Missing ratio (%)


step                0.0
dest_error          0.0
orig_went_zero      0.0
dest_is_merchant    0.0
is_weekend          0.0
day                 0.0
hour                0.0
is_high_value       0.0
amt_log             0.0
is_CASH_OUT         0.0
is_TRANSFER         0.0
orig_error          0.0
type                0.0
isFlaggedFraud      0.0
isFraud             0.0
newbalanceDest      0.0
oldbalanceDest      0.0
nameDest            0.0
newbalanceOrig      0.0
oldbalanceOrg       0.0
nameOrig            0.0
amount              0.0
dest_went_zero      0.0
dtype: float64

In [8]:
# Outliers (IQR)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if not numeric_cols:
    print('No numeric columns for outlier analysis')
else:
    desc = df[numeric_cols].describe(percentiles=[0.01,0.05,0.25,0.5,0.75,0.95,0.99]).T
    q = df[numeric_cols].quantile([0.25,0.75])
    iqr = q.loc[0.75] - q.loc[0.25]
    lf, uf = q.loc[0.25] - 1.5*iqr, q.loc[0.75] + 1.5*iqr
    desc['iqr'] = iqr; desc['lower_fence'] = lf; desc['upper_fence'] = uf
    out_low = (df[numeric_cols] < lf).sum(); out_high = (df[numeric_cols] > uf).sum()
    desc['outliers_low'] = out_low; desc['outliers_high'] = out_high
    desc['outliers_total'] = desc['outliers_low'] + desc['outliers_high']
    display(desc.sort_values('outliers_total', ascending=False).head(10))

Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,iqr,lower_fence,upper_fence,outliers_low,outliers_high,outliers_total
orig_went_zero,6362620.0,0.2389866,0.4264646,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0,1520581,1520581
dest_error,6362620.0,55567.21,441528.8,-75885720.0,-353401.6675,-0.125,0.0,3500.48999,29353.02,456741.0,793161.7,13191234.0,29353.02,-44029.53,73382.54,177957,1256756,1434713
oldbalanceOrg,6362620.0,833883.4,2888242.0,0.0,0.0,0.0,0.0,14208.0,107315.2,5823702.0,16027260.0,59585040.0,107315.2,-160972.8,268287.9,0,1112507,1112507
newbalanceOrig,6362620.0,855113.7,2924048.0,0.0,0.0,0.0,0.0,0.0,144258.4,5980262.0,16176160.0,49585040.0,144258.4,-216387.6,360646.0,0,1053391,1053391
oldbalanceDest,6362620.0,1100702.0,3399180.0,0.0,0.0,0.0,0.0,132705.664062,943036.7,5147230.0,12371820.0,356015904.0,943036.7,-1414555.0,2357592.0,0,786135,786135
newbalanceDest,6362620.0,1224997.0,3674129.0,0.0,0.0,0.0,0.0,214661.445312,1111909.0,5515716.0,13137870.0,356179264.0,1111909.0,-1667864.0,2779773.0,0,738527,738527
is_TRANSFER,6362620.0,0.08375622,0.2770219,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0,532909,532909
orig_error,6362620.0,201092.6,606650.4,-4.0,-0.007812,0.0,2954.197449,68677.253906,249641.1,700716.5,1559495.0,92445520.0,246686.9,-367076.2,619671.5,0,408489,408489
amount,6362620.0,179861.9,603858.2,0.0,449.467593,2224.099597,13389.570312,74871.9375,208721.5,518634.2,1615980.0,92445520.0,195331.9,-279608.3,501719.3,0,338078,338078
day,6362620.0,9.503158,5.922111,0.0,0.0,0.0,6.0,9.0,13.0,20.0,28.0,30.0,7.0,-4.5,23.5,0,162303,162303


In [9]:
# VIF (numeric only)
from sklearn.preprocessing import StandardScaler

exclude_cols = {'isFraud','isFlaggedFraud'}
vif_cols = [c for c in numeric_cols if c not in exclude_cols]
X_num = df[vif_cols].copy().fillna(df[vif_cols].median())
for c in X_num.columns:
    q1, q99 = np.percentile(X_num[c], [1,99]); X_num[c] = X_num[c].clip(q1, q99)
X_s = StandardScaler().fit_transform(X_num)
vif_data = pd.DataFrame({'feature': X_num.columns,
                         'VIF': [variance_inflation_factor(X_s, i) for i in range(X_s.shape[1])]})
display(vif_data.sort_values('VIF', ascending=False).head(10))

Unnamed: 0,feature,VIF
0,step,31441.859182
13,day,31354.294617
3,newbalanceOrig,1357.102289
2,oldbalanceOrg,1315.951968
5,newbalanceDest,72.609239
4,oldbalanceDest,67.041044
6,orig_error,46.283285
1,amount,40.465224
12,hour,29.20788
9,is_CASH_OUT,4.484572


## 3. Model Development

We build an interpretable baseline Logistic Regression with class weighting and a full preprocessing pipeline (median imputation, scaling for numeric, one-hot for categorical). This offers:

- Speed and scalability on millions of rows

- Probabilistic outputs to tune thresholds by business cost (recall vs precision)

- Coefficients for explainability. We later recommend evaluating tree-based models (XGBoost/LightGBM) for potential lift.


In [10]:
# Split
TARGET = 'isFraud'
drop_cols = ['nameOrig','nameDest']
features = [c for c in df.columns if c not in drop_cols + [TARGET]]
X, y = df[features], df[TARGET].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
y_train.value_counts(normalize=True).head()

isFraud
0    0.998709
1    0.001291
Name: proportion, dtype: float64

In [11]:
# Pipeline + tuning (Logistic Regression)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

cat_cols = X_train.select_dtypes(include=['category','object','string']).columns.tolist()
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

def _ohe_dense():
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocess = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler(with_mean=True))
        ]), num_cols),
        ('cat', _ohe_dense(), cat_cols)
    ], remainder='drop')

pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('model', LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs'))
])

param_grid = {
    'model__C': [0.1, 0.5, 1.0, 2.0],
    'model__penalty': ['l2'],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
search = GridSearchCV(pipe, param_grid=param_grid, scoring='average_precision', cv=cv, n_jobs=-1, verbose=0)
search.fit(X_train, y_train)
clf = search.best_estimator_
print('Best params:', search.best_params_)


KeyboardInterrupt: 

## 4. Variable Selection

Selection combines diagnostics and domain knowledge:

- Drop identifiers/leakage (`nameOrig`, `nameDest`); prefer derived flags (`dest_is_merchant`).

- Keep fraud-relevant signals: `type`, `amt_log`, `is_TRANSFER`, `is_CASH_OUT`, `orig_error`, `dest_error`, time-of-day/day-of-week.

- Use VIF to spot redundant numeric features; if VIF >> 10 and no clear added value, drop the higher-variance one.

- Validate choices via cross-validated PR AUC; retain features that improve recall at acceptable precision.


In [None]:
# Coefficients (importance)
model = clf.named_steps['model']
prep = clf.named_steps['prep']
ohe = prep.named_transformers_['cat']
cat_names = ohe.get_feature_names_out(cat_cols) if len(cat_cols) else np.array([])
num_names = np.array(num_cols)
feat_names = np.concatenate([num_names, cat_names])
coefs = pd.Series(model.coef_.ravel(), index=feat_names).sort_values(key=np.abs, ascending=False)
print(coefs.head(20))
coefs.head(20).plot(kind='bar', figsize=(10,4))
plt.title('Top coefficients'); plt.tight_layout(); plt.show()

## 5. Model Performance

We report:

- ROC AUC (ranking quality) and PR AUC (more informative for class imbalance)

- Threshold tuned by F2 (recall emphasis) with classification report and confusion matrix

- ROC and PR curves for visual assessment


In [None]:
# Evaluation
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, roc_curve,
                             average_precision_score, precision_recall_curve)

proba = clf.predict_proba(X_test)[:,1]
proba_print = np.maximum(proba, DISPLAY_EPS)  # avoid printing 0.0
roc_auc = roc_auc_score(y_test, proba)
pr_auc = average_precision_score(y_test, proba)
print(f'ROC AUC: {roc_auc:.4f} | PR AUC: {pr_auc:.4f}')

precision, recall, thresholds = precision_recall_curve(y_test, proba)
f2 = (5*precision*recall)/(4*precision+recall+1e-12)
best_idx = f2.argmax(); best_thr = thresholds[max(0, best_idx-1)] if len(thresholds) else 0.5
print(f'Best threshold by F2: {best_thr:.6f}')

y_pred_thr = (proba >= best_thr).astype(int)
print(classification_report(y_test, y_pred_thr, digits=4))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred_thr))

# Curves
fpr, tpr, _ = roc_curve(y_test, proba)
plt.figure(figsize=(5,4)); plt.plot(fpr, tpr, label=f'ROC AUC={roc_auc:.3f}'); plt.plot([0,1],[0,1],'--', color='gray')
plt.xlabel('FPR'); plt.ylabel('TPR'); plt.legend(); plt.title('ROC'); plt.tight_layout(); plt.show()
plt.figure(figsize=(5,4)); plt.plot(recall, precision, label=f'PR AUC={pr_auc:.3f}')
plt.xlabel('Recall'); plt.ylabel('Precision'); plt.legend(); plt.title('PR'); plt.tight_layout(); plt.show()

# Probability summary (sanity check)
print('Sample probabilities (floored for display):')
print(pd.Series(proba_print[:10]).round(6).to_string(index=False))

## 6. Key Factors for Fraud Prediction

Based on coefficients and domain logic, typical strong predictors include:

- `is_TRANSFER` and `is_CASH_OUT`: common fraud path is TRANSFER → CASH_OUT

- `amt_log` and `is_high_value`: larger transactions are riskier and often flagged

- `orig_error`/`dest_error`: accounting inconsistencies indicate abnormal balance transitions

- `orig_went_zero`: emptying an account in a single step

- Time features (`hour`, `is_weekend`): off-hours may correlate with higher risk

- `dest_is_merchant`: merchant destinations have missing balances by design; the contrast can be predictive


## 7. Interpretation of Factors

These factors align with the data dictionary and known fraud patterns:

- Fraud agents transfer funds and then cash out; hence high odds for `TRANSFER`/`CASH_OUT`.

- Business rule flags >200k explain why `is_high_value` relates to risk; however, the model may capture risk at lower amounts too.

- Balance errors (`orig_error`, `dest_error`) arise from mismatches in recorded balances; repeated inconsistencies suggest manipulation or data artifacts.

- Time windows reflect operational coverage; higher fraud during low-staff hours is common.

If any factor appears counterintuitive, validate via partial dependence/ICE plots or SHAP values in a tree-based model.


## 8. Prevention Recommendations

- Real-time scoring with low-latency feature store; auto-block above a dynamic threshold tuned for acceptable false positives.

- Step-up authentication for risky patterns (e.g., TRANSFER followed by CASH_OUT within N hours, high-value, off-hours).

- Velocity rules: per-origin daily amount/transaction count caps; sudden spikes trigger holds.

- Beneficiary reputation: risk scores for destinations; new or rarely used recipients require additional verification.

- Device/IP/User behavior analytics; geolocation anomalies; impossible travel checks.

- Human-in-the-loop queue with SLA; continuous feedback loop to retrain models weekly.


## 9. Evaluation of Actions

- Define KPIs: fraud loss per 1k transactions, detection rate (recall), false positive rate, alert volume, customer friction.

- A/B or phased rollout: holdout control group without new controls; compare KPIs over same period.

- Pre/post analysis with seasonality controls; use difference-in-differences if groups available.

- Monitor for drift: population stability index (PSI), feature/score drift; retrain triggers.

- Cost-benefit analysis combining model threshold with operational costs (manual review, customer churn risk).


## Data Dictionary Summary

- **step**: Unit of time (1 step = 1 hour, total 744 steps for 30 days)

- **type**: Transaction type (CASH-IN, CASH-OUT, DEBIT, PAYMENT, TRANSFER)

- **amount**: Transaction amount in local currency

- **nameOrig**: Customer initiating the transaction

- **oldbalanceOrg**: Initial balance of the originator before the transaction

- **newbalanceOrig**: New balance of the originator after the transaction

- **nameDest**: Recipient customer

- **oldbalanceDest**: Initial balance of the recipient before the transaction (missing for merchants)

- **newbalanceDest**: New balance of the recipient after the transaction (missing for merchants)

- **isFraud**: Indicates if the transaction is fraudulent (target variable)

- **isFlaggedFraud**: Flags illegal attempts (transfers > 200,000)


In [None]:
# 10. Persist model and quick inference demo
import joblib
from datetime import datetime

# Save trained pipeline
model_path = f"fraud_lr_pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)
print(f"Saved pipeline to: {model_path}")

# Reload and run a tiny inference demo on a few validation rows
pipe = joblib.load(model_path)
sample = X_test.head(5).copy()
proba_demo = pipe.predict_proba(sample)[:,1]
print("Sample probabilities:")
print(pd.Series(proba_demo, index=sample.index).round(6))

In [None]:
# 11. XGBoost baseline + SHAP
try:
    import xgboost as xgb
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xgboost'])
    import xgboost as xgb

from sklearn.metrics import average_precision_score, roc_auc_score

xgb_model = xgb.XGBClassifier(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    n_jobs=-1,
    tree_method='hist',
    random_state=RANDOM_STATE,
    scale_pos_weight=(y_train.value_counts()[0] / max(1, y_train.value_counts()[1]))
)

from sklearn.pipeline import Pipeline as SkPipe
xgb_clf = SkPipe(steps=[('prep', preprocess), ('model', xgb_model)])

xgb_clf.fit(X_train, y_train)
proba_xgb = xgb_clf.predict_proba(X_test)[:,1]
print('[XGB] ROC AUC:', roc_auc_score(y_test, proba_xgb))
print('[XGB] PR  AUC:', average_precision_score(y_test, proba_xgb))

# SHAP (robust)
try:
    import shap
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'shap', '-q'])
    import shap

# Prepare background and explanation samples
preproc = xgb_clf.named_steps.get('prep', preprocess)
# Use small samples for SHAP speed; this does not affect evaluation metrics
X_bg = X_train.sample(min(2000, len(X_train)), random_state=RANDOM_STATE)
X_sh = X_test.sample(min(3000, len(X_test)), random_state=RANDOM_STATE)

# Transform to model input space
X_bg_trans = preproc.transform(X_bg)
X_sh_trans = preproc.transform(X_sh)

# Densify if sparse
try:
    import scipy.sparse as sp
    if sp.issparse(X_bg_trans):
        X_bg_trans = X_bg_trans.toarray()
    if sp.issparse(X_sh_trans):
        X_sh_trans = X_sh_trans.toarray()
except Exception:
    pass

# Feature names, if available
feat_names = None
try:
    feat_names = preproc.get_feature_names_out()
except Exception:
    pass

print(f"SHAP background: {X_bg_trans.shape}, explain: {X_sh_trans.shape}")

# Build explainer on the raw XGBoost model, compute SHAP values with additivity disabled for proba
try:
    explainer = shap.TreeExplainer(xgb_clf.named_steps['model'])
    shap_values = explainer.shap_values(X_sh_trans, check_additivity=False)
except Exception:
    # Fallback: model-agnostic on predict_proba
    explainer = shap.Explainer(xgb_clf.named_steps['model'].predict_proba, X_bg_trans)
    shap_values = explainer(X_sh_trans)

# Beeswarm summary
try:
    values = shap_values if isinstance(shap_values, (list, tuple)) else getattr(shap_values, 'values', shap_values)
    shap.summary_plot(values, X_sh_trans, feature_names=feat_names, max_display=20, show=True)
except Exception as e:
    print('SHAP summary plot failed:', e)


In [None]:
# 12. Calibrated probabilities (Logistic)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

# Use the tuned pipeline's preprocessing and create a fresh base model
base_lr = LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs', C=clf.named_steps['model'].C)
calibrated = Pipeline(steps=[('prep', preprocess), ('model', base_lr)])
calibrated = CalibratedClassifierCV(calibrated, method='isotonic', cv=3, n_jobs=-1)

calibrated.fit(X_train, y_train)
proba_cal = calibrated.predict_proba(X_test)[:,1]
print('[Calibrated LR] ROC AUC:', roc_auc_score(y_test, proba_cal))
print('[Calibrated LR] PR  AUC:', average_precision_score(y_test, proba_cal))
print('[Calibrated LR] Brier:', brier_score_loss(y_test, proba_cal))

In [None]:
# 13. Stratified CV report (PR AUC)
from sklearn.model_selection import cross_val_score

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

lr_scores = cross_val_score(clf, X, y, scoring='average_precision', cv=cv, n_jobs=-1)
print('[LR tuned] PR AUC CV:', lr_scores.round(4), 'mean=', lr_scores.mean().round(4))

xgb_scores = cross_val_score(xgb_clf, X, y, scoring='average_precision', cv=cv, n_jobs=-1)
print('[XGB] PR AUC CV:', xgb_scores.round(4), 'mean=', xgb_scores.mean().round(4))

## Model card (concise)
- Data: 6.36M transactions (CSV); target: isFraud. Train/validation split: stratified, 30% validation.
- Features: domain features (amt_log, is_TRANSFER, is_CASH_OUT, errors, time), categorical OHE, numeric scaled.
- Models:
  - Logistic Regression (balanced class weights) with tuned C; calibrated probabilities (isotonic).
  - XGBoost (hist, imbalance via scale_pos_weight) with SHAP for explainability.
- Metrics (validation): report ROC AUC, PR AUC; threshold by F2; confusion matrix.
- CV: 3-fold stratified PR AUC for LR (tuned) and XGB.
- Risks: class imbalance, potential sampling bias if using sample mode; data drift; threshold needs re-tuning post-deployment.
- Ops: export pipeline with joblib; monitor PR AUC, drift (PSI), and alert volumes; retrain cadence weekly.


In [None]:
# 14. XGBoost SHAP explanations (auto-install + beeswarm)
try:
    import shap  # type: ignore
except Exception:
    import sys, subprocess
    print("Installing shap...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "shap", "-q"])  # quiet install
    import shap  # type: ignore

# Validate fitted XGBoost artifacts
if 'xgb_model' not in globals():
    raise RuntimeError("XGBoost model not found. Please run the XGBoost training cell first.")

# Locate a fitted preprocessor
from sklearn.compose import ColumnTransformer
preproc = None
if 'xgb_clf' in globals():
    # Try common names
    for key in ("preprocess", "prep", "preproc"):
        if hasattr(xgb_clf, "named_steps") and key in xgb_clf.named_steps:
            preproc = xgb_clf.named_steps[key]
            break
    # Try to find CT in steps
    if preproc is None and hasattr(xgb_clf, "steps"):
        for name, step in xgb_clf.steps:
            if isinstance(step, ColumnTransformer):
                preproc = step
                break
# Fallback to global variable
if preproc is None and 'preprocess' in globals():
    preproc = preprocess

if preproc is None:
    raise RuntimeError("Could not find a ColumnTransformer preprocessor associated with XGBoost.")

# Sample for SHAP to keep it fast
n_bg = 3000
n_sample = 5000
X_bg = X_train.sample(n=min(n_bg, len(X_train)), random_state=RANDOM_STATE)
X_sh = X_test.sample(n=min(n_sample, len(X_test)), random_state=RANDOM_STATE)

# Transform to model space
X_bg_t = preproc.transform(X_bg)
X_sh_t = preproc.transform(X_sh)

# Densify if needed
try:
    import scipy.sparse as sp
    if sp.issparse(X_bg_t):
        X_bg_t = X_bg_t.toarray()
    if sp.issparse(X_sh_t):
        X_sh_t = X_sh_t.toarray()
except Exception:
    pass

# Get feature names if available
feat_names_xgb = None
try:
    feat_names_xgb = preproc.get_feature_names_out()
except Exception:
    pass

print(f"SHAP background: {X_bg_t.shape}, explain: {X_sh_t.shape}")

# Build explainer and plot
try:
    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(X_sh_t, check_additivity=False)
except Exception:
    # Fallback to model-agnostic
    explainer = shap.Explainer(xgb_model.predict_proba, X_bg_t)
    shap_values = explainer(X_sh_t)

# Summary beeswarm
try:
    shap.summary_plot(
        shap_values if isinstance(shap_values, (list, tuple)) else getattr(shap_values, 'values', shap_values),
        X_sh_t,
        feature_names=feat_names_xgb,
        show=True,
        max_display=25
    )
except Exception as e:
    print("SHAP summary plot failed:", e)

print("Top SHAP features computed and plotted (see chart above).")


In [None]:
# 15. XGBoost evaluation on full test set + operating threshold (F2 and optional cost-based)
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_recall_curve,
    classification_report, confusion_matrix
)
from sklearn.model_selection import StratifiedKFold
import numpy as np

assert 'xgb_clf' in globals(), "xgb_clf pipeline not found. Train XGBoost cell first."

# Helper: choose threshold by F2 on a validation split from training (no test leakage)
def pick_threshold_f2(estimator, X_tr, y_tr, random_state=RANDOM_STATE):
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    # Take the first fold as validation for speed
    train_idx, val_idx = next(skf.split(X_tr, y_tr))
    X_tr_sub, y_tr_sub = X_tr.iloc[train_idx], y_tr.iloc[train_idx]
    X_val_sub, y_val_sub = X_tr.iloc[val_idx], y_tr.iloc[val_idx]
    est = estimator
    est.fit(X_tr_sub, y_tr_sub)
    p = est.predict_proba(X_val_sub)[:, 1]
    prec, rec, thr = precision_recall_curve(y_val_sub, p)
    # Exclude last threshold which is nan in alignment with prec/rec arrays
    beta = 2.0
    f2 = (1 + beta**2) * (prec * rec) / (beta**2 * prec + rec + 1e-12)
    best_i = int(np.nanargmax(f2))
    best_thr = thr[max(0, min(best_i, len(thr) - 1))]
    return float(best_thr)

# Optional: choose threshold by costs on the same validation split
def pick_threshold_cost(estimator, X_tr, y_tr, cost_fp=1.0, cost_fn=10.0, random_state=RANDOM_STATE):
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
    train_idx, val_idx = next(skf.split(X_tr, y_tr))
    X_tr_sub, y_tr_sub = X_tr.iloc[train_idx], y_tr.iloc[train_idx]
    X_val_sub, y_val_sub = X_tr.iloc[val_idx], y_tr.iloc[val_idx]
    est = estimator
    est.fit(X_tr_sub, y_tr_sub)
    p = est.predict_proba(X_val_sub)[:, 1]
    prec, rec, thr = precision_recall_curve(y_val_sub, p)
    # Convert thresholds to predictions and compute expected cost
    # Note: prec/rec arrays are one element longer than thr; iterate over thr
    best_thr, best_cost = 0.5, float('inf')
    for t in thr:
        y_hat = (p >= t).astype(int)
        # Costs: FP incur cost_fp, FN incur cost_fn
        fp = np.sum((y_hat == 1) & (y_val_sub.values == 0))
        fn = np.sum((y_hat == 0) & (y_val_sub.values == 1))
        cost = cost_fp * fp + cost_fn * fn
        if cost < best_cost:
            best_cost, best_thr = cost, float(t)
    return float(best_thr)

# 1) Evaluate XGB on full test set (no sampling)
proba_xgb_full = xgb_clf.predict_proba(X_test)[:, 1]
roc_xgb = roc_auc_score(y_test, proba_xgb_full)
pr_xgb = average_precision_score(y_test, proba_xgb_full)
print(f"[XGB] ROC AUC (full test): {roc_xgb:.6f}")
print(f"[XGB] PR  AUC (full test): {pr_xgb:.6f}")
print(f"Samples evaluated (X_test): {len(X_test)}; No sampling applied.")

# 2) Pick operating thresholds
# F2-based threshold
thr_f2 = pick_threshold_f2(xgb_clf, X_train, y_train)
# Cost-based threshold (customize costs as needed)
cost_fp, cost_fn = 1.0, 10.0
thr_cost = pick_threshold_cost(xgb_clf, X_train, y_train, cost_fp=cost_fp, cost_fn=cost_fn)

# 3) Report metrics at thresholds on the full test set
for label, thr in [("F2-opt", thr_f2), ("Cost-opt", thr_cost)]:
    y_hat = (proba_xgb_full >= thr).astype(int)
    print(f"\n[XGB @ {label} threshold={thr:.4f}]")
    print(classification_report(y_test, y_hat, digits=4))
    cm = confusion_matrix(y_test, y_hat)
    print("Confusion matrix:\n", cm)

# Persist chosen thresholds for reuse
xgb_threshold_f2 = float(thr_f2)
xgb_threshold_cost = float(thr_cost)
print(f"Saved thresholds → F2: {xgb_threshold_f2:.4f} | Cost({cost_fp:.1f},{cost_fn:.1f}): {xgb_threshold_cost:.4f}")


In [None]:
# 16. Persist XGBoost pipeline + quick inference demo
import joblib
from pathlib import Path

assert 'xgb_clf' in globals(), "xgb_clf pipeline not found."
model_dir = Path("models")
model_dir.mkdir(exist_ok=True)
model_path_xgb = str(model_dir / "xgb_pipeline.joblib")

joblib.dump(xgb_clf, model_path_xgb)
print(f"Saved XGBoost pipeline → {model_path_xgb}")

# Quick inference demo
xgb_loaded = joblib.load(model_path_xgb)
proba_demo_xgb = xgb_loaded.predict_proba(X_test.iloc[:5])[:, 1]
# Use F2 threshold by default if defined
thr_use = xgb_threshold_f2 if 'xgb_threshold_f2' in globals() else 0.5
pred_demo_xgb = (proba_demo_xgb >= thr_use).astype(int)
print("Demo probabilities:", np.round(proba_demo_xgb, 6))
print(f"Demo predictions @ thr={thr_use:.3f}:", pred_demo_xgb)


In [None]:
# 17. Isotonic-calibrated XGBoost: fit, evaluate, and persist
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

assert 'xgb_model' in globals(), "xgb_model not found; train XGBoost first."

# Build a calibrated pipeline by wrapping the trained xgb_model; reuse the same preprocessor as xgb_clf
from sklearn.pipeline import Pipeline as SkPipe
calibrated_xgb = SkPipe(steps=[
    ('preprocess', xgb_clf.named_steps.get('preprocess', preprocess)),
    ('model', CalibratedClassifierCV(estimator=xgb_model, method='isotonic', cv=3))
])

# Fit on training data
calibrated_xgb.fit(X_train, y_train)

# Evaluate on full test set (no sampling)
proba_xgb_cal = calibrated_xgb.predict_proba(X_test)[:, 1]
roc_cal = roc_auc_score(y_test, proba_xgb_cal)
pr_cal = average_precision_score(y_test, proba_xgb_cal)
brier_cal = brier_score_loss(y_test, proba_xgb_cal)
print(f"[XGB-Calibrated] ROC AUC (full test): {roc_cal:.6f}")
print(f"[XGB-Calibrated] PR  AUC (full test): {pr_cal:.6f}")
print(f"[XGB-Calibrated] Brier: {brier_cal:.6f}")
print(f"Samples evaluated (X_test): {len(X_test)}; No sampling applied.")

# Persist
model_path_xgb_cal = str(model_dir / "xgb_pipeline_calibrated.joblib")
joblib.dump(calibrated_xgb, model_path_xgb_cal)
print(f"Saved calibrated XGBoost pipeline → {model_path_xgb_cal}")


In [None]:
# 18. Evaluation coverage audit (no assumed/sampled data)
import numpy as np

# Asserts for full coverage on test set
n_test = len(X_test)
assert proba_xgb_full.shape[0] == n_test, "Mismatch in XGB proba length vs X_test rows"
assert not np.isnan(proba_xgb_full).any(), "NaNs found in XGB probabilities"

# If calibrated model run, check those too
if 'proba_xgb_cal' in globals():
    assert proba_xgb_cal.shape[0] == n_test, "Mismatch in calibrated XGB proba length vs X_test rows"
    assert not np.isnan(proba_xgb_cal).any(), "NaNs found in calibrated XGB probabilities"

print(f"All checks passed. Test rows: {n_test}. All probabilities computed with no sampling or NaNs.")
