# 🏦 Bank Fraud Detection - ML Training Pipeline

## Overview
This notebook implements a **professional fraud detection ML pipeline** with:
- **Dataset**: 50,000 realistic banking transactions
- **Features**: Real, interpretable features (Amount, Location, Channel, Age, Occupation, etc.)
- **Models**: Random Forest, XGBoost, LightGBM, CatBoost
- **Optimization**: Optuna hyperparameter tuning

---

## 1. Setup & Imports

In [None]:
# Core imports
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    precision_recall_curve, roc_curve, f1_score, precision_score,
    recall_score, average_precision_score, accuracy_score
)
from sklearn.ensemble import RandomForestClassifier

# Boosting models
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Hyperparameter optimization
import optuna
from optuna.samplers import TPESampler

# Imbalanced data handling
from imblearn.over_sampling import SMOTE

# Model persistence
import joblib
import json
import os
from datetime import datetime

# Plot appearance: matplotlib style sheet first, then the seaborn palette
# (the palette call comes second, as in the original cell)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Startup banner with a wall-clock timestamp for this run
print('‚úÖ All imports successful!')
_run_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f'üìÖ Notebook run: {_run_stamp}')

## 2. Load Dataset

In [None]:
# Load the synthetic dataset with real features
DATA_PATH = 'data/fraud_dataset.csv'

if not os.path.exists(DATA_PATH):
    print('‚ö†Ô∏è Dataset not found. Generating...')
    # NOTE(review): this executes the local generator script inside the
    # notebook's own namespace. Only run it with a trusted generate_dataset.py.
    # The file is opened in a `with` block so the handle is always closed,
    # and decoded as UTF-8 (Python's default source encoding) rather than the
    # platform locale. compile() attaches the filename to any traceback.
    with open('generate_dataset.py', encoding='utf-8') as generator_file:
        generator_code = compile(generator_file.read(), 'generate_dataset.py', 'exec')
    exec(generator_code)

    # Fail with an actionable message if the generator did not write the CSV.
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(
            f'generate_dataset.py ran but did not create {DATA_PATH}'
        )

df = pd.read_csv(DATA_PATH)
print('‚úÖ Dataset loaded!')
print(f'üìä Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns')
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Dataset overview: row/column counts and the fraud vs. normal class balance
print('=' * 60)
print('üìã DATASET OVERVIEW')
print('=' * 60)
print(f'\nüî¢ Total Transactions: {len(df):,}')
print(f'üìÅ Features: {df.shape[1]}')

# Class distribution
# Plain Python ints keep the arithmetic below predictable (no numpy inf/nan).
fraud_count = int(df['is_fraud'].sum())
normal_count = len(df) - fraud_count
# Guard the empty-frame case so the percentage is 0 instead of a division error.
fraud_pct = (fraud_count / len(df)) * 100 if len(df) else 0.0

print(f'\nüìä Class Distribution:')
print(f'   ‚úÖ Normal: {normal_count:,} ({100-fraud_pct:.2f}%)')
print(f'   üö® Fraud:  {fraud_count:,} ({fraud_pct:.2f}%)')

# The ratio is undefined when there are no fraud rows; report that instead of
# crashing on a division by zero.
if fraud_count:
    imbalance_ratio = f'1:{int(normal_count/fraud_count)}'
else:
    imbalance_ratio = 'n/a (no fraud rows)'
print(f'\n‚öñÔ∏è Imbalance Ratio: {imbalance_ratio}')

In [None]:
# Column dtypes of the loaded frame, printed under a short heading
print('üìã Feature Types:', df.dtypes, sep='\n')

In [None]:
# Class balance shown two ways: a pie chart (shares) and a bar chart (counts)
class_labels = ['Normal', 'Fraud']
class_counts = [normal_count, fraud_count]
colors = ['#2ecc71', '#e74c3c']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, bar_ax = axes

# Left panel: percentage share, with the fraud slice pulled out
pie_ax.pie(class_counts, labels=class_labels,
           autopct='%1.2f%%', colors=colors, explode=(0, 0.1), shadow=True)
pie_ax.set_title('Transaction Class Distribution', fontsize=14, fontweight='bold')

# Right panel: absolute counts, each bar annotated with its value
bars = bar_ax.bar(class_labels, class_counts, color=colors, edgecolor='black')
bar_ax.set_ylabel('Count', fontsize=12)
bar_ax.set_title('Transaction Counts by Class', fontsize=14, fontweight='bold')
for rect, n_rows in zip(bars, class_counts):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 100
    bar_ax.text(label_x, label_y, f'{n_rows:,}',
                ha='center', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Three side-by-side views split by fraud status:
# transaction amount (boxplot), hour of day (histogram), login attempts (bars)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
amount_ax, hour_ax, login_ax = axes

# (is_fraud value, legend label, colour, bar position) for each class
class_styles = (
    (0, 'Normal', 'green', 1),
    (1, 'Fraud', 'red', 0),
)

# Amount by fraud status
df.boxplot(column='TransactionAmount', by='is_fraud', ax=amount_ax)
amount_ax.set_xlabel('Fraud (0=No, 1=Yes)')
amount_ax.set_ylabel('Amount ($)')
amount_ax.set_title('Transaction Amount by Fraud Status', fontweight='bold')
plt.suptitle('')  # clear the figure-level title after the grouped boxplot

# Hour distribution: one overlaid histogram per class
for flag, legend_label, shade, _ in class_styles:
    hours = df.loc[df['is_fraud'] == flag, 'hour']
    hours.hist(ax=hour_ax, bins=24, alpha=0.7, label=legend_label, color=shade)
hour_ax.set_xlabel('Hour of Day')
hour_ax.set_ylabel('Frequency')
hour_ax.set_title('Transaction Hour Distribution', fontweight='bold')
hour_ax.legend()

# Login attempts: per-class value counts drawn as side-by-side bars
for flag, legend_label, shade, slot in class_styles:
    attempt_counts = df.loc[df['is_fraud'] == flag, 'LoginAttempts'].value_counts().sort_index()
    attempt_counts.plot(kind='bar', ax=login_ax, alpha=0.7, label=legend_label,
                        color=shade, position=slot, width=0.4)
login_ax.set_xlabel('Login Attempts')
login_ax.set_ylabel('Frequency')
login_ax.set_title('Login Attempts by Fraud Status', fontweight='bold')
login_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Categorical features analysis: row-normalised fraud/normal proportions
# for four categorical columns, one panel each on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (column, panel title, rotate x tick labels?) in row-major panel order
panel_specs = [
    ('TransactionType', 'Fraud Rate by Transaction Type', False),
    ('Channel', 'Fraud Rate by Channel', False),
    ('CustomerOccupation', 'Fraud Rate by Occupation', True),
    ('MerchantCategory', 'Fraud Rate by Merchant Category', True),
]

for panel, (column, title, rotate_ticks) in zip(axes.flat, panel_specs):
    proportions = pd.crosstab(df[column], df['is_fraud'], normalize='index')
    proportions.plot(kind='bar', ax=panel, color=['green', 'red'])
    panel.set_title(title, fontweight='bold')
    panel.set_ylabel('Proportion')
    panel.legend(['Normal', 'Fraud'])
    if rotate_ticks:
        # bottom-row panels get slanted tick labels
        panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# Build the modelling frame: remove identifier/date columns, then replace each
# categorical column with integer codes from its own LabelEncoder
drop_cols = ['TransactionID', 'AccountID', 'TransactionDate']
categorical_cols = ['TransactionType', 'Location', 'Channel', 'MerchantCategory', 'CustomerOccupation']

df_model = df.drop(columns=drop_cols)

# One encoder per column, kept in a dict so they can be persisted later
label_encoders = {column: LabelEncoder() for column in categorical_cols}
for column, encoder in label_encoders.items():
    df_model[column] = encoder.fit_transform(df_model[column])

print('‚úÖ Categorical variables encoded!')
print(f'üìä Features shape: {df_model.shape}')
df_model.head()

In [None]:
# Separate the target column from the predictors and record the column order
y = df_model['is_fraud']
X = df_model.drop(columns=['is_fraud'])

feature_names = X.columns.tolist()
print(f'üìã Features: {feature_names}')

In [None]:
# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=feature_names)

print('‚úÖ Features scaled!')

In [None]:
# Train/Validation/Test Split (70/15/15)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, test_size=0.30, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print('üìä Data Split Summary')
print('=' * 50)
print(f'Training:   {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)')
print(f'Validation: {len(X_val):,} samples ({len(X_val)/len(X)*100:.1f}%)')
print(f'Test:       {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)')
print(f'\nüéØ Fraud ratios preserved:')
print(f'   Train: {y_train.mean()*100:.2f}%')
print(f'   Val:   {y_val.mean()*100:.2f}%')
print(f'   Test:  {y_test.mean()*100:.2f}%')

In [None]:
# Oversample the fraud class in the training split only (validation and test
# are left untouched). sampling_strategy=0.5 is passed straight to SMOTE.
print('‚öñÔ∏è Applying SMOTE...')
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling
normal_before = int((y_train == 0).sum())
fraud_before = int((y_train == 1).sum())
normal_after = int((y_train_resampled == 0).sum())
fraud_after = int((y_train_resampled == 1).sum())

print(f'\nüìä Before SMOTE: Normal={normal_before:,}, Fraud={fraud_before:,}')
print(f'üìä After SMOTE:  Normal={normal_after:,}, Fraud={fraud_after:,}')

## 5. Model Training with Optuna

In [None]:
# Per-model metric dictionaries, keyed by display name
results = {}

def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on the given data and print a short report.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: feature matrix to score.
        y_test: true labels for ``X_test``.
        model_name: display name used in the printed header.

    Returns:
        A tuple ``(metrics, y_pred, y_pred_proba)``: the metric dict, the
        predicted labels, and the second column of ``predict_proba``.
    """
    hard_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from predicted labels, then those computed from scores
    label_based = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    score_based = (
        ('auc_roc', roc_auc_score),
        ('avg_precision', average_precision_score),
    )

    metrics = {}
    for key, scorer in label_based:
        metrics[key] = scorer(y_test, hard_labels)
    for key, scorer in score_based:
        metrics[key] = scorer(y_test, positive_scores)

    print(f'\nüéØ {model_name} Performance:')
    print('=' * 50)
    for key in metrics:
        print(f"   {key}: {metrics[key]:.4f}")

    return metrics, hard_labels, positive_scores

In [None]:
# Optuna config
N_TRIALS = 20  # Trials per study; raise for a more thorough search
sampler = TPESampler(seed=42)  # seeded sampler instance reused by every study below
optuna.logging.set_verbosity(optuna.logging.WARNING)  # hide per-trial INFO logs

In [None]:
# Random Forest: tune on validation AUC, then refit with the best trial
print('üå≤ Training Random Forest...')

def objective_rf(trial):
    """Fit one Random Forest candidate and return its validation ROC-AUC."""
    candidate = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        max_depth=trial.suggest_int('max_depth', 5, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )
    candidate.fit(X_train_resampled, y_train_resampled)
    val_scores = candidate.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

study_rf = optuna.create_study(direction='maximize', sampler=sampler)
study_rf.optimize(objective_rf, n_trials=N_TRIALS, show_progress_bar=True)

# Train best RF: tuned values plus the same fixed settings used in the search
best_rf_params = dict(
    study_rf.best_trial.params,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**best_rf_params)
rf_model.fit(X_train_resampled, y_train_resampled)
results['Random Forest'], rf_pred, rf_proba = evaluate_model(rf_model, X_test, y_test, 'Random Forest')

In [None]:
# XGBoost: tune on validation AUC, then refit with the best trial
print('üöÄ Training XGBoost...')

# scale_pos_weight must describe the data the model is actually fitted on.
# Training uses the SMOTE-resampled labels, so the negative/positive ratio is
# taken from y_train_resampled. Using the pre-SMOTE y_train ratio here would
# re-weight the fraud class a second time, on top of the oversampling.
# Computed once (vectorised) instead of once per trial.
xgb_scale_pos_weight = float(
    (y_train_resampled == 0).sum() / (y_train_resampled == 1).sum()
)

def objective_xgb(trial):
    """Fit one XGBoost candidate and return its validation ROC-AUC."""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': xgb_scale_pos_weight,
        'random_state': 42,
        'eval_metric': 'auc'
    }
    model = xgb.XGBClassifier(**params)
    model.fit(X_train_resampled, y_train_resampled, verbose=False)
    return roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])

study_xgb = optuna.create_study(direction='maximize', sampler=sampler)
study_xgb.optimize(objective_xgb, n_trials=N_TRIALS, show_progress_bar=True)

# Train best XGB: tuned values plus the same fixed settings used in the search
best_xgb_params = study_xgb.best_trial.params
best_xgb_params.update({'scale_pos_weight': xgb_scale_pos_weight, 'random_state': 42, 'eval_metric': 'auc'})
xgb_model = xgb.XGBClassifier(**best_xgb_params)
xgb_model.fit(X_train_resampled, y_train_resampled, verbose=False)
results['XGBoost'], xgb_pred, xgb_proba = evaluate_model(xgb_model, X_test, y_test, 'XGBoost')

In [None]:
# LightGBM: tune on validation AUC, then refit with the best trial
print('‚ö° Training LightGBM...')

def objective_lgb(trial):
    """Fit one LightGBM candidate and return its validation ROC-AUC."""
    candidate = lgb.LGBMClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        num_leaves=trial.suggest_int('num_leaves', 20, 100),
        class_weight='balanced',
        random_state=42,
        verbose=-1,
    )
    candidate.fit(X_train_resampled, y_train_resampled)
    val_scores = candidate.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

study_lgb = optuna.create_study(direction='maximize', sampler=sampler)
study_lgb.optimize(objective_lgb, n_trials=N_TRIALS, show_progress_bar=True)

# Train best LGB: tuned values plus the same fixed settings used in the search
best_lgb_params = dict(
    study_lgb.best_trial.params,
    class_weight='balanced',
    random_state=42,
    verbose=-1,
)
lgb_model = lgb.LGBMClassifier(**best_lgb_params)
lgb_model.fit(X_train_resampled, y_train_resampled)
results['LightGBM'], lgb_pred, lgb_proba = evaluate_model(lgb_model, X_test, y_test, 'LightGBM')

In [None]:
# CatBoost: tune on validation AUC, then refit with the best trial
print('üê± Training CatBoost...')

def objective_cat(trial):
    """Fit one CatBoost candidate and return its validation ROC-AUC."""
    candidate = CatBoostClassifier(
        iterations=trial.suggest_int('iterations', 100, 300),
        depth=trial.suggest_int('depth', 4, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        auto_class_weights='Balanced',
        random_state=42,
        verbose=False,
    )
    candidate.fit(X_train_resampled, y_train_resampled, verbose=False)
    val_scores = candidate.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

study_cat = optuna.create_study(direction='maximize', sampler=sampler)
study_cat.optimize(objective_cat, n_trials=N_TRIALS, show_progress_bar=True)

# Train best CatBoost: tuned values plus the same fixed settings used in the search
best_cat_params = dict(
    study_cat.best_trial.params,
    auto_class_weights='Balanced',
    random_state=42,
    verbose=False,
)
cat_model = CatBoostClassifier(**best_cat_params)
cat_model.fit(X_train_resampled, y_train_resampled, verbose=False)
results['CatBoost'], cat_pred, cat_proba = evaluate_model(cat_model, X_test, y_test, 'CatBoost')

## 6. Model Comparison

In [None]:
# Leaderboard: one row per model, metrics rounded to 4 decimals,
# ordered from highest to lowest AUC-ROC
# NOTE(review): `results` holds the scores returned by evaluate_model on the
# test split, so the "best" model is picked using test-set metrics — confirm
# this is intended rather than ranking on validation scores.
comparison_df = (
    pd.DataFrame(results)
    .T
    .round(4)
    .sort_values('auc_roc', ascending=False)
)

print('\nüèÜ MODEL COMPARISON (sorted by AUC-ROC)')
print('=' * 80)
print(comparison_df.to_string())

# The top row of the sorted table is the winner
best_model_name = comparison_df.index[0]
print(f'\nü•á Best Model: {best_model_name}')

In [None]:
# Visualization: 2x2 figure with grouped metric bars, ROC curves,
# precision-recall curves, and the best model's confusion matrix
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Metrics comparison: four bars per model, one per metric
x = np.arange(len(comparison_df.index))
width = 0.2
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6']
for i, metric in enumerate(['auc_roc', 'precision', 'recall', 'f1']):
    axes[0, 0].bar(x + i*width, comparison_df[metric], width, label=metric.upper(), color=colors[i])
axes[0, 0].set_xticks(x + width*1.5)  # centre each tick under its 4-bar group
axes[0, 0].set_xticklabels(comparison_df.index, rotation=15)
axes[0, 0].legend()
axes[0, 0].set_title('Model Comparison', fontweight='bold')
axes[0, 0].set_ylim(0, 1.05)

# ROC Curves
models = {'Random Forest': rf_proba, 'XGBoost': xgb_proba, 'LightGBM': lgb_proba, 'CatBoost': cat_proba}
for name, proba in models.items():
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc = roc_auc_score(y_test, proba)
    axes[0, 1].plot(fpr, tpr, label=f'{name} (AUC={auc:.4f})', linewidth=2)
axes[0, 1].plot([0, 1], [0, 1], 'k--')  # diagonal reference line
axes[0, 1].set_xlabel('False Positive Rate')
axes[0, 1].set_ylabel('True Positive Rate')
axes[0, 1].set_title('ROC Curves', fontweight='bold')
axes[0, 1].legend()

# Precision-Recall Curves
for name, proba in models.items():
    precision, recall, _ = precision_recall_curve(y_test, proba)
    ap = average_precision_score(y_test, proba)
    axes[1, 0].plot(recall, precision, label=f'{name} (AP={ap:.4f})', linewidth=2)
axes[1, 0].set_xlabel('Recall')
axes[1, 0].set_ylabel('Precision')
axes[1, 0].set_title('Precision-Recall Curves', fontweight='bold')
axes[1, 0].legend()

# Confusion Matrix for best model
best_pred = {'Random Forest': rf_pred, 'XGBoost': xgb_pred, 'LightGBM': lgb_pred, 'CatBoost': cat_pred}[best_model_name]
cm = confusion_matrix(y_test, best_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 1],
            xticklabels=['Normal', 'Fraud'], yticklabels=['Normal', 'Fraud'])
axes[1, 1].set_xlabel('Predicted')
axes[1, 1].set_ylabel('Actual')
axes[1, 1].set_title(f'Confusion Matrix - {best_model_name}', fontweight='bold')

plt.tight_layout()
# Make sure the output folder exists; savefig raises if the directory is missing.
os.makedirs('notebooks', exist_ok=True)
plt.savefig('notebooks/model_comparison.png', dpi=150)
plt.show()

In [None]:
# Feature Importance (XGBoost): horizontal bars sorted ascending by
# importance, so the largest value is drawn last
importance = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=True)

plt.figure(figsize=(10, 8))
plt.barh(importance['feature'], importance['importance'], color='#8b5cf6')
plt.xlabel('Importance')
plt.title('Feature Importance (XGBoost)', fontweight='bold')
plt.tight_layout()
# Make sure the output folder exists; savefig raises if the directory is missing.
os.makedirs('notebooks', exist_ok=True)
plt.savefig('notebooks/feature_importance.png', dpi=150)
plt.show()

## 7. Save Models

In [None]:
# Persist everything produced by training under ./models:
# the four fitted models, the winner, the preprocessing objects,
# the feature order, and a JSON metrics report
os.makedirs('models', exist_ok=True)

# Save all models (file stem -> fitted estimator)
models_to_save = dict(
    random_forest=rf_model,
    xgboost=xgb_model,
    lightgbm=lgb_model,
    catboost=cat_model,
)

for stem, fitted in models_to_save.items():
    target_path = f'models/{stem}_model.pkl'
    joblib.dump(fitted, target_path)
    print(f'‚úÖ Saved: {target_path}')

# Save best model: look it up by the display name chosen in the comparison
models_by_display_name = {
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
    'CatBoost': cat_model,
}
best_model = models_by_display_name[best_model_name]
joblib.dump(best_model, 'models/best_model.pkl')
print(f'\nüèÜ Best model saved as: models/best_model.pkl')

# Save scaler and encoders
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(label_encoders, 'models/label_encoders.pkl')
print('‚úÖ Saved: models/scaler.pkl')
print('‚úÖ Saved: models/label_encoders.pkl')

# Save feature names
with open('models/feature_names.json', 'w') as names_file:
    json.dump(feature_names, names_file)
print('‚úÖ Saved: models/feature_names.json')

# Save metrics: convert every score to a plain float so json can serialise it
serializable_results = {}
for model_label, scores in results.items():
    serializable_results[model_label] = {
        metric_name: float(score) for metric_name, score in scores.items()
    }

metrics_dict = {
    'best_model': best_model_name,
    'results': serializable_results,
    'training_date': datetime.now().isoformat(),
    'dataset_size': len(df),
    'fraud_rate': float(fraud_pct)
}

with open('models/model_metrics.json', 'w') as metrics_file:
    json.dump(metrics_dict, metrics_file, indent=2)
print('‚úÖ Saved: models/model_metrics.json')

## 8. Summary

In [None]:
# Final recap: dataset facts, per-model scores, the winner, and saved files
rule = '=' * 70

print('\n' + rule)
print('üéâ TRAINING COMPLETE')
print(rule)

print(f'\nüìä Dataset: {len(df):,} transactions')
print(f'   Fraud rate: {fraud_pct:.2f}%')
print(f'   Features: {len(feature_names)}')

print(f'\nü§ñ Models Trained:')
for model_label, scores in results.items():
    print(f'   - {model_label}: AUC={scores["auc_roc"]:.4f}, F1={scores["f1"]:.4f}')

# Headline metrics for the selected model
winner_scores = results[best_model_name]
print(f'\nüèÜ Best Model: {best_model_name}')
print(f'   AUC-ROC:   {winner_scores["auc_roc"]:.4f}')
print(f'   Precision: {winner_scores["precision"]:.4f}')
print(f'   Recall:    {winner_scores["recall"]:.4f}')

print(f'\nüìÅ Files Saved:')
for saved_path in (
    'models/best_model.pkl',
    'models/model_metrics.json',
    'models/scaler.pkl',
    'models/label_encoders.pkl',
):
    print(f'   - {saved_path}')

print(f'\nüöÄ Next: Run the desktop app!')
print('   python app_desktop.py')
print(rule)