In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import time
import pickle
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from scipy.stats import rankdata

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import sys
ROOT = Path.cwd().parent
sys.path.append(str(ROOT / 'functions'))
DATA = ROOT / 'data'

print('Libraries loaded')

Libraries loaded


## 1. Load Data

**Important**: We use `train_merged.csv` and perform an 80/20 temporal split (same as EDA.ipynb).
The `test_merged.csv` is the Kaggle competition test set without labels - not used for evaluation.

In [5]:
# Load raw merged data (train only - test_merged.csv has no labels!)
df_raw = pd.read_csv(DATA / 'train_merged.csv')

# Temporal split (same as EDA.ipynb - 80/20)
df_raw = df_raw.sort_values('TransactionDT').reset_index(drop=True)
split_idx = int(len(df_raw) * 0.8)

train_raw = df_raw.iloc[:split_idx].copy()
test_raw = df_raw.iloc[split_idx:].copy()

# Also load preprocessed data for Scenario 5 (feature importance)
train_preprocessed = pd.read_parquet(DATA / 'train_preprocessed.parquet')
test_preprocessed = pd.read_parquet(DATA / 'test_preprocessed.parquet')

with open(DATA / 'feature_lists.pkl', 'rb') as f:
    feature_lists = pickle.load(f)

print(f'Raw data loaded and split (80/20 temporal):')
print(f'  Train: {train_raw.shape}')
print(f'  Test: {test_raw.shape}')
print(f'Preprocessed train: {train_preprocessed.shape}')

Raw data loaded and split (80/20 temporal):
  Train: (472432, 434)
  Test: (118108, 434)
Preprocessed train: (472432, 195)


In [6]:
# Configuration
N_FOLDS = 3
CV_RANDOM_STATE = 42
BLEND_VAL_SIZE = 0.2

# Target variable (from temporal split)
y_train_full = train_raw['isFraud'].copy()
y_test = test_raw['isFraud'].copy()

# Class imbalance ratio
scale_pos_weight = (y_train_full == 0).sum() / (y_train_full == 1).sum()

print(f'Train samples: {len(y_train_full)}, Test samples: {len(y_test)}')
print(f'Train fraud rate: {y_train_full.mean()*100:.2f}%')
print(f'Test fraud rate: {y_test.mean()*100:.2f}%')
print(f'Class imbalance ratio: {scale_pos_weight:.2f}')

Train samples: 472432, Test samples: 118108
Train fraud rate: 3.51%
Test fraud rate: 3.44%
Class imbalance ratio: 27.46


## 2. Define Column Groups

In [7]:
# Identify column groups from raw data
all_cols = [c for c in train_raw.columns if c not in ['TransactionID', 'isFraud']]

# C columns (count features - historical)
c_cols = [c for c in all_cols if c.startswith('C') and c[1:].isdigit()]

# D columns (time delta features - historical)
d_cols = [c for c in all_cols if c.startswith('D') and c[1:].isdigit()]

# V columns (Vesta features - historical comparisons)
v_cols = [c for c in all_cols if c.startswith('V') and c[1:].isdigit()]

# M columns (match features)
m_cols = [c for c in all_cols if c.startswith('M') and len(c) <= 3]

# ID columns
id_cols = [c for c in all_cols if c.startswith('id_')]

# Transaction columns (non-historical)
transaction_cols = ['TransactionDT', 'TransactionAmt', 'ProductCD', 
                    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
                    'addr1', 'addr2', 'dist1', 'dist2', 
                    'P_emaildomain', 'R_emaildomain']

# Device columns
device_cols = ['DeviceType', 'DeviceInfo']

# Categorical columns in raw data
raw_categorical = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
                   'DeviceType', 'DeviceInfo'] + m_cols + [c for c in id_cols if train_raw[c].dtype == 'object']

print(f'C columns (count): {len(c_cols)}')
print(f'D columns (time delta): {len(d_cols)}')
print(f'V columns (Vesta): {len(v_cols)}')
print(f'M columns (match): {len(m_cols)}')
print(f'ID columns: {len(id_cols)}')
print(f'Transaction columns: {len(transaction_cols)}')

C columns (count): 14
D columns (time delta): 15
V columns (Vesta): 339
M columns (match): 9
ID columns: 38
Transaction columns: 15


## 3. Model Factory Functions

In [8]:
def create_random_forest(use_imputer=False):
    rf = RandomForestClassifier(
        n_estimators=200, max_depth=15, min_samples_split=10,
        min_samples_leaf=5, class_weight='balanced', random_state=42, n_jobs=-1
    )
    if use_imputer:
        return Pipeline([('imputer', SimpleImputer(strategy='median')), ('classifier', rf)])
    return rf

def create_xgboost(spw):
    return XGBClassifier(
        n_estimators=300, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, scale_pos_weight=spw,
        eval_metric='auc', random_state=42, n_jobs=-1
    )

def create_lightgbm():
    return LGBMClassifier(
        n_estimators=300, max_depth=8, learning_rate=0.05,
        num_leaves=31, subsample=0.8, colsample_bytree=0.8,
        is_unbalance=True, random_state=42, n_jobs=-1, verbose=-1
    )

def create_catboost(cat_features=None):
    return CatBoostClassifier(
        iterations=300, depth=6, learning_rate=0.05,
        auto_class_weights='Balanced', eval_metric='AUC',
        random_seed=42, verbose=0, cat_features=cat_features
    )

def get_base_models(spw, use_imputer=False, cat_features=None):
    return {
        'RandomForest': create_random_forest(use_imputer),
        'XGBoost': create_xgboost(spw),
        'LightGBM': create_lightgbm(),
        'CatBoost': create_catboost(cat_features)
    }

print('Model factories defined')

Model factories defined


## 4. Ensemble Methods

Four ensemble approaches:
1. **Stacking_Weighted**: Simple weighted average based on CV-AUC
2. **Stacking_MLP**: Neural network meta-learner trained on OOF predictions
3. **Stacking_Blend**: Holdout-based blending (faster, no OOF needed)
4. **Rank_Average**: Rank-based averaging (calibration-free, robust)

In [9]:
def ensemble_weighted_average(test_preds, cv_scores):
    """Weighted average based on CV-AUC scores."""
    model_names = list(test_preds.keys())
    total = sum(cv_scores[m] for m in model_names)
    weights = np.array([cv_scores[m] / total for m in model_names])
    test_matrix = np.column_stack([test_preds[m] for m in model_names])
    return np.dot(test_matrix, weights)


def ensemble_mlp(oof_preds, y_train, test_preds, model_names):
    """MLP neural network meta-learner trained on OOF predictions."""
    oof_matrix = np.column_stack([oof_preds[m] for m in model_names])
    test_matrix = np.column_stack([test_preds[m] for m in model_names])
    
    scaler = StandardScaler()
    oof_scaled = scaler.fit_transform(oof_matrix)
    test_scaled = scaler.transform(test_matrix)
    
    mlp = MLPClassifier(
        hidden_layer_sizes=(16, 8),
        activation='relu',
        solver='adam',
        alpha=0.01,
        max_iter=500,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
    )
    mlp.fit(oof_scaled, y_train)
    return mlp.predict_proba(test_scaled)[:, 1]


def ensemble_blend(X_train, y_train, X_test, base_models, blend_size=0.2, random_state=42):
    """
    Blending: Split train into train/blend, train base models on train,
    predict on blend set to train meta-learner, then predict on test.
    Faster than OOF stacking.
    """
    X_tr, X_blend, y_tr, y_blend = train_test_split(
        X_train, y_train, test_size=blend_size, stratify=y_train, random_state=random_state
    )
    
    blend_preds = {}
    test_preds = {}
    
    for name, model in base_models.items():
        m = clone(model)
        m.fit(X_tr, y_tr)
        blend_preds[name] = m.predict_proba(X_blend)[:, 1]
        test_preds[name] = m.predict_proba(X_test)[:, 1]
    
    model_names = list(base_models.keys())
    blend_matrix = np.column_stack([blend_preds[m] for m in model_names])
    test_matrix = np.column_stack([test_preds[m] for m in model_names])
    
    meta = LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000, random_state=42)
    meta.fit(blend_matrix, y_blend)
    
    return meta.predict_proba(test_matrix)[:, 1]


def ensemble_rank_average(test_preds):
    """Rank-based averaging. Robust to different probability calibrations."""
    model_names = list(test_preds.keys())
    n_samples = len(test_preds[model_names[0]])
    
    rank_sum = np.zeros(n_samples)
    for name in model_names:
        ranks = rankdata(test_preds[name])
        rank_sum += ranks
    
    # Normalize to [0, 1]
    return rank_sum / (len(model_names) * n_samples)


print('Ensemble methods defined')

Ensemble methods defined


## 5. Data Preprocessing Functions

In [10]:
def preprocess_for_models(X_train, X_test, categorical_cols=None):
    """
    Minimal preprocessing for raw data:
    - Fill NaN with -999 for numerical
    - Label encode categoricals
    - Handle infinity
    """
    X_train = X_train.copy()
    X_test = X_test.copy()
    
    # Replace infinity
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_test = X_test.replace([np.inf, -np.inf], np.nan)
    
    # Identify categorical columns if not provided
    if categorical_cols is None:
        categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    
    # Label encode categoricals
    cat_indices = []
    for i, col in enumerate(X_train.columns):
        if col in categorical_cols:
            cat_indices.append(i)
            le = LabelEncoder()
            combined = pd.concat([X_train[col].astype(str), X_test[col].astype(str)])
            le.fit(combined)
            X_train[col] = le.transform(X_train[col].astype(str))
            X_test[col] = le.transform(X_test[col].astype(str))
    
    # Fill NaN with sentinel
    X_train = X_train.fillna(-999)
    X_test = X_test.fillna(-999)
    
    return X_train, X_test, cat_indices


def evaluate_on_test(y_true, y_prob):
    """Calculate test metrics with Youden-optimal threshold."""
    auc = roc_auc_score(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)
    
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    j_scores = tpr - fpr
    opt_idx = np.argmax(j_scores)
    opt_thresh = thresholds[opt_idx]
    
    y_pred = (y_prob >= opt_thresh).astype(int)
    
    return {
        'Test_AUC': auc,
        'Test_AP': ap,
        'Optimal_Threshold': opt_thresh,
        'Precision_opt': precision_score(y_true, y_pred, zero_division=0),
        'Recall_opt': recall_score(y_true, y_pred, zero_division=0),
        'F1_opt': f1_score(y_true, y_pred, zero_division=0)
    }


print('Preprocessing functions defined')

Preprocessing functions defined


## 6. Scenario Runner

In [11]:
def run_scenario(scenario_id, name, X_train, X_test, y_train, y_test, 
                 scale_pos_weight, cat_indices=None, n_folds=3):
    """
    Run complete scenario with base models and ensemble methods.
    """
    print(f'\n{"="*70}')
    print(f'SCENARIO {scenario_id}: {name}')
    print(f'Features: {X_train.shape[1]}, Samples: {len(X_train)}')
    print('='*70)
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=CV_RANDOM_STATE)
    base_models = get_base_models(scale_pos_weight, use_imputer=False, cat_features=cat_indices)
    
    results = []
    oof_predictions = {}
    test_predictions = {}
    cv_scores = {}
    
    # Train base models with CV
    for model_name, model in base_models.items():
        start = time.time()
        oof_probs = np.zeros(len(X_train))
        fold_scores = []
        
        for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
            X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
            
            m = clone(model)
            m.fit(X_tr, y_tr)
            
            probs = m.predict_proba(X_val)[:, 1]
            oof_probs[val_idx] = probs
            fold_scores.append(roc_auc_score(y_val, probs))
        
        cv_mean, cv_std = np.mean(fold_scores), np.std(fold_scores)
        oof_predictions[model_name] = oof_probs
        cv_scores[model_name] = cv_mean
        
        # Final model on full train
        final_model = clone(model)
        final_model.fit(X_train, y_train)
        test_probs = final_model.predict_proba(X_test)[:, 1]
        test_predictions[model_name] = test_probs
        
        metrics = evaluate_on_test(y_test, test_probs)
        elapsed = time.time() - start
        
        results.append({
            'Scenario': scenario_id,
            'Approach': model_name,
            'CV_AUC_mean': cv_mean,
            'CV_AUC_std': cv_std,
            **metrics
        })
        print(f'  {model_name}: CV={cv_mean:.4f}+/-{cv_std:.4f}, Test={metrics["Test_AUC"]:.4f} ({elapsed:.1f}s)')
    
    # Ensemble 1: Weighted Average
    weighted_probs = ensemble_weighted_average(test_predictions, cv_scores)
    metrics = evaluate_on_test(y_test, weighted_probs)
    results.append({'Scenario': scenario_id, 'Approach': 'Stacking_Weighted',
                    'CV_AUC_mean': np.nan, 'CV_AUC_std': np.nan, **metrics})
    print(f'  Stacking_Weighted: Test={metrics["Test_AUC"]:.4f}')
    
    # Ensemble 2: MLP Meta-learner
    model_names = list(base_models.keys())
    mlp_probs = ensemble_mlp(oof_predictions, y_train, test_predictions, model_names)
    metrics = evaluate_on_test(y_test, mlp_probs)
    results.append({'Scenario': scenario_id, 'Approach': 'Stacking_MLP',
                    'CV_AUC_mean': np.nan, 'CV_AUC_std': np.nan, **metrics})
    print(f'  Stacking_MLP: Test={metrics["Test_AUC"]:.4f}')
    
    # Ensemble 3: Blending
    blend_probs = ensemble_blend(X_train, y_train, X_test, base_models, 
                                  blend_size=BLEND_VAL_SIZE, random_state=CV_RANDOM_STATE)
    metrics = evaluate_on_test(y_test, blend_probs)
    results.append({'Scenario': scenario_id, 'Approach': 'Stacking_Blend',
                    'CV_AUC_mean': np.nan, 'CV_AUC_std': np.nan, **metrics})
    print(f'  Stacking_Blend: Test={metrics["Test_AUC"]:.4f}')
    
    # Ensemble 4: Rank Average
    rank_probs = ensemble_rank_average(test_predictions)
    metrics = evaluate_on_test(y_test, rank_probs)
    results.append({'Scenario': scenario_id, 'Approach': 'Rank_Average',
                    'CV_AUC_mean': np.nan, 'CV_AUC_std': np.nan, **metrics})
    print(f'  Rank_Average: Test={metrics["Test_AUC"]:.4f}')
    
    return pd.DataFrame(results)


print('Scenario runner defined')

Scenario runner defined


## 7. Define and Execute Scenarios

In [12]:
# Store scenario metadata
SCENARIOS = {}
all_results = []

### Scenario 1: Raw Data Baseline

Use train_merged and test_merged with minimal preprocessing. This establishes the "before" state to demonstrate the value of feature engineering.

In [13]:
# Scenario 1: Raw Data Baseline
feature_cols = [c for c in train_raw.columns if c not in ['TransactionID', 'isFraud']]

X_train_s1 = train_raw[feature_cols].copy()
X_test_s1 = test_raw[feature_cols].copy()
y_train_s1 = y_train_full.copy()

X_train_s1, X_test_s1, cat_idx_s1 = preprocess_for_models(X_train_s1, X_test_s1)

SCENARIOS[1] = {
    'name': 'Raw Data Baseline',
    'description': 'No feature engineering, minimal preprocessing',
    'n_features': X_train_s1.shape[1]
}

results_s1 = run_scenario(1, SCENARIOS[1]['name'], X_train_s1, X_test_s1, 
                          y_train_s1, y_test, scale_pos_weight, cat_idx_s1)
all_results.append(results_s1)


SCENARIO 1: Raw Data Baseline
Features: 432, Samples: 472432
  RandomForest: CV=0.9116+/-0.0016, Test=0.8723 (266.1s)
  XGBoost: CV=0.9380+/-0.0009, Test=0.9078 (760.7s)
  LightGBM: CV=0.9371+/-0.0007, Test=0.9085 (59.3s)
  CatBoost: CV=0.9170+/-0.0007, Test=0.8943 (824.7s)
  Stacking_Weighted: Test=0.9051
  Stacking_MLP: Test=0.9069
  Stacking_Blend: Test=0.8964
  Rank_Average: Test=0.9021


### Scenario 2: High Cardinality Categorical Focus

Focus on raw categorical columns without frequency encoding. Test which model handles high-cardinality best.

In [14]:
# Scenario 2: High Cardinality Categorical Focus
# Include card columns, email domains, device info, M columns
cat_focus_cols = ['TransactionAmt', 'ProductCD', 
                  'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
                  'addr1', 'addr2', 
                  'P_emaildomain', 'R_emaildomain',
                  'DeviceType', 'DeviceInfo'] + m_cols

cat_focus_cols = [c for c in cat_focus_cols if c in train_raw.columns]

X_train_s2 = train_raw[cat_focus_cols].copy()
X_test_s2 = test_raw[cat_focus_cols].copy()
y_train_s2 = y_train_full.copy()

X_train_s2, X_test_s2, cat_idx_s2 = preprocess_for_models(X_train_s2, X_test_s2)

SCENARIOS[2] = {
    'name': 'High Cardinality Categorical',
    'description': 'Raw categoricals with minimal encoding',
    'n_features': X_train_s2.shape[1]
}

results_s2 = run_scenario(2, SCENARIOS[2]['name'], X_train_s2, X_test_s2,
                          y_train_s2, y_test, scale_pos_weight, cat_idx_s2)
all_results.append(results_s2)


SCENARIO 2: High Cardinality Categorical
Features: 23, Samples: 472432
  RandomForest: CV=0.9000+/-0.0012, Test=0.8413 (69.0s)
  XGBoost: CV=0.8939+/-0.0015, Test=0.8413 (101.9s)
  LightGBM: CV=0.8888+/-0.0006, Test=0.8441 (11.4s)
  CatBoost: CV=0.8687+/-0.0012, Test=0.8211 (465.0s)
  Stacking_Weighted: Test=0.8449
  Stacking_MLP: Test=0.8467
  Stacking_Blend: Test=0.8428
  Rank_Average: Test=0.8442


### Scenario 3: Numerical Features Only

Remove all categorical features. Keep only numerical signals to test pure numerical optimization.

In [15]:
# Scenario 3: Numerical Only
numerical_cols = ['TransactionDT', 'TransactionAmt', 'dist1', 'dist2']
numerical_cols += c_cols + d_cols + v_cols
numerical_cols += [c for c in id_cols if train_raw[c].dtype in ['int64', 'float64']]
numerical_cols = [c for c in numerical_cols if c in train_raw.columns]

X_train_s3 = train_raw[numerical_cols].copy()
X_test_s3 = test_raw[numerical_cols].copy()
y_train_s3 = y_train_full.copy()

# No categoricals here
X_train_s3 = X_train_s3.replace([np.inf, -np.inf], np.nan).fillna(-999)
X_test_s3 = X_test_s3.replace([np.inf, -np.inf], np.nan).fillna(-999)

SCENARIOS[3] = {
    'name': 'Numerical Only',
    'description': 'No categorical features, pure numerical',
    'n_features': X_train_s3.shape[1]
}

results_s3 = run_scenario(3, SCENARIOS[3]['name'], X_train_s3, X_test_s3,
                          y_train_s3, y_test, scale_pos_weight, cat_indices=None)
all_results.append(results_s3)


SCENARIO 3: Numerical Only
Features: 395, Samples: 472432
  RandomForest: CV=0.9008+/-0.0012, Test=0.8630 (249.5s)
  XGBoost: CV=0.9208+/-0.0013, Test=0.8865 (707.0s)
  LightGBM: CV=0.9200+/-0.0014, Test=0.8849 (61.0s)
  CatBoost: CV=0.9014+/-0.0016, Test=0.8818 (144.2s)
  Stacking_Weighted: Test=0.8865
  Stacking_MLP: Test=0.8745
  Stacking_Blend: Test=0.8672
  Rank_Average: Test=0.8838


### Scenario 4: The Amnesiac System

**Story**: What if the system can't access historical data? During high-load events, we need to make decisions using only current transaction data.

Remove all "memory" columns:
- C columns (count history)
- D columns (time deltas)
- V columns (Vesta historical comparisons)

Keep only: Transaction basics, Card info, Address, Email, Device, Identity

In [16]:
# Scenario 4: Amnesiac System - No historical data (C, D, V columns)
history_cols = set(c_cols + d_cols + v_cols)
amnesiac_cols = [c for c in train_raw.columns 
                 if c not in ['TransactionID', 'isFraud'] and c not in history_cols]

X_train_s4 = train_raw[amnesiac_cols].copy()
X_test_s4 = test_raw[amnesiac_cols].copy()
y_train_s4 = y_train_full.copy()

X_train_s4, X_test_s4, cat_idx_s4 = preprocess_for_models(X_train_s4, X_test_s4)

SCENARIOS[4] = {
    'name': 'Amnesiac System',
    'description': 'No C/D/V columns - real-time only data',
    'n_features': X_train_s4.shape[1]
}

print(f'Historical columns removed: C={len(c_cols)}, D={len(d_cols)}, V={len(v_cols)}')
print(f'Remaining features: {X_train_s4.shape[1]}')

results_s4 = run_scenario(4, SCENARIOS[4]['name'], X_train_s4, X_test_s4,
                          y_train_s4, y_test, scale_pos_weight, cat_idx_s4)
all_results.append(results_s4)

Historical columns removed: C=14, D=15, V=339
Remaining features: 64

SCENARIO 4: Amnesiac System
Features: 64, Samples: 472432
  RandomForest: CV=0.9036+/-0.0016, Test=0.8356 (100.7s)
  XGBoost: CV=0.9058+/-0.0005, Test=0.8460 (187.1s)
  LightGBM: CV=0.9026+/-0.0008, Test=0.8519 (15.4s)
  CatBoost: CV=0.8788+/-0.0020, Test=0.8291 (739.2s)
  Stacking_Weighted: Test=0.8494
  Stacking_MLP: Test=0.8529
  Stacking_Blend: Test=0.8343
  Rank_Average: Test=0.8482


### Scenario 5: Top 40 Features

Select top 40 features based on importance from preprocessed data. Test which model generalizes best with limited but high-quality features.

In [17]:
# Scenario 5: Top 40 Features (from preprocessed data)
# Train a quick LightGBM to get feature importance
filtered_features = feature_lists['filtered_features']

X_temp = train_preprocessed[filtered_features].fillna(-999)
y_temp = train_preprocessed['isFraud']

lgb_temp = LGBMClassifier(n_estimators=100, max_depth=6, random_state=42, verbose=-1)
lgb_temp.fit(X_temp, y_temp)

importance_df = pd.DataFrame({
    'feature': filtered_features,
    'importance': lgb_temp.feature_importances_
}).sort_values('importance', ascending=False)

top_40_features = importance_df.head(40)['feature'].tolist()

print(f'Top 40 features selected')
print(f'Top 10: {top_40_features[:10]}')

Top 40 features selected
Top 10: ['card2', 'D1_normalized', 'card1', 'C13_C1_ratio', 'addr1', 'D2_normalized', 'TransactionAmt', 'C2_C1_ratio', 'card5', 'C1']


In [18]:
# Run Scenario 5
X_train_s5 = train_preprocessed[top_40_features].copy()
X_test_s5 = test_preprocessed[top_40_features].copy()
y_train_s5 = train_preprocessed['isFraud'].copy()
y_test_s5 = test_preprocessed['isFraud'].copy()

X_train_s5 = X_train_s5.replace([np.inf, -np.inf], np.nan).fillna(-999)
X_test_s5 = X_test_s5.replace([np.inf, -np.inf], np.nan).fillna(-999)

SCENARIOS[5] = {
    'name': 'Top 40 Features',
    'description': 'Top features by LightGBM importance',
    'n_features': X_train_s5.shape[1]
}

results_s5 = run_scenario(5, SCENARIOS[5]['name'], X_train_s5, X_test_s5,
                          y_train_s5, y_test_s5, scale_pos_weight, cat_indices=None)
all_results.append(results_s5)


SCENARIO 5: Top 40 Features
Features: 40, Samples: 472432
  RandomForest: CV=0.9458+/-0.0009, Test=0.8961 (137.7s)
  XGBoost: CV=0.9411+/-0.0004, Test=0.8951 (214.9s)
  LightGBM: CV=0.9375+/-0.0012, Test=0.9008 (16.6s)
  CatBoost: CV=0.9179+/-0.0020, Test=0.8885 (68.3s)
  Stacking_Weighted: Test=0.9032
  Stacking_MLP: Test=0.8952
  Stacking_Blend: Test=0.8756
  Rank_Average: Test=0.9028


## 8. Results Output

In [19]:
# Combine all results
master_results = pd.concat(all_results, ignore_index=True)

# Round for display
numeric_cols = ['CV_AUC_mean', 'CV_AUC_std', 'Test_AUC', 'Test_AP',
                'Optimal_Threshold', 'Precision_opt', 'Recall_opt', 'F1_opt']
results_display = master_results.copy()
for col in numeric_cols:
    results_display[col] = results_display[col].round(4)

print('\n' + '='*120)
print('MASTER RESULTS TABLE')
print('='*120)
print(results_display.to_string(index=False))


MASTER RESULTS TABLE
 Scenario          Approach  CV_AUC_mean  CV_AUC_std  Test_AUC  Test_AP  Optimal_Threshold  Precision_opt  Recall_opt  F1_opt
        1      RandomForest       0.9116      0.0016    0.8723   0.4567             0.3304         0.1251      0.7894  0.2159
        1           XGBoost       0.9380      0.0009    0.9078   0.5140             0.3899         0.1602      0.8159  0.2678
        1          LightGBM       0.9371      0.0007    0.9085   0.5162             0.3801         0.1502      0.8255  0.2541
        1          CatBoost       0.9170      0.0007    0.8943   0.4693             0.4443         0.1494      0.7972  0.2516
        1 Stacking_Weighted          NaN         NaN    0.9051   0.5122             0.3723         0.1466      0.8312  0.2492
        1      Stacking_MLP          NaN         NaN    0.9069   0.5242             0.0257         0.1616      0.8041  0.2690
        1    Stacking_Blend          NaN         NaN    0.8964   0.4921             0.3193      

In [20]:
# Per-scenario tables
scenario_results_dict = {}

print('\n' + '='*120)
print('RESULTS BY SCENARIO')
print('='*120)

for sid in sorted(SCENARIOS.keys()):
    scenario_df = results_display[results_display['Scenario'] == sid].copy()
    scenario_df = scenario_df.drop(columns=['Scenario'])
    scenario_df = scenario_df.sort_values('Test_AUC', ascending=False).reset_index(drop=True)
    scenario_results_dict[sid] = scenario_df
    
    print(f'\n{"="*70}')
    print(f'Scenario {sid} Results: {SCENARIOS[sid]["name"]}')
    print(f'({SCENARIOS[sid]["description"]}, {SCENARIOS[sid]["n_features"]} features)')
    print('='*70)
    print(scenario_df.to_string(index=False))
    
    best = scenario_df.iloc[0]
    print(f'\nBest: {best["Approach"]} (Test AUC: {best["Test_AUC"]:.4f})')


RESULTS BY SCENARIO

Scenario 1 Results: Raw Data Baseline
(No feature engineering, minimal preprocessing, 432 features)
         Approach  CV_AUC_mean  CV_AUC_std  Test_AUC  Test_AP  Optimal_Threshold  Precision_opt  Recall_opt  F1_opt
         LightGBM       0.9371      0.0007    0.9085   0.5162             0.3801         0.1502      0.8255  0.2541
          XGBoost       0.9380      0.0009    0.9078   0.5140             0.3899         0.1602      0.8159  0.2678
     Stacking_MLP          NaN         NaN    0.9069   0.5242             0.0257         0.1616      0.8041  0.2690
Stacking_Weighted          NaN         NaN    0.9051   0.5122             0.3723         0.1466      0.8312  0.2492
     Rank_Average          NaN         NaN    0.9021   0.5034             0.7924         0.1464      0.8184  0.2484
   Stacking_Blend          NaN         NaN    0.8964   0.4921             0.3193         0.1557      0.8115  0.2612
         CatBoost       0.9170      0.0007    0.8943   0.4693     

## 9. Export Results

In [21]:
# Create output directory
results_dir = ROOT / 'results' / 'Scenario_Tests_2'
results_dir.mkdir(parents=True, exist_ok=True)

# Save CSV
csv_path = results_dir / 'results_master.csv'
results_display.to_csv(csv_path, index=False)
print(f'Saved: {csv_path}')

# Save Excel
excel_path = results_dir / 'results_by_scenario.xlsx'
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    results_display.to_excel(writer, sheet_name='Master', index=False)
    for sid, sdf in scenario_results_dict.items():
        sdf.to_excel(writer, sheet_name=f'Scenario_{sid}', index=False)

print(f'Saved: {excel_path}')

Saved: c:\Users\Abdulkadir\Desktop\Uygulama çalışmaları\Fraud_Detection\Fraud_Detection\results\Scenario_Tests_2\results_master.csv
Saved: c:\Users\Abdulkadir\Desktop\Uygulama çalışmaları\Fraud_Detection\Fraud_Detection\results\Scenario_Tests_2\results_by_scenario.xlsx


In [22]:
# Summary table
print('\n' + '='*100)
print('SCENARIO SUMMARY')
print('='*100)

summary_rows = []
for sid in sorted(SCENARIOS.keys()):
    sdf = master_results[master_results['Scenario'] == sid]
    best = sdf.loc[sdf['Test_AUC'].idxmax()]
    summary_rows.append({
        'Scenario': sid,
        'Name': SCENARIOS[sid]['name'],
        'Features': SCENARIOS[sid]['n_features'],
        'Best_Model': best['Approach'],
        'Test_AUC': round(best['Test_AUC'], 4),
        'Test_AP': round(best['Test_AP'], 4)
    })

summary_df = pd.DataFrame(summary_rows)
print(summary_df.to_string(index=False))

# Save summary
summary_df.to_csv(results_dir / 'scenario_summary.csv', index=False)
print(f'\nResults saved to: {results_dir}')


SCENARIO SUMMARY
 Scenario                         Name  Features        Best_Model  Test_AUC  Test_AP
        1            Raw Data Baseline       432          LightGBM    0.9085   0.5162
        2 High Cardinality Categorical        23      Stacking_MLP    0.8467   0.2353
        3               Numerical Only       395 Stacking_Weighted    0.8865   0.4827
        4              Amnesiac System        64      Stacking_MLP    0.8529   0.3049
        5              Top 40 Features        40 Stacking_Weighted    0.9032   0.5036

Results saved to: c:\Users\Abdulkadir\Desktop\Uygulama çalışmaları\Fraud_Detection\Fraud_Detection\results\Scenario_Tests_2


In [23]:
# Save scenario details
details = '''Scenario Tests 2 - Experiment Documentation
================================================

Purpose: Creative scenarios to highlight different model strengths

ENSEMBLE METHODS
----------------
1. Stacking_Weighted: CV-AUC weighted average of base models
2. Stacking_MLP: Neural network meta-learner (16,8 hidden layers)
3. Stacking_Blend: Holdout-based blending with LogisticRegression
4. Rank_Average: Calibration-free rank averaging

SCENARIOS
---------
Scenario 1: Raw Data Baseline
- No feature engineering, minimal preprocessing
- Purpose: Establish "before" state

Scenario 2: High Cardinality Categorical
- Raw categoricals with minimal encoding
- Purpose: Test native categorical handling

Scenario 3: Numerical Only
- No categorical features
- Purpose: Test pure numerical optimization

Scenario 4: Amnesiac System
- No C/D/V columns (historical data)
- Purpose: Prove value of data engineering
- Story: Real-time fraud detection without database access

Scenario 5: Top 40 Features
- Top features by LightGBM importance
- Purpose: Test generalization with limited features

EVALUATION
----------
- CV: StratifiedKFold, n_splits=3, shuffle=True, random_state=42
- Metrics: CV AUC, Test AUC, Test AP, Youden threshold, Precision/Recall/F1
'''

with open(results_dir / 'scenario_details.txt', 'w') as f:
    f.write(details)

print('Scenario details saved')

Scenario details saved
