# Cell 1: Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.utils.class_weight import compute_class_weight
import joblib
import json
import time
import warnings
# Notebook-wide display setup.
# Silence every warning category so library warnings do not clutter cell output.
warnings.simplefilter('ignore')
# Apply the seaborn dark-grid matplotlib theme to all figures drawn below.
plt.style.use('seaborn-v0_8-darkgrid')
print("Ready for model improvement")

# Cell 2: Load Data and Baseline

In [None]:
# Load the preprocessed transactions and the saved train/test row positions
# (they are used positionally with .iloc in the feature-engineering cell).
df = pd.read_csv('../data/preprocessed_creditcard.csv')
train_idx, test_idx = (
    np.load(index_path)
    for index_path in ('../data/train_indices.npy', '../data/test_indices.npy')
)

# Baseline model and its stored metrics, used as the comparison reference.
# NOTE(review): joblib.load is pickle-based — only load model files you trust.
baseline_model = joblib.load('../models/baseline_for_comparison.pkl')
with open('../outputs/baseline_metrics.json', 'r') as metrics_file:
    baseline_metrics = json.load(metrics_file)

print(
    "Baseline: "
    f"FP={baseline_metrics['false_positives']:,}, "
    f"FN={baseline_metrics['false_negatives']}, "
    f"AUC={baseline_metrics['roc_auc']:.3f}"
)

# Cell 3: Feature Engineering

In [None]:
# Build the enhanced feature table: every original column plus row-wise
# summaries of the V* columns, non-linear amount/time terms, and IQR-based
# outlier flags for V1-V4. `df` itself is left untouched.
df_enh = df.copy()
v_cols = [c for c in df.columns if c.startswith('V')]
v_values = df[v_cols]  # slice once instead of re-selecting for every statistic

# Row-wise statistics across the V* columns (computed per row, so no
# information flows between train and test rows here).
df_enh['v_mean'] = v_values.mean(axis=1)
df_enh['v_std'] = v_values.std(axis=1)
df_enh['v_max'] = v_values.max(axis=1)
df_enh['v_min'] = v_values.min(axis=1)
df_enh['v_range'] = df_enh['v_max'] - df_enh['v_min']

# Non-linear amount/time terms.
df_enh['amount_sq'] = df_enh['scaled_amount']**2
# log1p(x) already computes log(1 + x); the previous extra "+1" made this
# log(|amount| + 2), which was an accidental double offset.
df_enh['amount_log'] = np.log1p(np.abs(df_enh['scaled_amount']))
df_enh['time_sq'] = df_enh['scaled_time']**2
df_enh['amt_time'] = df_enh['scaled_amount'] * df_enh['scaled_time']

# IQR outlier flags. The fences are estimated from TRAINING rows only and then
# applied to every row, so test-set values never influence a feature definition.
for col in ['V1','V2','V3','V4']:
    train_values = df_enh[col].iloc[train_idx]
    q1, q3 = train_values.quantile(0.25), train_values.quantile(0.75)
    iqr = q3 - q1
    lower_fence, upper_fence = q1 - 1.5*iqr, q3 + 1.5*iqr
    df_enh[f'{col}_out'] = ((df_enh[col] < lower_fence) | (df_enh[col] > upper_fence)).astype(int)

# Original and enhanced design matrices share the same target and the same
# positional train/test split.
X = df.drop('Class', axis=1)
X_enh = df_enh.drop('Class', axis=1)
y = df_enh['Class']
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
X_train_enh, X_test_enh = X_enh.iloc[train_idx], X_enh.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
print(f"Features: {len(df.columns)-1} → {len(df_enh.columns)-1}")

# Cell 4: Train Improved Model

In [None]:
# Derive 'balanced' class weights from the training labels and expose them as
# a {label: weight} mapping for the classifier (reused by the demo cell).
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weight_dict = {label: class_weights[label] for label in (0, 1)}

# Hyperparameters of the improved random forest, gathered in one place.
forest_settings = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    class_weight=class_weight_dict,
    random_state=42,
    n_jobs=-1,
)
improved_model = RandomForestClassifier(**forest_settings)

# Time only the fit on the enhanced training features.
start = time.time()
improved_model.fit(X_train_enh, y_train)
train_time = time.time() - start

# Hard labels and positive-class (column 1) probabilities for the test rows.
y_pred_imp = improved_model.predict(X_test_enh)
y_proba_imp = improved_model.predict_proba(X_test_enh)[:, 1]
print(f"Trained in {train_time:.2f}s")

# Cell 5: Feature Importance

In [None]:
# Rank the enhanced features by the fitted forest's feature_importances_ and
# plot the highest-ranked ones. `feat_imp` (the full ranking) is saved to CSV
# by the final cell.
feat_imp = pd.DataFrame({
    'feature': X_train_enh.columns,
    'importance': improved_model.feature_importances_,
}).sort_values('importance', ascending=False)

top_feats = feat_imp.head(15)
# Derive bar positions from the rows actually selected, so the plot still works
# when the model has fewer than 15 features (a fixed range(15) would mismatch).
bar_positions = range(len(top_feats))

plt.figure(figsize=(10,6))
plt.barh(bar_positions, top_feats['importance'].values)
plt.yticks(bar_positions, top_feats['feature'].values)
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.xlabel('Importance')
plt.title(f'Top {len(top_feats)} Features')
plt.tight_layout()
plt.show()

# Cell 6: Compare Models

In [None]:
# Evaluate the baseline and improved models on the same held-out rows and draw
# a six-panel dashboard: two confusion matrices, error reduction, ROC curves,
# a simple misclassification-cost estimate, and a text summary.
y_pred_base = baseline_model.predict(X_test)
# labels=[0, 1] forces a 2x2 matrix even if a class is absent from y_test or
# from the predictions, so the four-way unpacking below cannot fail.
cm_b = confusion_matrix(y_test, y_pred_base, labels=[0, 1])
cm_i = confusion_matrix(y_test, y_pred_imp, labels=[0, 1])
tn_b, fp_b, fn_b, tp_b = cm_b.ravel()
tn_i, fp_i, fn_i, tp_i = cm_i.ravel()

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Confusion Matrices
sns.heatmap(cm_b, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0])
axes[0, 0].set_title('Baseline')
sns.heatmap(cm_i, annot=True, fmt='d', cmap='Greens', ax=axes[0, 1])
axes[0, 1].set_title('Improved')

# Improvements: percentage reduction in each error type relative to the
# baseline; reported as 0 when the baseline had no such errors.
impr = {'FP': (fp_b-fp_i)/fp_b*100 if fp_b>0 else 0, 'FN': (fn_b-fn_i)/fn_b*100 if fn_b>0 else 0}
axes[0, 2].bar(list(impr.keys()), list(impr.values()),
               color=['green' if v>0 else 'red' for v in impr.values()])
axes[0, 2].set_ylabel('Reduction %')
axes[0, 2].set_title('Error Reduction')

# ROC Comparison (baseline probabilities are computed once and reused)
y_proba_base = baseline_model.predict_proba(X_test)[:,1]
fpr_b, tpr_b, _ = roc_curve(y_test, y_proba_base)
fpr_i, tpr_i, _ = roc_curve(y_test, y_proba_imp)
auc_b, auc_i = auc(fpr_b, tpr_b), auc(fpr_i, tpr_i)
axes[1, 0].plot(fpr_b, tpr_b, 'b-', label=f'Base={auc_b:.3f}')
axes[1, 0].plot(fpr_i, tpr_i, 'g-', label=f'Imp={auc_i:.3f}')
axes[1, 0].set_xlabel('FPR')
axes[1, 0].set_ylabel('TPR')
axes[1, 0].legend()
axes[1, 0].set_title('ROC Curves')

# Cost Analysis: each error type is weighted by a fixed per-error cost.
# NOTE(review): the 10 / 100 weights are carried over unchanged; confirm they
# match the intended business costs.
COST_PER_FP = 10
COST_PER_FN = 100
cost_b = fp_b*COST_PER_FP + fn_b*COST_PER_FN
cost_i = fp_i*COST_PER_FP + fn_i*COST_PER_FN
savings = cost_b - cost_i
axes[1, 1].bar(['Baseline','Improved','Savings'], [cost_b,cost_i,savings], color=['red','green','gold'])
axes[1, 1].set_ylabel('Cost ($)')
axes[1, 1].set_title('Financial Impact')

# Summary: relative changes are undefined when the baseline count is zero, so
# show "n/a" instead of dividing by zero (which printed inf/nan before).
fp_change_txt = f"{(fp_i-fp_b)/fp_b*100:.1f}%" if fp_b > 0 else "n/a"
fn_change_txt = f"{(fn_i-fn_b)/fn_b*100:.1f}%" if fn_b > 0 else "n/a"
summary_text = f"""
MODEL COMPARISON
{'='*30}
FP: {fp_b:,} → {fp_i:,} ({fp_change_txt})
FN: {fn_b} → {fn_i} ({fn_change_txt})
AUC: {auc_b:.3f} → {auc_i:.3f}
Cost: ${cost_b:,} → ${cost_i:,}
SAVINGS: ${savings:,}
"""
ax = axes[1, 2]
ax.axis('off')
ax.text(0.5, 0.5, summary_text, transform=ax.transAxes, fontsize=10,
        ha='center', va='center', family='monospace')

fig.suptitle('Model Comparison Dashboard', fontsize=14)
fig.tight_layout()
plt.show()

# Cell 7: 🎯 LIVE DEMO TUNING 🎯

In [None]:
# Live demo: retrain the improved forest with a different n_estimators and
# compare it with the baseline and with the model trained in the earlier cell.
print("="*60)
print("🎯 LIVE DEMO: Change DEMO_PARAMETER and re-run!")
print("="*60)

# ====== CHANGE THIS VALUE DURING DEMO! ======
DEMO_PARAMETER = 100  # Try 150 or 200!
# ============================================

# Same configuration as the improved model; only the tree count varies.
demo_model = RandomForestClassifier(
    n_estimators=DEMO_PARAMETER,  # ← TUNABLE
    max_depth=15, min_samples_split=20, min_samples_leaf=10,
    max_features='sqrt', class_weight=class_weight_dict,
    random_state=42, n_jobs=-1
)
demo_start = time.time()
demo_model.fit(X_train_enh, y_train)
demo_time = time.time() - demo_start
y_pred_d = demo_model.predict(X_test_enh)
y_proba_d = demo_model.predict_proba(X_test_enh)[:,1]
# labels=[0, 1] keeps the matrix 2x2 even if a class is missing, so the
# four-way unpacking below cannot fail.
cm_d = confusion_matrix(y_test, y_pred_d, labels=[0, 1])
tn_d, fp_d, fn_d, tp_d = cm_d.ravel()
auc_d = roc_auc_score(y_test, y_proba_d)

# Read the reference tree count from the fitted model instead of hard-coding it.
default_n = improved_model.n_estimators

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# Confusion Matrix
sns.heatmap(cm_d, annot=True, fmt='d', cmap='Purples', ax=axes[0])
axes[0].set_title(f'n_estimators={DEMO_PARAMETER}')
# Performance Comparison (AUC is scaled by 100 to share an axis with the counts)
x = np.arange(3)
axes[1].bar(x-0.25, [fp_b, fn_b, auc_b*100], 0.25, label='Baseline', color='red')
axes[1].bar(x, [fp_i, fn_i, auc_i*100], 0.25, label=f'Default({default_n})', color='green')
axes[1].bar(x+0.25, [fp_d, fn_d, auc_d*100], 0.25, label=f'Demo({DEMO_PARAMETER})', color='gold')
axes[1].set_xticks(x)
axes[1].set_xticklabels(['FP', 'FN', 'AUC×100'])
axes[1].set_title('Performance Comparison')
axes[1].legend()
# Complexity vs Performance: plot only AUCs that were actually measured in this
# notebook (the earlier version drew an invented point at n_estimators=50).
# Sorting by tree count keeps the line left-to-right for any DEMO_PARAMETER.
measured_points = sorted([(default_n, auc_i), (DEMO_PARAMETER, auc_d)])
axes[2].plot([n for n, _auc in measured_points], [_auc for n, _auc in measured_points], 'go-')
axes[2].set_xlabel('n_estimators')
axes[2].set_ylabel('AUC')
axes[2].set_title('Complexity vs Performance')
axes[2].scatter([DEMO_PARAMETER], [auc_d], s=200, color='gold', edgecolor='black')
plt.suptitle(f'Live Tuning: n_estimators={DEMO_PARAMETER}', fontsize=14)
plt.tight_layout()
plt.show()

# Relative FP change is undefined when the default model has zero false
# positives; report "n/a" rather than dividing by zero.
fp_change_txt = f"{(fp_d-fp_i)/fp_i*100:+.1f}%" if fp_i > 0 else "n/a"
print(f"""
Performance with n_estimators={DEMO_PARAMETER}:
Time: {demo_time:.2f}s | FP: {fp_d} | FN: {fn_d} | AUC: {auc_d:.3f}
Change vs default: FP {fp_change_txt}, AUC {(auc_d-auc_i)*100:+.2f}pp
""")

# Cell 8: Save Final Model

In [None]:
# Persist the improved model, its test-set metrics and the feature ranking,
# then print the closing summary.
import os
os.makedirs('../models', exist_ok=True)
os.makedirs('../outputs', exist_ok=True)
joblib.dump(improved_model, '../models/improved_random_forest.pkl')

# Computed once so the saved metric and the printed summary always agree, and
# guarded so a baseline with zero false positives cannot divide by zero.
fp_reduction_pct = float((fp_b-fp_i)/fp_b*100) if fp_b>0 else 0.0

# Values are cast to plain Python types so json.dump can serialize them.
metrics_imp = {
    'accuracy': float(accuracy_score(y_test, y_pred_imp)),
    'precision': float(precision_score(y_test, y_pred_imp)),
    'recall': float(recall_score(y_test, y_pred_imp)),
    'f1_score': float(f1_score(y_test, y_pred_imp)),
    'roc_auc': float(auc_i),
    'false_positives': int(fp_i),
    'false_negatives': int(fn_i),
    'total_cost': float(cost_i),
    'savings_vs_baseline': float(savings),
    'fp_reduction_pct': fp_reduction_pct
}
with open('../outputs/improved_metrics.json', 'w') as f:
    json.dump(metrics_imp, f, indent=2)
feat_imp.to_csv('../outputs/feature_importance.csv', index=False)

print(f"""
{'='*60}
DEMO COMPLETE!
FP reduced: {fp_reduction_pct:.1f}%
Savings: ${savings:,}
AUC: {auc_b:.3f} → {auc_i:.3f}

Remember: Cell 7 is the LIVE DEMO!
Change DEMO_PARAMETER to 150 or 200 and re-run!
{'='*60}
""")