# Cell 1: Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import joblib
import json
import time
import os
import warnings
# Silence every warning for the rest of the session. NOTE: this is a blanket
# filter, so warnings raised by the libraries used below (for example during
# model fitting or metric computation) are hidden as well.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*".
# Use the new name when it is registered and fall back to the pre-3.6 name
# otherwise, so the notebook does not fail with OSError on older installs.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
print("Ready for baseline model")

# Cell 2: Load Data

In [None]:
# Read the preprocessed dataset and the saved row positions of each split.
df = pd.read_csv('../data/preprocessed_creditcard.csv')
train_indices = np.load('../data/train_indices.npy')
test_indices = np.load('../data/test_indices.npy')

# 'Class' is the target column; every other column is used as a feature.
y = df['Class']
X = df.drop(columns='Class')

# The saved indices are applied positionally (.iloc) to features and target.
X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]
print(f"Train: {len(X_train):,} | Test: {len(X_test):,} | Features: {X_train.shape[1]}")

# Cell 3: Train Baseline

In [None]:
# Baseline classifier: logistic regression using the liblinear solver, with
# class_weight='balanced' so classes are reweighted by their frequency in
# y_train.
baseline_model = LogisticRegression(
    random_state=42,
    max_iter=100,
    solver='liblinear',
    class_weight='balanced',
)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() follows the wall clock and can jump
# if the system clock is adjusted while the model is fitting.
start = time.perf_counter()
baseline_model.fit(X_train, y_train)
training_time = time.perf_counter() - start

# Hard class predictions, plus column 1 of predict_proba (the probability of
# the second entry in baseline_model.classes_) for ROC / threshold analysis.
y_pred = baseline_model.predict(X_test)
y_pred_proba = baseline_model.predict_proba(X_test)[:, 1]
print(f"Trained in {training_time:.2f}s")

# Cell 4: Calculate Metrics

In [None]:
# Confusion matrix: rows are the actual class, columns the predicted class,
# so the two rows unpack to (tn, fp) and (fn, tp).
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm

# Scores computed from the hard (thresholded) predictions.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC curve and its area, computed from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Simple cost model: a fixed cost per false positive and per false negative.
cost_per_fp = 10
cost_per_fn = 100
total_cost = fp * cost_per_fp + fn * cost_per_fn
print(f"FP: {fp:,} | FN: {fn} | Cost: ${total_cost:,} | AUC: {roc_auc:.3f}")

# Cell 5: Business Dashboard

In [None]:
# One figure with a 2x3 grid of panels. Each panel is drawn through its own
# Axes object instead of pyplot's implicit "current axes".
fig = plt.figure(figsize=(20, 10))

# Panel 1: confusion matrix as an annotated heatmap.
ax1 = fig.add_subplot(2, 3, 1)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Legit', 'Fraud'],
    yticklabels=['Legit', 'Fraud'],
    ax=ax1,
)
ax1.set_title('Confusion Matrix')

# Panel 2: raw counts of the four outcome types, titled with the total cost.
ax2 = fig.add_subplot(2, 3, 2)
ax2.bar(
    ['FP\n(Angry)', 'FN\n(Lost)', 'TP\n(Caught)', 'TN\n(Happy)'],
    [fp, fn, tp, tn],
    color=['orange', 'red', 'green', 'lightgreen'],
)
ax2.set_title(f'Business Impact (Cost: ${total_cost:,})')
ax2.set_ylabel('Count')

# Panel 3: ROC curve with the chance diagonal, plus a marker at the ROC point
# whose threshold is closest to 0.5.
ax3 = fig.add_subplot(2, 3, 3)
ax3.plot(fpr, tpr, 'b-', label=f'AUC={roc_auc:.3f}', linewidth=2)
ax3.plot([0, 1], [0, 1], 'r--', alpha=0.5)
thresh_idx = np.abs(thresholds - 0.5).argmin()
ax3.plot(fpr[thresh_idx], tpr[thresh_idx], 'go', markersize=10)
ax3.set_xlabel('FPR')
ax3.set_ylabel('TPR')
ax3.set_title('ROC Curve')
ax3.legend()

# Panel 4: false positive / false negative counts at several decision
# thresholds; the dashed vertical line marks 0.5.
ax4 = fig.add_subplot(2, 3, 4)
test_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
fps_list = []
fns_list = []
for t in test_thresholds:
    # Re-threshold the probabilities and read the off-diagonal error cells.
    pred_t = (y_pred_proba >= t).astype(int)
    cm_t = confusion_matrix(y_test, pred_t)
    fps_list.append(cm_t[0, 1])
    fns_list.append(cm_t[1, 0])
ax4.plot(test_thresholds, fps_list, 'o-', label='FP', color='orange')
ax4.plot(test_thresholds, fns_list, 's-', label='FN', color='red')
ax4.axvline(0.5, color='green', linestyle='--')
ax4.set_xlabel('Threshold')
ax4.set_ylabel('Errors')
ax4.set_title('Threshold Impact')
ax4.legend()

# Panel 5 (spans grid cells 5 and 6): text-only summary box, axes hidden.
ax5 = fig.add_subplot(2, 3, (5, 6))
ax5.axis('off')
summary = f"""
BASELINE MODEL PERFORMANCE
{'='*40}
Accuracy: {accuracy:.2%} | Precision: {precision:.2%}
Recall: {recall:.2%} | F1: {f1:.2%}
ROC-AUC: {roc_auc:.3f}

BUSINESS IMPACT
False Positives: {fp:,} (${fp*cost_per_fp:,})
False Negatives: {fn} (${fn*cost_per_fn:,})
Total Cost: ${total_cost:,}
Train Time: {training_time:.2f}s
"""
ax5.text(
    0.5,
    0.5,
    summary,
    transform=ax5.transAxes,
    fontsize=11,
    ha='center',
    va='center',
    bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8),
    family='monospace',
)

# Figure-level title and layout, then render.
fig.suptitle('Baseline Model Dashboard', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

# Cell 6: Save Model & Metrics

In [None]:
# Make sure both output locations exist before anything is written.
for out_dir in ('../models', '../outputs'):
    os.makedirs(out_dir, exist_ok=True)

# Persist the fitted estimator under both file names (two identical copies).
for model_path in (
    '../models/baseline_logistic_regression.pkl',
    '../models/baseline_for_comparison.pkl',
):
    joblib.dump(baseline_model, model_path)

# Cast every value to a built-in int/float so json can serialise it.
metrics = {
    'accuracy': float(accuracy),
    'precision': float(precision),
    'recall': float(recall),
    'f1_score': float(f1),
    'roc_auc': float(roc_auc),
    'false_positives': int(fp),
    'false_negatives': int(fn),
    'true_positives': int(tp),
    'true_negatives': int(tn),
    'total_cost': float(total_cost),
    'training_time': float(training_time),
}
with open('../outputs/baseline_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

# Closing banner; the target is 70% of the current false-positive count.
print(f"""
{'='*60}
BASELINE COMPLETE!
FP: {fp:,} | FN: {fn} | Cost: ${total_cost:,} | AUC: {roc_auc:.3f}
Challenge: Reduce FP by 30% (target: <{int(fp*0.7):,})
{'='*60}
""")