In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    roc_auc_score, roc_curve, auc,
    precision_score, recall_score, f1_score, accuracy_score
)
import joblib
import json
import time
import os
import sys

# Add parent directory to path for imports.
# Guarded so that re-running this cell does not keep appending duplicate
# '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including scikit-learn's ConvergenceWarning. Consider narrowing the filter
# (e.g. to FutureWarning/DeprecationWarning) if solver diagnostics matter.
warnings.filterwarnings('ignore')

# Set visualization style.
# Matplotlib 3.6+ ships the seaborn styles under 'seaborn-v0_8-*'; older
# releases only know the un-prefixed name. Pick whichever is installed so the
# cell does not fail with OSError on an older Matplotlib. If neither exists,
# the default Matplotlib style is left in place.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Environment ready for baseline model training!")
print(f"Working directory: {os.getcwd()}")

In [None]:
print("Loading preprocessed data from Segment 1...")

# Load the preprocessed dataset together with the saved train/test indices
df = pd.read_csv('../data/preprocessed_creditcard.csv')
train_indices = np.load('../data/train_indices.npy')
test_indices = np.load('../data/test_indices.npy')

# Leakage guard: a row that appears in both index files would be trained on
# and then scored, silently inflating every metric computed downstream.
overlap_count = np.intersect1d(train_indices, test_indices).size
if overlap_count:
    raise ValueError(
        f"Train and test indices overlap on {overlap_count} rows; "
        "regenerate the split before training."
    )

print(f"✓ Data loaded successfully")
print(f"  Dataset shape: {df.shape}")

# Split features and target ('Class' is the label column)
X = df.drop('Class', axis=1)
y = df['Class']

# Use saved indices for consistent train/test split.
# The indices are applied positionally (.iloc), so they must refer to row
# positions in the CSV as loaded above; out-of-range values raise IndexError.
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
X_test = X.iloc[test_indices]
y_test = y.iloc[test_indices]

print(f"\nDataset Statistics:")
print(f"  Training set: {len(X_train):,} samples ({y_train.sum()} fraud cases)")
print(f"  Test set: {len(X_test):,} samples ({y_test.sum()} fraud cases)")
print(f"  Train fraud rate: {y_train.mean()*100:.3f}%")
print(f"  Test fraud rate: {y_test.mean()*100:.3f}%")
print(f"  Features: {X_train.shape[1]}")

In [None]:
print("\n" + "="*60)
print("TRAINING BASELINE MODEL")
print("="*60)

# Use these exact parameters for reproducibility and speed
baseline_model = LogisticRegression(
    random_state=42,
    max_iter=100,  # Keep low for speed
    solver='liblinear',  # Fast for binary classification
    class_weight='balanced'  # Handle class imbalance
)

# Time the training.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump mid-fit.
print("Training Logistic Regression model...")
start_time = time.perf_counter()
baseline_model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

print(f"✓ Model trained in {training_time:.2f} seconds")

# Surface non-convergence explicitly: the notebook silences all warnings, so
# scikit-learn's ConvergenceWarning would otherwise never be seen. Reaching
# the max_iter cap means the solver stopped before meeting its tolerance.
if np.max(baseline_model.n_iter_) >= baseline_model.max_iter:
    print(f"⚠ Solver hit the max_iter={baseline_model.max_iter} cap; "
          "the model may not have fully converged")

# Generate predictions: hard labels plus the fraud-class probability
# (column 1 of predict_proba)
y_pred = baseline_model.predict(X_test)
y_pred_proba = baseline_model.predict_proba(X_test)[:, 1]

print("✓ Predictions generated")

In [None]:
# Calculate all metrics.
# labels=[0, 1] pins the matrix to 2x2 in (legitimate, fraud) order even when
# one class is absent from both y_test and y_pred; without it the matrix can
# collapse to 1x1 and the four-way unpack below raises ValueError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC curve, built from the fraud-class probabilities rather than hard labels
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Business metrics: a simple linear cost model over the two error types
cost_per_fp = 10  # Customer service cost
cost_per_fn = 100  # Average fraud loss
total_cost = fp * cost_per_fp + fn * cost_per_fn

print(f"\n📊 Quick Performance Summary:")
print(f"  Accuracy: {accuracy:.2%}")
print(f"  Precision: {precision:.2%}")
print(f"  Recall: {recall:.2%}")
print(f"  F1-Score: {f1:.2%}")
print(f"  ROC-AUC: {roc_auc:.3f}")
print(f"\n💰 Business Impact:")
print(f"  False Positives: {fp:,} (${fp * cost_per_fp:,} cost)")
print(f"  False Negatives: {fn} (${fn * cost_per_fn:,} cost)")
print(f"  Total Cost: ${total_cost:,}")

In [None]:
print("\n📈 Creating Business Impact Dashboard...")

# Create comprehensive evaluation figure: a 2x3 grid whose bottom-right two
# cells are merged into a single text panel.
fig = plt.figure(figsize=(20, 10))

# Subplot 1: Confusion Matrix Heatmap
ax1 = plt.subplot(2, 3, 1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Legitimate', 'Fraud'],
            yticklabels=['Legitimate', 'Fraud'],
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix\nBaseline Model', fontsize=14, fontweight='bold')
plt.ylabel('Actual')
plt.xlabel('Predicted')

# Subplot 2: Business Cost Analysis
ax2 = plt.subplot(2, 3, 2)
categories = ['False Positives\n(Angry Customers)', 'False Negatives\n(Money Lost)', 
              'True Positives\n(Fraud Caught)', 'True Negatives\n(Happy Customers)']
values = [fp, fn, tp, tn]
colors = ['orange', 'red', 'green', 'lightgreen']
bars = plt.bar(categories, values, color=colors, edgecolor='black', linewidth=1)
plt.title('Business Impact Metrics', fontsize=14, fontweight='bold')
plt.ylabel('Count')
plt.xticks(rotation=0)

# Add value labels on bars, lifted by 1% of the tallest bar so the text
# does not touch the bar edge
for bar, val in zip(bars, values):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, height + max(values)*0.01, 
             f'{val:,}', ha='center', fontweight='bold', fontsize=10)

# Add cost annotation box (top-right corner, in axes coordinates).
# NOTE(review): Matplotlib treats text containing an even number of unescaped
# '$' as mathtext; this string has three, so it renders literally. Keep that
# in mind when adding or removing dollar amounts here.
cost_text = f'Total Cost: ${total_cost:,}\n' \
            f'FP Cost: ${fp * cost_per_fp:,}\n' \
            f'FN Cost: ${fn * cost_per_fn:,}'
plt.text(0.98, 0.97, cost_text, transform=ax2.transAxes, 
         fontsize=11, verticalalignment='top', horizontalalignment='right',
         bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.7))

# Subplot 3: ROC Curve
ax3 = plt.subplot(2, 3, 3)
plt.plot(fpr, tpr, 'b-', label=f'ROC curve (AUC = {roc_auc:.3f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'r--', label='Random classifier', alpha=0.5)

# Mark current operating point (threshold = 0.5).
# The point is computed from the confusion-matrix counts of the default
# predictions instead of being looked up in `thresholds`: roc_curve keeps only
# a subset of thresholds, so the retained value closest to 0.5 can be far from
# 0.5 and would put the marker in the wrong place.
operating_fpr = fp / (fp + tn)
operating_tpr = tp / (tp + fn)
plt.plot(operating_fpr, operating_tpr, 'go', 
         markersize=10, label=f'Current threshold (0.5)')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Analysis', fontsize=14, fontweight='bold')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)

# Subplot 4: Threshold Analysis
ax4 = plt.subplot(2, 3, 4)
thresholds_to_test = [0.3, 0.4, 0.5, 0.6, 0.7]
fps_list = []
fns_list = []

for thresh in thresholds_to_test:
    y_pred_thresh = (y_pred_proba >= thresh).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 so the [0, 1] / [1, 0] lookups below
    # are always valid, whatever the threshold does to the predictions
    cm_thresh = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])
    fps_list.append(cm_thresh[0, 1])
    fns_list.append(cm_thresh[1, 0])

plt.plot(thresholds_to_test, fps_list, 'o-', label='False Positives', 
         color='orange', linewidth=2, markersize=8)
plt.plot(thresholds_to_test, fns_list, 's-', label='False Negatives', 
         color='red', linewidth=2, markersize=8)
plt.xlabel('Probability Threshold')
plt.ylabel('Error Count')
plt.title('Threshold Impact on Errors', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
# Mark default threshold
plt.axvline(x=0.5, color='green', linestyle='--', alpha=0.5, linewidth=2)
plt.text(0.51, max(max(fps_list), max(fns_list))*0.9, 'Default', 
         color='green', fontweight='bold')

# Subplot 5-6: Performance Summary Box (spans 2 subplots)
ax5 = plt.subplot(2, 3, (5, 6))
ax5.axis('off')

summary_text = f"""
{'='*55}
         BASELINE MODEL PERFORMANCE
{'='*55}

Classification Metrics:
  • Accuracy: {accuracy:.2%}
  • Precision (Fraud): {precision:.2%}
  • Recall (Fraud): {recall:.2%}
  • F1-Score (Fraud): {f1:.2%}
  • ROC-AUC: {roc_auc:.3f}

Business Impact:
  • False Positives: {fp:,} transactions
    → {fp:,} angry customers!
    → ${fp * cost_per_fp:,} in service costs
  
  • False Negatives: {fn} transactions  
    → ${fn * cost_per_fn:,} in losses!
    
  • Total Financial Impact: ${total_cost:,}
  • Fraud Detection Rate: {recall:.1%}
  • Customer Friction Rate: {fp/(tn+fp)*100:.2f}%

Model Details:
  • Algorithm: Logistic Regression (balanced)
  • Training Time: {training_time:.2f} seconds
  • Features Used: {X_train.shape[1]}
"""

# Monospace font keeps the bullet/arrow indentation of the summary aligned
ax5.text(0.5, 0.5, summary_text, transform=ax5.transAxes, 
         fontsize=11, ha='center', va='center',
         bbox=dict(boxstyle='round,pad=1', facecolor='lightblue', alpha=0.8),
         family='monospace')

plt.suptitle('Baseline Fraud Detection Model - Performance Dashboard', 
             fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

print("✓ Dashboard created successfully")

In [None]:
print("\n💾 Saving model and metrics...")

# Make sure both artifact folders are present before anything is written
for output_dir in ('../models', '../outputs'):
    os.makedirs(output_dir, exist_ok=True)

# Persist the fitted estimator
model_path = '../models/baseline_logistic_regression.pkl'
joblib.dump(baseline_model, model_path)
print(f"✓ Model saved to {model_path}")

# Headline numbers kept for the Segment 3 comparison. Values are cast to
# plain Python float/int so they serialize to JSON; the keyword order below
# is the key order of the written file.
metrics = dict(
    accuracy=float(accuracy),
    precision=float(precision),
    recall=float(recall),
    f1_score=float(f1),
    roc_auc=float(roc_auc),
    false_positives=int(fp),
    false_negatives=int(fn),
    true_positives=int(tp),
    true_negatives=int(tn),
    total_cost=float(total_cost),
    training_time=float(training_time),
    customer_friction_rate=float(fp/(tn+fp)*100),
    fraud_detection_rate=float(recall*100),
)

metrics_path = '../outputs/baseline_metrics.json'
with open(metrics_path, 'w') as f:
    f.write(json.dumps(metrics, indent=2))
print(f"✓ Metrics saved to {metrics_path}")

# Second copy of the same estimator under the name Segment 3 loads for its
# comparison
joblib.dump(baseline_model, '../models/baseline_for_comparison.pkl')
print("✓ Comparison model saved")

In [None]:
# Create a brief summary report (Markdown) from the metrics computed earlier.

# Derive the FN:FP cost ratio from the cost constants instead of hard-coding
# it, so the "Key Insights" line stays correct if either constant is changed.
cost_ratio = cost_per_fn / cost_per_fp

report = f"""
# Baseline Model Report

## Model Performance
- **Algorithm**: Logistic Regression with balanced class weights
- **Training Time**: {training_time:.2f} seconds
- **ROC-AUC Score**: {roc_auc:.3f}

## Business Metrics
- **False Positives**: {fp:,} (customers incorrectly blocked)
- **False Negatives**: {fn} (frauds missed)
- **Estimated Loss**: ${total_cost:,}

## Key Insights
- The model catches {recall:.1%} of fraud cases
- {fp/(tn+fp)*100:.2f}% of legitimate transactions are incorrectly flagged
- Each false negative costs ~{cost_ratio:g}x more than a false positive

## Next Steps
- Improve feature engineering
- Try more sophisticated algorithms
- Optimize the decision threshold
"""

report_path = '../outputs/baseline_report.md'
# Create the target folder here as well so this cell does not depend on
# another cell having made it first.
os.makedirs(os.path.dirname(report_path), exist_ok=True)
# Explicit encoding keeps the written bytes independent of the platform's
# default locale.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)
# Print the path that was actually written
print(f"✓ Report saved to {report_path}")

In [None]:
# Closing banner: recap the baseline numbers and state the Segment 3 targets.
# The text is assembled as a list of lines and emitted with a single print;
# the empty entries reproduce the blank lines before the first divider and
# before the final "Ready" line.
divider = "=" * 60
closing_lines = [
    "",
    divider,
    "🎯 BASELINE MODEL COMPLETE!",
    divider,
    "Summary:",
    "  • Model: Logistic Regression",
    f"  • Training Time: {training_time:.2f}s",
    f"  • False Positives: {fp:,} transactions",
    f"  • False Negatives: {fn} transactions",
    f"  • Total Cost: ${total_cost:,}",
    f"  • ROC-AUC: {roc_auc:.3f}",
    divider,
    "Challenge for Segment 3:",
    # Target is 70% of the current false-positive count, truncated to an int
    f"  → Reduce false positives by 30% (target: <{int(fp*0.7):,})",
    f"  → Maintain or improve recall (current: {recall:.1%})",
    divider,
    "",
    "✅ Ready for Segment 3: Model Improvement!",
]
print("\n".join(closing_lines))