In [None]:
# Imports
import math
import pickle
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             roc_auc_score, mean_absolute_error, mean_squared_error)
from sklearn.pipeline import Pipeline

## 1. Load Preprocessed Data

In [None]:
# Restore the train/test split and the column lists written by the previous notebook.
# NOTE(review): pickle.load executes code embedded in the file -- only load
# artifacts produced by a trusted run of the preprocessing notebook.
with open('preprocessed_data.pkl', 'rb') as f:
    data = pickle.load(f)

X_train, X_test = data['X_train'], data['X_test']
y_train, y_test = data['y_train'], data['y_test']
numeric_cols, categorical_cols = data['numeric_cols'], data['categorical_cols']

# Restore the preprocessor object that is used as the first step of each pipeline
with open('preprocessor.pkl', 'rb') as f:
    preprocessor = pickle.load(f)

print('Loaded training set:', X_train.shape)
print('Loaded test set:', X_test.shape)

## 2. Train Logistic Regression

In [None]:
# Logistic Regression: the loaded preprocessor followed by the classifier,
# fitted end-to-end on the training split.
lr_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
]
lr_pipeline = Pipeline(steps=lr_steps)

print('Training Logistic Regression...')
lr_pipeline.fit(X_train, y_train)
print('Training completed!')

## 3. Train Random Forest

In [None]:
# Create and train Random Forest pipeline.
# Pipeline does not copy its steps, so passing the `preprocessor` object that
# lr_pipeline already holds would make both pipelines share (and refit) a single
# transformer instance. clone() gives this pipeline its own unfitted copy with
# the same parameters, so the two models stay independent of each other.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

print('Training Random Forest...')
rf_pipeline.fit(X_train, y_train)
print('Training completed!')

## 4. Evaluation Function

In [None]:
def evaluate_model(pipeline, X_test, y_test, model_name='Model'):
    """Evaluate a fitted model on a test set, print a report, and return the metrics.

    Args:
        pipeline: Fitted estimator providing ``predict``. ``predict_proba`` is
            used on a best-effort basis to obtain scores for ROC AUC.
        X_test: Test features passed to ``predict`` / ``predict_proba``.
        y_test: True labels. MAE and RMSE are computed directly between these
            and the predicted labels, so they are only meaningful when the
            labels are numeric.
        model_name: Label shown in the header of the printed report.

    Returns:
        dict with keys ``accuracy``, ``auc`` (``None`` when it could not be
        computed), ``mae``, ``rmse``, ``cm`` (confusion matrix), ``report``
        (classification report text), ``predictions`` (output of ``predict``)
        and ``probabilities`` (``None`` when ``predict_proba`` failed).

    Warns:
        RuntimeWarning: when probabilities or ROC AUC cannot be computed; the
            remaining metrics are still calculated and returned.
    """
    y_pred = pipeline.predict(X_test)

    # Scores for ROC AUC are best-effort: a failure here must not abort the
    # rest of the evaluation, but the cause is reported rather than discarded.
    probs = None
    try:
        # Column 1 is used as the score -- presumably the positive class of a
        # binary problem; confirm before using this with more than two classes.
        probs = pipeline.predict_proba(X_test)[:, 1]
    except Exception as exc:
        warnings.warn(
            f"{model_name}: could not obtain predicted probabilities; "
            f"ROC AUC will be skipped ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )

    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)
    # Error metrics are taken on the predicted labels, not on the probabilities.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    auc = None
    if probs is not None:
        try:
            auc = roc_auc_score(y_test, probs)
        except Exception as exc:
            warnings.warn(
                f"{model_name}: ROC AUC could not be computed ({exc!r})",
                RuntimeWarning,
                stacklevel=2,
            )

    # Print results
    print(f"\n{'='*50}")
    print(f"  {model_name} Evaluation")
    print(f"{'='*50}")
    print(f"Accuracy: {acc:.4f}")
    if auc is not None:
        print(f"ROC AUC: {auc:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"\nConfusion Matrix:\n{cm}")
    print(f"\nClassification Report:\n{report}")

    return {
        'accuracy': acc,
        'auc': auc,
        'mae': mae,
        'rmse': rmse,
        'cm': cm,
        'report': report,
        'predictions': y_pred,
        'probabilities': probs
    }

## 5. Evaluate Both Models

In [None]:
# Score each fitted pipeline on the held-out test split and keep the metric dicts
lr_metrics = evaluate_model(lr_pipeline, X_test, y_test, model_name='Logistic Regression')
rf_metrics = evaluate_model(rf_pipeline, X_test, y_test, model_name='Random Forest')

## 6. Model Comparison & Recommendation

In [None]:
# Compare models
print("\n" + "="*50)
print("  Model Comparison")
print("="*50)

# (display name, metrics dict, fitted pipeline) in reporting order
candidates = [
    ('Logistic Regression', lr_metrics, lr_pipeline),
    ('Random Forest', rf_metrics, rf_pipeline),
]

for candidate_name, candidate_metrics, candidate_pipeline in candidates:
    print(f"\n{candidate_name}:")
    print(f"  Accuracy: {candidate_metrics['accuracy']:.4f}")
    # Compare against None explicitly: a truthiness check would report a
    # legitimately computed AUC of 0.0 as "N/A".
    if candidate_metrics['auc'] is not None:
        print(f"  ROC AUC:  {candidate_metrics['auc']:.4f}")
    else:
        print("  ROC AUC: N/A")

# Determine best model by accuracy. max() returns the first maximal entry, so
# a tie goes to Logistic Regression (listed first).
# NOTE(review): the winner is picked on the same test split its accuracy is
# reported on, so the reported figure is optimistic for the chosen model.
best_name, best_metrics, best_model = max(
    candidates, key=lambda candidate: candidate[1]['accuracy']
)

print(f"\n{'='*50}")
print(f"✓ Recommended model: {best_name}")
print(f"  Accuracy: {best_metrics['accuracy']:.4f}")
print(f"{'='*50}")

## 7. Save Trained Models

In [None]:
# Persist each fitted pipeline, plus the recommended one under a fixed file name
model_artifacts = {
    'lr_model.joblib': lr_pipeline,
    'rf_model.joblib': rf_pipeline,
    'best_model.joblib': best_model,
}
for artifact_path, fitted_pipeline in model_artifacts.items():
    joblib.dump(fitted_pipeline, artifact_path)

# Persist the evaluation results together with the test split they were computed on
metrics_bundle = {
    'lr_metrics': lr_metrics,
    'rf_metrics': rf_metrics,
    'best_model_name': best_name,
    'X_test': X_test,
    'y_test': y_test,
}
with open('model_metrics.pkl', 'wb') as f:
    pickle.dump(metrics_bundle, f)

# Report every file written, in the order it was created
for artifact_path in [*model_artifacts, 'model_metrics.pkl']:
    print(f'Saved: {artifact_path}')