# Model Training
Training and evaluating machine learning models to predict financial fragility

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle

from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
# Suppress every warning for the rest of the session. No category or module
# filter is given, so this also hides any warnings raised while fitting
# models or computing metrics further down.
warnings.filterwarnings('ignore')

# Seed value: applied to NumPy's global RNG here and passed as `random_state`
# to the estimators constructed later in this file.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("Libraries imported successfully")

## Load Processed Data

In [None]:
# Read the train/test arrays that the preprocessing notebook wrote to disk.
# The four files are loaded in the same order as the names on the left.
X_train_scaled, X_test_scaled, y_train, y_test = (
    np.load(array_path)
    for array_path in (
        '../data/processed/X_train_scaled.npy',
        '../data/processed/X_test_scaled.npy',
        '../data/processed/y_train.npy',
        '../data/processed/y_test.npy',
    )
)

# Feature names are stored as a pickle next to the arrays.
with open('../data/processed/feature_names.pkl', 'rb') as names_file:
    feature_names = pickle.load(names_file)

# Labelled view of the training matrix (one column per feature name).
X = pd.DataFrame(data=X_train_scaled, columns=feature_names)

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")
print(f"Features: {len(feature_names)}")

# Share of each label value in the training target.
train_class_share = pd.Series(y_train).value_counts(normalize=True)
print("\nClass balance in training:")
print(train_class_share)

## Define Models

We'll train and compare multiple models:
1. **Logistic Regression** - Baseline interpretable model
2. **Random Forest** - Handles non-linear relationships
3. **Gradient Boosting** - Often the highest-performing model

In [None]:
# Candidate classifiers keyed by the display name used in printed reports and
# plots. Insertion order is the order in which they are trained.
models = {
    'Logistic Regression': LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
    ),
}

print(f"Models to train: {list(models)}")

## Train and Evaluate Models

In [None]:
# Fit each candidate model, score it on the test split, run 5-fold
# cross-validation on the training split, and collect one metrics row per
# model. `trained_models` keeps the fitted estimator together with its test
# predictions and probabilities for the analysis cells that follow.
results = []
trained_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by system clock adjustments (time.time() can).
    start = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    train_time = time.perf_counter() - start

    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is used as the score for ROC-AUC
    # (presumably the positive class -- confirm against the label encoding).
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # Models without probability output get NaN so the column stays numeric.
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else np.nan

    # Cross-validated F1 on the training data only (the test split is not used here).
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='f1')

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'ROC-AUC': roc_auc,
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std(),
        'Time': train_time
    })

    trained_models[name] = {'model': model, 'predictions': y_pred, 'probabilities': y_pred_proba}

    auc_str = f"{roc_auc:.3f}" if not np.isnan(roc_auc) else "N/A"
    print(f"  Accuracy: {accuracy:.3f} | F1: {f1:.3f} | ROC-AUC: {auc_str}")

# Rank models best-first by test-set F1.
# NOTE(review): ranking on the test split means the "best" model is selected
# using test data; consider ranking by 'CV Mean' if an unbiased test estimate
# is required.
results_df = pd.DataFrame(results).sort_values('F1', ascending=False)
print("\n" + "="*80)
print("Model Performance Summary")
print("="*80)
print(results_df.to_string(index=False))

## Visualize Model Comparison

In [None]:
# One horizontal bar chart per metric, laid out on a 2x2 grid.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))

# axes.flat walks the grid row by row, pairing each panel with one metric.
for ax, metric in zip(axes.flat, metrics):
    results_df.plot(kind='barh', x='Model', y=metric, ax=ax, color='#3498db', legend=False)
    ax.set_title(f'{metric} Comparison')
    ax.set_xlabel(metric)
    ax.set_ylabel('')

plt.tight_layout()
plt.show()

## Detailed Analysis of Best Model

In [None]:
# The first row of results_df names the model to analyse in detail.
best_model_name = results_df['Model'].iloc[0]

# Pull the fitted estimator and its stored test predictions.
best_model_info = trained_models[best_model_name]
best_model, best_predictions = best_model_info['model'], best_model_info['predictions']

print(f"\nDetailed Analysis: {best_model_name}")
best_report = classification_report(
    y_test,
    best_predictions,
    target_names=['Cannot Cover', 'Can Cover'],
)
print(best_report)

### Confusion Matrix

In [None]:
# Confusion matrix of the selected model on the test split, drawn as a
# heatmap with true labels on the y-axis and predicted labels on the x-axis.
cm = confusion_matrix(y_test, best_predictions)
class_labels = ['Cannot Cover', 'Can Cover']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_title(f'Confusion Matrix: {best_model_name}')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.tight_layout()
plt.show()

# Individual cells, indexed as cm[true, predicted].
print(f"\nTrue Negatives: {cm[0, 0]:,}")
print(f"False Positives: {cm[0, 1]:,}")
print(f"False Negatives: {cm[1, 0]:,}")
print(f"True Positives: {cm[1, 1]:,}")

### ROC Curve

In [None]:
# Draw the ROC curve only when probability scores were stored for the model.
best_probabilities = best_model_info['probabilities']
if best_probabilities is not None:
    fpr, tpr, thresholds = roc_curve(y_test, best_probabilities)
    roc_auc = roc_auc_score(y_test, best_probabilities)

    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.3f})', color='#2ecc71', lw=2)
    # Diagonal reference line, labelled 'Random' in the legend.
    plt.plot([0, 1], [0, 1], label='Random', color='gray', linestyle='--', lw=2)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.05)
    plt.title(f'ROC Curve: {best_model_name}')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()

### Feature Importance

In [None]:
# Report the 20 strongest features of the selected model. Estimators exposing
# `feature_importances_` are ranked by importance; estimators exposing `coef_`
# are ranked by absolute coefficient; anything else prints a notice.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': best_model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)
    top_features = feature_importance.head(20)

    print("\nTop 20 Most Important Features:")
    print(top_features.to_string(index=False))

    bar_positions = range(len(top_features))
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, top_features['Importance'], color='#2ecc71')
    plt.yticks(bar_positions, top_features['Feature'])
    plt.xlabel('Importance')
    plt.title(f'Top 20 Features: {best_model_name}')
    # Flip the y-axis so the highest-ranked feature is drawn at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

elif hasattr(best_model, 'coef_'):
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': best_model.coef_[0],
    }).sort_values(by='Coefficient', key=abs, ascending=False)
    top_features = feature_importance.head(20)

    print("\nTop 20 Most Important Features (by coefficient magnitude):")
    print(top_features.to_string(index=False))

    # Red bars for negative coefficients, green for everything else.
    bar_colors = [
        '#e74c3c' if coefficient < 0 else '#2ecc71'
        for coefficient in top_features['Coefficient']
    ]
    bar_positions = range(len(top_features))
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, top_features['Coefficient'], color=bar_colors)
    plt.yticks(bar_positions, top_features['Feature'])
    plt.xlabel('Coefficient')
    plt.title(f'Top 20 Features: {best_model_name}')
    plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
    # Flip the y-axis so the highest-ranked feature is drawn at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    print(f"\nFeature importance not available for {best_model_name}")

## Save Models and Results

In [None]:
# Save best model
with open('../data/processed/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save all trained models
with open('../data/processed/trained_models.pkl', 'wb') as f:
    pickle.dump(trained_models, f)

# Save results dataframe
results_df.to_csv('../data/processed/model_results.csv', index=False)

# Save best model name for next notebook
with open('../data/processed/best_model_name.pkl', 'wb') as f:
    pickle.dump(best_model_name, f)

print(f"\nBest model ({best_model_name}) saved successfully")
print(f"All results saved to ../data/processed/")

## Summary

In [None]:
# Print a recap of the top-ranked model: its test metrics, cross-validated
# F1 (mean and standard deviation) and training time, all read from the
# first row of results_df.
print("=" * 80)
print("MODEL TRAINING SUMMARY")
print("=" * 80)
print(f"\nBest Model: {best_model_name}")
print(f"\nPerformance:")
best_row = results_df.iloc[0]
print(f"  Accuracy:  {best_row['Accuracy']:.4f}")
print(f"  Precision: {best_row['Precision']:.4f}")
print(f"  Recall:    {best_row['Recall']:.4f}")
print(f"  F1-Score:  {best_row['F1']:.4f}")
print(f"  ROC-AUC:   {best_row['ROC-AUC']:.4f}")
# \u00b1 is the plus/minus sign; it is written as an escape so the source file's
# encoding cannot corrupt it again (it had been mangled into a two-character
# mojibake sequence).
print(f"\nCross-validation F1: {best_row['CV Mean']:.4f} (\u00b1{best_row['CV Std']:.4f})")
print(f"\nTraining time: {best_row['Time']:.2f} seconds")
print("\n" + "=" * 80)
print("Ready to proceed to analysis notebook for SHAP and hyperparameter tuning")
print("=" * 80)