In [3]:
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.14.5
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Revolution Risk Predictor - Model Development
# 
# This notebook handles model training, evaluation, and selection for the revolution risk prediction.

# ## Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, average_precision_score, 
                             confusion_matrix, classification_report, RocCurveDisplay,
                             PrecisionRecallDisplay)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("viridis")

# ## Generate Synthetic Data (if files don't exist)
# This section creates synthetic data if the CSV files are missing

try:
    X_train = pd.read_csv('../data/X_train.csv')
    X_test = pd.read_csv('../data/X_test.csv')
    y_train = pd.read_csv('../data/y_train.csv').iloc[:, 0]
    y_test = pd.read_csv('../data/y_test.csv').iloc[:, 0]
    print("Loaded data from CSV files")
except FileNotFoundError:
    print("Data files not found. Generating synthetic data...")
    
    # Generate synthetic data
    np.random.seed(42)
    n_samples = 1000
    
    # Create features
    X = pd.DataFrame({
        'log_gdp': np.random.normal(10, 1, n_samples),
        'unemployment': np.random.uniform(2, 20, n_samples),
        'youth_pct': np.random.uniform(15, 45, n_samples),
        'internet_pct': np.random.uniform(30, 95, n_samples),
        'polity': np.random.uniform(-10, 10, n_samples),
        'prev_events': np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
    })
    
    # Create target based on features (with some noise)
    risk_score = (
        (10 - X['polity']) * 0.1 +
        X['unemployment'] * 0.05 +
        X['youth_pct'] * 0.03 +
        (100 - X['internet_pct']) * 0.01 +
        (10 / X['log_gdp']) * 2 +
        X['prev_events'] * 0.3 +
        np.random.normal(0, 0.5, n_samples)
    )
    
    y = (risk_score > 2.5).astype(int)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
    
    print("Generated synthetic data")

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution in training set: {pd.Series(y_train).value_counts().to_dict()}")
print(f"Class distribution in test set: {pd.Series(y_test).value_counts().to_dict()}")

# ## Model 1: Logistic Regression

print("Training Logistic Regression model...")

# Initialize and train model
logreg = LogisticRegression(
    random_state=42, 
    class_weight='balanced',
    max_iter=1000
)
logreg.fit(X_train, y_train)

# Make predictions
y_pred_logreg = logreg.predict(X_test)
y_proba_logreg = logreg.predict_proba(X_test)[:, 1]

# Evaluate model
print("Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Precision:", precision_score(y_test, y_pred_logreg))
print("Recall:", recall_score(y_test, y_pred_logreg))
print("F1 Score:", f1_score(y_test, y_pred_logreg))
print("ROC AUC:", roc_auc_score(y_test, y_proba_logreg))
print("PR AUC:", average_precision_score(y_test, y_proba_logreg))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_logreg)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Logistic Regression - Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# ROC Curve
plt.figure(figsize=(8, 6))
RocCurveDisplay.from_estimator(logreg, X_test, y_test)
plt.title('Logistic Regression - ROC Curve')
plt.tight_layout()
plt.show()

# Precision-Recall Curve
plt.figure(figsize=(8, 6))
PrecisionRecallDisplay.from_estimator(logreg, X_test, y_test)
plt.title('Logistic Regression - Precision-Recall Curve')
plt.tight_layout()
plt.show()

# Save model
joblib.dump(logreg, '../models/logistic_regression.joblib')
print("Logistic Regression model saved!")

# ## Model 2: Random Forest

print("\nTraining Random Forest model...")

# Initialize and train model
rf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=100,
    max_depth=10
)
rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

# Evaluate model
print("Random Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf))
print("PR AUC:", average_precision_score(y_test, y_proba_rf))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Random Forest - Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# ROC Curve
plt.figure(figsize=(8, 6))
RocCurveDisplay.from_estimator(rf, X_test, y_test)
plt.title('Random Forest - ROC Curve')
plt.tight_layout()
plt.show()

# Precision-Recall Curve
plt.figure(figsize=(8, 6))
PrecisionRecallDisplay.from_estimator(rf, X_test, y_test)
plt.title('Random Forest - Precision-Recall Curve')
plt.tight_layout()
plt.show()

# Feature Importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Random Forest - Feature Importance')
plt.tight_layout()
plt.show()

# Save model
joblib.dump(rf, '../models/random_forest.joblib')
print("Random Forest model saved!")

# ## Model Comparison

# Create comparison dataframe
models = ['Logistic Regression', 'Random Forest']
accuracy = [accuracy_score(y_test, y_pred_logreg), accuracy_score(y_test, y_pred_rf)]
precision = [precision_score(y_test, y_pred_logreg), precision_score(y_test, y_pred_rf)]
recall = [recall_score(y_test, y_pred_logreg), recall_score(y_test, y_pred_rf)]
f1 = [f1_score(y_test, y_pred_logreg), f1_score(y_test, y_pred_rf)]
roc_auc = [roc_auc_score(y_test, y_proba_logreg), roc_auc_score(y_test, y_proba_rf)]
pr_auc = [average_precision_score(y_test, y_proba_logreg), average_precision_score(y_test, y_proba_rf)]

comparison_df = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'ROC AUC': roc_auc,
    'PR AUC': pr_auc
}).set_index('Model')

print("Model Comparison:")
print(comparison_df.round(3))

# Visual comparison
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']
comparison_df[metrics].plot(kind='bar', figsize=(12, 6))
plt.title('Model Comparison - Performance Metrics')
plt.xticks(rotation=0)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# ## Select Best Model

# Based on PR AUC (most important for imbalanced classification)
best_model_name = comparison_df['PR AUC'].idxmax()
best_model = logreg if best_model_name == 'Logistic Regression' else rf

print(f"Best model based on PR AUC: {best_model_name}")
print(f"PR AUC: {comparison_df.loc[best_model_name, 'PR AUC']:.3f}")

# Save best model
joblib.dump(best_model, '../models/best_model.joblib')
print("Best model saved!")

print("\nModel development completed!")

Loaded data from CSV files
Training set: (724, 6)
Test set: (406, 6)
Class distribution in training set: {1: 690, 0: 37}
Class distribution in test set: {1: 390, 0: 13}
Training Logistic Regression model...


ValueError: Found input variables with inconsistent numbers of samples: [724, 727]