# Modeling - Predict Failure in 24 Hours

Train and evaluate models on the processed data. Focus on **recall** (catching failures) while minimizing false alarms.

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Select matplotlib's 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Read the preprocessed splits from disk (02_preprocessing.ipynb must have been run first).
def _read_target(csv_path):
    """Read a label CSV and return its first column as a Series."""
    return pd.read_csv(csv_path).iloc[:, 0]


X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = _read_target('data/y_train.csv')
y_test = _read_target('data/y_test.csv')

# Feature names stored next to the data files.
feature_cols = json.loads(Path('data/feature_cols.json').read_text())

# Summarise each split: shapes first, then the share of positive labels.
_splits = (("Train", X_train, y_train), ("Test", X_test, y_test))
for _split_name, _split_X, _split_y in _splits:
    print(f"{_split_name}:", _split_X.shape, _split_y.shape)
for _split_name, _split_X, _split_y in _splits:
    print(f"{_split_name} positive rate:", _split_y.mean()*100, "%")

In [None]:
# Make sure both targets are integer Series: if read_csv left a DataFrame,
# keep only its first column, then cast the values to int.
def _squeeze_labels(labels):
    """Return the first column of a DataFrame, or the object unchanged otherwise."""
    return labels.iloc[:, 0] if isinstance(labels, pd.DataFrame) else labels


y_train, y_test = (_squeeze_labels(labels) for labels in (y_train, y_test))
y_train = y_train.astype(int)
y_test = y_test.astype(int)

## Handle Class Imbalance with SMOTE

In [None]:
# Oversample the training split only; the test split is left as loaded.
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Report the size and class balance of the resampled training set.
_smote_summary = [
    "After SMOTE:",
    f"  Train samples: {len(X_train_smote)}",
    f"  Positive rate: {y_train_smote.mean()*100:.1f}%",
]
print("\n".join(_smote_summary))

## Train Random Forest

In [None]:
# Random forest settings; the seed is fixed so reruns give the same forest.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
# Fit on the SMOTE-resampled training data (fit() returns the estimator itself).
model = RandomForestClassifier(**rf_params).fit(X_train_smote, y_train_smote)

# Predict on the test split: hard labels and the probability from the second
# predict_proba column (used as the score for ROC-AUC).
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Compute the metrics first, then print them in a fixed order.
rf_report = classification_report(y_test, y_pred, target_names=['No Failure', 'Failure'])
rf_confusion = confusion_matrix(y_test, y_pred)
rf_auc = roc_auc_score(y_test, y_proba)

print("Classification Report:")
print(rf_report)
print("\nConfusion Matrix:")
print(rf_confusion)
print("\nROC-AUC:", rf_auc)

In [None]:
# Feature importance
# Label the importances with the columns the forest was actually fitted on.
# model.feature_importances_ follows that column order, so taking the names
# from the same frame guarantees every bar carries the right label even if
# feature_cols.json has drifted from the CSV header.
trained_features = list(X_train_smote.columns)
if list(feature_cols) != trained_features:
    print("Warning: feature_cols.json does not match the training columns; "
          "using the training columns for the importance plot.")

imp = pd.DataFrame({'feature': trained_features, 'importance': model.feature_importances_})
imp = imp.sort_values('importance', ascending=False)
ax = imp.plot(x='feature', y='importance', kind='barh', figsize=(8, 5), legend=False)
# barh places the first row at the bottom; flip the axis so the most
# important feature is shown at the top of the chart.
ax.invert_yaxis()
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# ROC curve for the random forest on the test split.
fpr, tpr = roc_curve(y_test, y_proba)[:2]  # thresholds are not needed here
rf_roc_auc = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f'RF (AUC={rf_roc_auc:.3f})')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
fig.tight_layout()
plt.show()

## XGBoost (Optional)

In [None]:
# XGBoost is optional: only the import is guarded, so an ImportError raised
# while training or scoring is not mistaken for "package missing".
try:
    import xgboost as xgb
except ImportError:
    print("XGBoost not installed. pip install xgboost")
else:
    # The model is fitted on the SMOTE-resampled set, which is already
    # class-balanced. scale_pos_weight is therefore left at its default:
    # deriving it from the original imbalanced y_train would up-weight the
    # positive class a second time and push the model toward false alarms.
    xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
    xgb_model.fit(X_train_smote, y_train_smote)

    # Evaluate on the test split with the same metrics as the random forest.
    y_pred_xgb = xgb_model.predict(X_test)
    y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
    print("XGBoost Classification Report:")
    print(classification_report(y_test, y_pred_xgb, target_names=['No Failure', 'Failure']))
    print("ROC-AUC:", roc_auc_score(y_test, y_proba_xgb))

In [None]:
# Persist the fitted random forest (`model`) under models/, creating the
# directory on first use.
import joblib

model_file = 'models/rf_model.joblib'
Path(model_file).parent.mkdir(exist_ok=True)
joblib.dump(model, model_file)
print("Model saved to models/rf_model.joblib")