# In [None]:
# 🧠 Model Development & Evaluation

# 📦 Imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib

# Load data
# Directory prefix for the saved arrays; kept as a plain string (with its
# trailing slash) because file names are appended to it directly.
DATA_DIR = 'data/processed/'

# Full-feature train/test arrays.
X_train = np.load(f'{DATA_DIR}X_train.npy')
y_train = np.load(f'{DATA_DIR}y_train.npy')
X_test = np.load(f'{DATA_DIR}X_test.npy')
y_test = np.load(f'{DATA_DIR}y_test.npy')

# PCA feature arrays; they are paired with the same y_train / y_test labels
# further down.
X_train_pca = np.load(f'{DATA_DIR}X_train_pca.npy')
X_test_pca = np.load(f'{DATA_DIR}X_test_pca.npy')

# 📊 Evaluation Function
def evaluate_model(model, X, y_true):
    """Print classification metrics and plot a confusion matrix and ROC curve.

    Args:
        model: Fitted classifier exposing ``predict``. If it also exposes
            ``predict_proba`` or ``decision_function``, that output is used
            as the continuous score for the ROC computations.
        X: Feature matrix passed to the model's prediction methods.
        y_true: Ground-truth labels corresponding to the rows of ``X``.

    Returns:
        None. Results are printed and shown as matplotlib figures.

    Note:
        Column 1 of ``predict_proba`` is taken as the positive-class score,
        so the ROC part assumes a binary problem -- TODO confirm for any
        multi-class use.
    """
    y_pred = model.predict(X)

    # ROC metrics need a continuous score. Prefer probabilities, then the
    # raw decision values; hard labels are only a last resort because they
    # collapse the ROC curve to a single operating point.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X)
    else:
        y_score = y_pred

    print(classification_report(y_true, y_pred))
    print("ROC-AUC:", roc_auc_score(y_true, y_score))

    # Open an explicit figure per plot so the ROC curve can never be drawn
    # onto the heatmap's axes on backends where show() leaves the previous
    # figure current.
    cm = confusion_matrix(y_true, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC Curve')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.grid(True)
    plt.show()

# ✅ Train and Evaluate on Full Feature Set
# Each stanza prints a banner, fits one classifier on X_train / y_train and
# evaluates it on X_test / y_test. ``fit`` returns the estimator itself, so
# construction and fitting are chained into a single assignment.
print("=== Logistic Regression (Full) ===")
lr_full = LogisticRegression(max_iter=1000).fit(X_train, y_train)
evaluate_model(lr_full, X_test, y_test)

print("=== Random Forest (Full) ===")
rf_full = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
evaluate_model(rf_full, X_test, y_test)

print("=== XGBoost (Full) ===")
# NOTE(review): `use_label_encoder` appears to be deprecated in recent XGBoost
# releases -- confirm against the installed version before removing it.
xgb_full = XGBClassifier(use_label_encoder=False, eval_metric='logloss').fit(X_train, y_train)
evaluate_model(xgb_full, X_test, y_test)

# ✅ Train and Evaluate on PCA-Reduced Feature Set
# Same three classifiers, now fitted on X_train_pca and evaluated on
# X_test_pca; the labels (y_train / y_test) are unchanged. ``fit`` returns
# the estimator itself, so construction and fitting are chained.
print("=== Logistic Regression (PCA) ===")
lr_pca = LogisticRegression(max_iter=1000).fit(X_train_pca, y_train)
evaluate_model(lr_pca, X_test_pca, y_test)

print("=== Random Forest (PCA) ===")
rf_pca = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_pca, y_train)
evaluate_model(rf_pca, X_test_pca, y_test)

print("=== XGBoost (PCA) ===")
# NOTE(review): `use_label_encoder` appears to be deprecated in recent XGBoost
# releases -- confirm against the installed version before removing it.
xgb_pca = XGBClassifier(use_label_encoder=False, eval_metric='logloss').fit(X_train_pca, y_train)
evaluate_model(xgb_pca, X_test_pca, y_test)

# 💾 Save best models
# Persist both fitted XGBoost classifiers under DATA_DIR.
# NOTE(review): these two models are written unconditionally; no metric-based
# selection happens in this script.
joblib.dump(xgb_full, f'{DATA_DIR}xgb_full_model.joblib')
joblib.dump(xgb_pca, f'{DATA_DIR}xgb_pca_model.joblib')
print("✅ Models saved!")
