In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# ---------------------------------------
# 1. LOAD PERFORMANCE RESULTS FROM CSV
# ---------------------------------------
# Read the saved per-run results (path is relative to the working directory)
# and replace the SMOTE flag column with a legend-friendly text label.
# In Python 1 == True and 0 == False (same hash), so a dict that also lists the
# integer keys 1 and 0 is the very same two-entry mapping as the one below.
# Values that do not match a key come back from .map() as NaN.
df = pd.read_csv("model_results.csv").assign(
    SMOTE=lambda results: results['SMOTE'].map({True: 'SMOTE', False: 'No SMOTE'})
)

# ---------------------------------------
# 2. PLOT BAR CHARTS: TP and FP by Model
# ---------------------------------------
# Draw one grouped bar chart per count column (TP first, then FP): models on
# the x-axis, bars split by the SMOTE label column. Each chart gets its own
# figure and is shown before the next one is created.
for count_column, chart_title in (
    ('TP', "True Positives (TP) by Model and Balancing Method"),
    ('FP', "False Positives (FP) by Model and Balancing Method"),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x='Model', y=count_column, hue='SMOTE')
    plt.title(chart_title)
    plt.xticks(rotation=45)  # tilt the model names on the x-axis
    plt.tight_layout()
    plt.show()

# ---------------------------------------
# 3. HEATMAP: TP vs. FP Distribution
# ---------------------------------------
# Count how many result rows share each (TP, FP) pair, then pivot the inner
# level (FP) into columns so rows are TP values and columns are FP values.
# Pairs that never occur are filled with 0, keeping the table integer-valued
# (required by the "d" annotation format below).
pair_counts = df.groupby(['TP', 'FP']).size()
heatmap_data = pair_counts.unstack(fill_value=0)

plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_data, annot=True, fmt="d", cmap="coolwarm")
plt.title("Heatmap of TP vs. FP")
plt.ylabel("True Positives")
plt.xlabel("False Positives")
plt.tight_layout()
plt.show()

# ---------------------------------------
# 4. TP/FP by Feature Set
# ---------------------------------------
# Draw one grouped bar chart per count column (TP first, then FP): feature
# sets on the x-axis, one bar colour per model. Each chart gets its own
# figure and is shown before the next one is created.
for count_column, chart_title in (
    ('TP', "TP by Feature Set and Model"),
    ('FP', "FP by Feature Set and Model"),
):
    plt.figure(figsize=(14, 6))
    sns.barplot(data=df, x='Feature_Set', y=count_column, hue='Model')
    plt.title(chart_title)
    plt.xticks(rotation=90)  # vertical tick labels for the feature-set names
    plt.tight_layout()
    plt.show()

# ----------------------------------------------------
# 5. LOAD YOUR CLEANED TRAINING DATA (for CV & SMOTE)
# ----------------------------------------------------
# Replace with your actual features and target
# Example: df_all = pd.read_csv("final_dataset.csv")
#          X = df_all.drop("ICU", axis=1)
#          y = df_all["ICU"]

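# NOTE: X and y are placeholders that this section does not define.
# Assign your cleaned feature matrix to X and the ICU target column to y
# (see the example above) before uncommenting the SMOTE and fit calls below.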

# --------------------------------------------
# 6. APPLY SMOTE TO BALANCE TRAINING DATA
# --------------------------------------------
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# --------------------------------------------
# 7. CROSS-VALIDATION + GRID SEARCH
# --------------------------------------------
# Example: XGBoost Fine-Tuning
# Base XGBoost classifier for the grid search. It is seeded with the same
# value as the CV splitter so repeated runs of the search are reproducible.
# NOTE(review): recent XGBoost releases deprecate/ignore `use_label_encoder`;
# confirm against the installed version before dropping the argument.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# 2 x 2 x 2 x 2 = 16 candidate settings.
xgb_params = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'scale_pos_weight': [1, 3],  # Handle imbalance
}

# Stratified, shuffled 5-fold splitter with a fixed seed; with the grid above
# the search runs 16 x 5 = 80 fits, scored by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_xgb = GridSearchCV(
    xgb_model,
    xgb_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# NOTE(review): fitting on data that was SMOTE-resampled *before* the CV split
# lets synthetic points interpolated from validation-fold rows end up in the
# training folds, which inflates the cross-validated AUC. Prefer resampling
# inside each training fold (e.g. an imblearn.pipeline.Pipeline of SMOTE +
# classifier passed to GridSearchCV, with step-prefixed parameter names).
# grid_xgb.fit(X_resampled, y_resampled)
# print("Best XGBoost Params:", grid_xgb.best_params_)
# print("Best AUC:", grid_xgb.best_score_)

# Example: Random Forest Fine-Tuning
# Base Random Forest for the grid search. A forest draws bootstrap samples and
# feature subsets at random, so without a seed every run can select different
# "best" parameters; seed it with the same value used for the CV splitter.
rf_model = RandomForestClassifier(random_state=42)

# 2 x 3 x 1 = 6 candidate settings.
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'class_weight': ['balanced'],
}

# Reuses the `cv` splitter defined earlier in the script; scored by ROC AUC.
grid_rf = GridSearchCV(
    rf_model,
    rf_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# NOTE(review): fitting on data resampled with SMOTE before the CV split leaks
# synthetic neighbours of validation rows into the training folds and inflates
# the reported AUC; resample inside each training fold instead.
# grid_rf.fit(X_resampled, y_resampled)
# print("Best RF Params:", grid_rf.best_params_)
# print("Best RF AUC:", grid_rf.best_score_)
