# In [None]:
# Stroke_Prediction_Starter.ipynb

# ==========================
# 1️⃣ Import Libraries
# ==========================
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ==========================
# 2️⃣ Load Dataset
# ==========================
# Read the raw CSV into a DataFrame. The head() call previews the first rows;
# its return value is only shown in an interactive/notebook session.
DATA_PATH = "data/stroke_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

# ==========================
# 3️⃣ Data Cleaning
# ==========================
# Drop the 'id' column; errors='ignore' makes this a no-op when it is absent.
df = df.drop(columns=['id'], errors='ignore')

# Fill missing BMI values with the column median.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['bmi']: an inplace call on a selected column is chained assignment, which
# pandas 2.x warns about and which does not update df under Copy-on-Write.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Encode categorical features as integer codes.
# Each fitted encoder is kept in label_encoders (keyed by column name) so the
# same category -> code mapping can be reapplied to new data at inference time.
label_cols = ['gender','ever_married','work_type','Residence_type','smoking_status']
label_encoders = {}
for col in label_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# ==========================
# 4️⃣ Exploratory Data Analysis (EDA)
# ==========================
# Heatmap of the pairwise correlations between all columns of df,
# with each coefficient annotated in its cell.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

# Number of records per age value, split by stroke outcome.
sns.countplot(data=df, x='age', hue='stroke')
plt.show()

# ==========================
# 5️⃣ Train-Test Split
# ==========================
# Split BEFORE resampling or scaling so the test set holds only real records
# that never influenced any fitted step. Resampling/scaling the full dataset
# first leaks test information into training and puts synthetic rows into the
# test set, which inflates the reported metrics.
X = df.drop('stroke', axis=1)
y = df['stroke']

# stratify=y keeps the stroke / no-stroke ratio the same in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================
# 6️⃣ Handle Class Imbalance (training data only)
# ==========================
# SMOTE synthesises minority-class rows; it is fit on the training partition
# only, so the test set keeps its original class distribution.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_raw, y_train_raw)

# ==========================
# 7️⃣ Scale Features
# ==========================
# Fit the scaler on the (resampled) training data, then apply the same fitted
# transformation to the test data without refitting.
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)

X_train, y_train = X_res_scaled, y_res
X_test = scaler.transform(X_test_raw)

# ==========================
# 8️⃣ Train Models
# ==========================
# Example model: XGBoost classifier with a fixed seed, fitted on the training split.
xgb_params = {
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "random_state": 42,
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# ==========================
# 9️⃣ Evaluate Model
# ==========================
# Hard class predictions for the test split.
y_pred = model.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC AUC.
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Each header and its metric are printed on separate lines via sep="\n".
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))

# ==========================
# 10️⃣ Feature Importance & SHAP
# ==========================
# Build a SHAP explainer for the trained model. X_train is passed as the second
# argument (presumably used as background/masker data — confirm against the
# installed shap version).
explainer = shap.Explainer(model, X_train)
# Compute SHAP values for the test rows.
shap_values = explainer(X_test)

# Summary plot of the SHAP values. X_test is the scaler's output rather than
# the original DataFrame, so column names are supplied explicitly from X.
shap.summary_plot(shap_values, X_test, feature_names=X.columns)

# ==========================
# 11️⃣ Save Model & Scaler
# ==========================
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists; otherwise the dump fails with FileNotFoundError
# after all the training work has already been done.
os.makedirs("models", exist_ok=True)

# Persist the fitted model and the fitted scaler for reuse at inference time.
joblib.dump(model, "models/xgb_model.pkl")
joblib.dump(scaler, "models/scaler.pkl")

# ==========================
# 12️⃣ Optional: Integrate with Streamlit
# ==========================
# In main_app.py you can load model/scaler and predict on user inputs
