<a href="https://colab.research.google.com/github/ali-sdg/los_final_project/blob/main/final_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %% [1] LIBRARIES AND SETTINGS
# Imports the libraries used by the notebook and configures the environment.
# !pip install shap imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shap  # Used for model interpretability

# Google Colab Drive connection: mount only when /content/drive does not exist yet
from google.colab import drive
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Scikit-Learn modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# NOTE: enable_iterative_imputer has to be imported before IterativeImputer,
# because IterativeImputer is still an experimental scikit-learn API.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.calibration import calibration_curve
from sklearn.metrics import (classification_report, roc_auc_score, roc_curve,
                             confusion_matrix, accuracy_score, precision_recall_curve,
                             auc, f1_score, brier_score_loss)

# SMOTE for imbalanced-data handling (flagged as important: as stated in the report)
from imblearn.over_sampling import SMOTE

# Modeling algorithms
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

# Plot settings: 300 dpi figures and the seaborn "whitegrid" style
plt.rcParams['figure.dpi'] = 300
sns.set_style("whitegrid")

# %% [2] DATA LOADING AND PREPROCESSING
# Reads the raw dataset from the mounted Drive and applies basic cleaning.
# NOTE(review): the user-facing strings in this file look mis-encoded
# (mojibake); they are kept verbatim here -- confirm the file encoding.
FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/final project/MIMIC_ICU_Ultimate_Raw.csv'
print(">>> [1/7] Veri YÃ¼kleniyor ve Ã–n Ä°ÅŸleniyor...")

# Load the CSV and immediately discard "ghost" records, i.e. rows that have
# no HeartRate value.
df = pd.read_csv(FILE_PATH).dropna(subset=['HeartRate'])

# %% [3] ADVANCED CLINICAL FEATURE ENGINEERING
# Computes the clinical markers and ratios described in the report and defines
# the binary length-of-stay target (LOS_Label).
print(">>> [2/7] Klinik Ã–znitelik MÃ¼hendisliÄŸi YapÄ±lÄ±yor...")

# Outlier cleaning: mask implausible vitals as missing.
# This runs BEFORE the derived features so that Shock_Index, Pulse_Pressure and
# Calculated_MAP are never computed from a measurement that is discarded here.
df.loc[(df['HeartRate'] < 20) | (df['HeartRate'] > 300), 'HeartRate'] = np.nan
df.loc[(df['SysBP'] < 40) | (df['SysBP'] > 300), 'SysBP'] = np.nan

# A. Hemodynamic indices (the 1e-5 term keeps the denominators away from exactly zero)
df['Shock_Index'] = df['HeartRate'] / (df['SysBP'] + 1e-5)
df['Pulse_Pressure'] = df['SysBP'] - df['DiasBP']
df['Calculated_MAP'] = (df['SysBP'] + (2 * df['DiasBP'])) / 3
df['BUN_Creatinine_Ratio'] = df['BUN'] / (df['Creatinine'] + 1e-5)
df['Age_CCI_Interaction'] = df['AGE'] * df['CCI_Score']

# B. Dynamic fluctuations (max - min). Both columns of a pair must be present;
# checking only *_Max would raise KeyError when the matching *_Min is absent.
if {'SysBP_Max', 'SysBP_Min'}.issubset(df.columns):
    df['SysBP_Fluctuation'] = df['SysBP_Max'] - df['SysBP_Min']
if {'HeartRate_Max', 'HeartRate_Min'}.issubset(df.columns):
    df['HeartRate_Fluctuation'] = df['HeartRate_Max'] - df['HeartRate_Min']

# C. Urine transformations
if 'Urine_Output' in df.columns:
    urine = df['Urine_Output']
    # Keep missing urine output missing in both derived features. Previously a
    # missing value became flag = 0 ("not low") and log = 0 ("no output"),
    # which contradict each other; leaving NaN lets the later imputation step
    # handle these rows like any other missing value.
    df['Low_Urine_Flag'] = (urine < 500).astype(float).where(urine.notna())
    df['Log_Urine'] = np.log1p(urine)

# Any infinities produced by the ratios above are treated as missing.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Target definition: LOS_Label is 1 when LOS exceeds THRESHOLD_DAYS, else 0.
# Rows without a LOS value are dropped first, because `NaN > 5` evaluates to
# False and would silently label them as the negative class.
THRESHOLD_DAYS = 5
df = df.dropna(subset=['LOS'])
df['LOS_Label'] = (df['LOS'] > THRESHOLD_DAYS).astype(int)

# %% [4] MISSING-DATA IMPUTATION (MICE) AND SCALING
# Builds the train/test matrices used for modeling.
# The data are split FIRST; the sparse-column filter, the imputer and the
# scaler are then fitted on the training rows only and merely applied to the
# test rows. Fitting them on the full dataset (as before) leaks test-set
# information into the training features and inflates the reported metrics.
print(">>> [3/7] MICE ile Eksik Veri Tamamlama ve Ã–lÃ§eklendirme...")

# Separate features (X) from the target (y). Identifier columns, the LOS
# columns the target is derived from, HOSPITAL_EXPIRE_FLAG and CRP are
# excluded from the features when present.
meta_cols = ['HADM_ID', 'SUBJECT_ID', 'LOS', 'LOS_Label', 'HOSPITAL_EXPIRE_FLAG', 'CRP']
X = df.drop(columns=[c for c in meta_cols if c in df.columns])
y = df['LOS_Label']

# Stratified train/test split on the raw (un-imputed, un-scaled) features
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Drop very sparse columns (> 40 % missing), judged on the training rows only
missing_ratio = X_train_raw.isnull().mean()
sparse_cols = missing_ratio[missing_ratio > 0.40].index
X = X.drop(columns=sparse_cols)
X_train_raw = X_train_raw.drop(columns=sparse_cols)
X_test_raw = X_test_raw.drop(columns=sparse_cols)

# MICE (Multivariate Imputation by Chained Equations), fitted on train only
imputer = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, n_jobs=-1), max_iter=5, random_state=42)
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Standardization (scaling), fitted on train only
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_imputed),
                       columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_imputed),
                      columns=X.columns, index=X_test_raw.index)

# Full-dataset views kept only so code that still refers to X_imputed /
# X_scaled keeps working. They are produced with the train-fitted transformers
# and must not be used to fit anything.
_X_imputed_values = imputer.transform(X)
X_imputed = pd.DataFrame(_X_imputed_values, columns=X.columns, index=X.index)
X_scaled = pd.DataFrame(scaler.transform(_X_imputed_values), columns=X.columns, index=X.index)

# %% [5] SMOTE AND MODEL TRAINING (ENSEMBLE LEARNING)
# Balances the training split with SMOTE and fits a soft-voting ensemble of
# XGBoost, LightGBM and Random Forest on the balanced data.
print(">>> [4/7] SMOTE UygulanÄ±yor ve Topluluk Modeli EÄŸitiliyor...")

# --- SMOTE (added to match the report) ---
# Only X_train / y_train are resampled; the test split is left untouched.
print("    - SMOTE ile Ã¶rneklem artÄ±rma iÅŸlemi...")
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
print(f"    - Orijinal EÄŸitim Boyutu: {y_train.shape}, DengelenmiÅŸ EÄŸitim Boyutu: {y_train_bal.shape}")

# Hyper-parameters of the three base learners
xgb_params = dict(
    n_estimators=500, learning_rate=0.03, max_depth=7,
    subsample=0.8, colsample_bytree=0.7, eval_metric='auc',
    random_state=42, n_jobs=-1,
)
lgb_params = dict(
    n_estimators=500, learning_rate=0.03, num_leaves=35,
    random_state=42, verbose=-1,
)
rf_params = dict(
    n_estimators=500, max_depth=15, min_samples_leaf=2,
    random_state=42, n_jobs=-1,
)

xgb_clf = XGBClassifier(**xgb_params)
lgb_clf = lgb.LGBMClassifier(**lgb_params)
rf_clf = RandomForestClassifier(**rf_params)

# Soft-voting ensemble. The XGBoost model is deliberately listed first.
base_learners = [('xgb', xgb_clf), ('lgb', lgb_clf), ('rf', rf_clf)]
ensemble = VotingClassifier(estimators=base_learners, voting='soft')

# Train on the BALANCED (SMOTE-resampled) data
ensemble.fit(X_train_bal, y_train_bal)

# Test-set predictions: keep the second predict_proba column as y_prob
test_probabilities = ensemble.predict_proba(X_test)
y_prob = test_probabilities[:, 1]

# %% [6] SAFETY-FIRST EVALUATION
# Applies the decision threshold to the predicted probabilities, computes the
# evaluation metrics and prints them as two markdown tables.
# NOTE(review): the printed strings and table labels look mis-encoded
# (mojibake); they are kept verbatim here -- confirm the file encoding.
print(">>> [5/7] GÃ¼venlik Ã–ncelikli EÅŸik DeÄŸeri (0.38) UygulanÄ±yor...")
SAFETY_THRESHOLD = 0.38
y_pred_safe = (y_prob >= SAFETY_THRESHOLD).astype(int)

# --- Threshold-dependent metrics, derived from the confusion matrix ---
cm = confusion_matrix(y_test, y_pred_safe)
tn, fp, fn, tp = cm.ravel()

actual_positives = tp + fn
actual_negatives = tn + fp
predicted_positives = tp + fp

recall_long = tp / actual_positives        # Sensitivity
specificity = tn / actual_negatives        # Specificity
precision_long = tp / predicted_positives
f1_long = 2 * (precision_long * recall_long) / (precision_long + recall_long)

# Appendix metrics that also depend on the thresholded predictions
f1_macro = f1_score(y_test, y_pred_safe, average='macro')
f1_weighted = f1_score(y_test, y_pred_safe, average='weighted')

# --- Threshold-free metrics, computed from the probabilities ---
roc_auc = roc_auc_score(y_test, y_prob)
brier = brier_score_loss(y_test, y_prob)

# PR-AUC is the area under the precision-recall curve as computed by auc()
precision_curve_vals, recall_curve_vals, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(recall_curve_vals, precision_curve_vals)

# --- Printing the results ---
banner = "=" * 50

print("\n" + banner)
print("      ðŸ“Š BÃ–LÃœM 1: ANA RAPOR SONUÃ‡LARI       ")
print(banner)
main_rows = [
    ('DuyarlÄ±lÄ±k (Sensitivity)', recall_long),
    ('Ã–zgÃ¼llÃ¼k (Specificity)', specificity),
    ('F1-Skoru (Uzun KalÄ±ÅŸ)', f1_long),
    ('ROC-AUC', roc_auc),
    ('PR-AUC', pr_auc),
]
results_main = pd.DataFrame(main_rows, columns=['Metrik', 'DeÄŸer'])
print(results_main.to_markdown(index=False, floatfmt=".4f"))

print("\n" + banner)
print("      ðŸ“‚ BÃ–LÃœM 2: EK SONUÃ‡LAR (APPENDIX)       ")
print(banner)
appendix_rows = [
    ('Brier Skoru', brier),
    ('Makro F1', f1_macro),
    ('AÄŸÄ±rlÄ±klÄ± F1', f1_weighted),
]
results_appendix = pd.DataFrame(appendix_rows, columns=['Metrik', 'DeÄŸer'])
print(results_appendix.to_markdown(index=False, floatfmt=".4f"))

# %% [7] VISUALIZATION
# Produces the publication-style figures.
print(">>> [6/7] Grafikler Ã‡iziliyor...")

# FIGURE 1: Confusion matrix
# COLOR: the 'Blues' colormap is used to match Fig. 4 in the report.
plt.figure(figsize=(6, 5))
group_names = ['GerÃ§ek Neg','YanlÄ±ÅŸ Poz','YanlÄ±ÅŸ Neg','GerÃ§ek Poz']
# One annotation per cell: the group name on the first line, the count below.
# cm.flatten() walks the matrix row by row, in the same order as group_names.
cell_counts = [f"{count:0.0f}" for count in cm.flatten()]
labels = np.asarray(
    [f"{name}\n{count}" for name, count in zip(group_names, cell_counts)]
).reshape(2, 2)
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues', cbar=False, annot_kws={"size": 12})
plt.title(f'GÃ¼venlik Ã–ncelikli KarmaÅŸÄ±klÄ±k Matrisi (EÅŸik={SAFETY_THRESHOLD})', fontsize=14, weight='bold')
plt.xlabel('Tahmin Edilen Durum')
plt.ylabel('GerÃ§ek Durum')
plt.show()

# FIGURE 2: ROC and precision-recall curves, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax1, ax2 = axes

# Left panel: ROC curve with the chance diagonal as a dashed black line
fpr, tpr, _ = roc_curve(y_test, y_prob)
ax1.plot(fpr, tpr, color='#2c3e50', lw=3, label=f'Topluluk Modeli (AUC = {roc_auc:.3f})')
ax1.plot([0, 1], [0, 1], 'k--')
ax1.set_title('ROC EÄŸrisi', weight='bold')
ax1.set(xlabel='YanlÄ±ÅŸ Pozitif OranÄ±', ylabel='GerÃ§ek Pozitif OranÄ±')
ax1.legend(loc='lower right')

# Right panel: PR curve. COLOR: purple (#8e44ad) kept to match Fig. 5b in the report.
ax2.plot(recall_curve_vals, precision_curve_vals, color='#8e44ad', lw=3, label=f'PR-AUC = {pr_auc:.3f}')
ax2.set_title('Kesinlik-DuyarlÄ±lÄ±k (PR) EÄŸrisi', weight='bold')
ax2.set(xlabel='DuyarlÄ±lÄ±k (Recall)', ylabel='Kesinlik (Precision)')
ax2.legend(loc='lower left')

plt.tight_layout()
plt.show()

# FIGURE 3: Calibration plot
# COLOR: green (#27ae60) kept to match Fig. 6 in the report.
# calibration_curve bins the predicted probabilities (10 bins) and returns the
# observed positive fraction and the mean predicted probability per bin.
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)

plt.figure(figsize=(7, 7))
plt.plot(prob_pred, prob_true, marker='o', linewidth=2, label='Topluluk Modeli', color='#27ae60')
# Reference diagonal: predicted probability equals observed frequency
diagonal = [0, 1]
plt.plot(diagonal, diagonal, linestyle='--', color='gray', label='MÃ¼kemmel Kalibrasyon')
plt.title('Kalibrasyon GrafiÄŸi', weight='bold')
plt.xlabel('Ortalama Tahmin OlasÄ±lÄ±ÄŸÄ±')
plt.ylabel('Pozitiflerin OranÄ±')
plt.legend()
plt.show()

# FIGURE 4: Decision Curve Analysis (DCA)
# COLOR: red (#e74c3c) kept to match Fig. 7 in the report.
def calculate_net_benefit(y_true, y_prob, thresholds):
    """Return the decision-curve net benefit at each probability threshold.

    For a threshold ``t`` every sample with ``y_prob >= t`` counts as a
    positive prediction and the net benefit is::

        TP / n - (FP / n) * (t / (1 - t))

    where ``n`` is the number of samples.

    Args:
        y_true: Binary labels (1 = positive, 0 = negative). Any array-like
            (list, ndarray, pandas Series).
        y_prob: Predicted scores, in the same order and of the same shape as
            ``y_true``. Any array-like.
        thresholds: Iterable of thresholds to evaluate.

    Returns:
        ``np.ndarray`` with one net-benefit value per threshold, in input
        order. A threshold of exactly 1.0 yields 0, because the odds term
        ``t / (1 - t)`` is undefined there.

    Raises:
        ValueError: If ``y_true`` and ``y_prob`` do not have the same shape.
    """
    # Work on plain arrays so samples are always matched by position. With two
    # pandas Series carrying different indices, `&` would align on index
    # labels instead and silently miscount; plain lists would raise TypeError.
    labels = np.asarray(y_true)
    scores = np.asarray(y_prob)
    if labels.shape != scores.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, "
            f"got {labels.shape} and {scores.shape}"
        )

    # Loop-invariant values, computed once instead of once per threshold.
    n = len(labels)
    is_positive = labels == 1
    is_negative = labels == 0

    net_benefits = []
    for thresh in thresholds:
        if thresh == 1.0:
            net_benefits.append(0)
            continue
        flagged = scores >= thresh
        tp = np.sum(is_positive & flagged)
        fp = np.sum(is_negative & flagged)
        net_benefits.append((tp / n) - (fp / n) * (thresh / (1 - thresh)))
    return np.array(net_benefits)

# Net benefit over 100 evenly spaced thresholds in [0.01, 0.99]
thresholds = np.linspace(0.01, 0.99, 100)
nb_model = calculate_net_benefit(y_test, y_prob, thresholds)
# "Treat all" reference: every sample is given a score of 1
nb_all = calculate_net_benefit(y_test, np.ones_like(y_test), thresholds)

plt.figure(figsize=(8, 6))
curve_specs = [
    (nb_model, dict(label='Ã–nerilen Model', color='#e74c3c', lw=3)),
    (nb_all, dict(label='Herkese Tedavi Uygula', color='gray', linestyle='--')),
]
for curve_values, curve_style in curve_specs:
    plt.plot(thresholds, curve_values, **curve_style)
# Horizontal reference line at zero net benefit
plt.axhline(y=0, color='black', linestyle='-')
plt.ylim(-0.05, 0.6)
plt.title('Karar EÄŸrisi Analizi (DCA)', weight='bold')
plt.xlabel('EÅŸik OlasÄ±lÄ±ÄŸÄ±')
plt.ylabel('Net Fayda (Net Benefit)')
plt.legend()
plt.show()

# %% [8] INTERPRETABILITY WITH SHAP
# Analyses which features drive the model's predictions.
print(">>> [7/7] SHAP Analizi OluÅŸturuluyor...")

# Note: explaining the voting ensemble directly is complex, so the explanation
# is produced for one fitted component instead: the XGBoost model, which the
# original author describes as the strongest component (not verified here).
fitted_xgb = ensemble.named_estimators_['xgb']

# Tree explainer for the fitted XGBoost model; SHAP values are computed over
# the whole test set.
explainer = shap.TreeExplainer(fitted_xgb)
shap_values = explainer.shap_values(X_test)

# Feature-importance summary plot (intended to match Figure 8 of the report).
# show=False keeps the figure open so the title can be added before display.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_test, show=False)
plt.title('SHAP Ã–znitelik Ã–nem DÃ¼zeyi (XGB BileÅŸeni)', fontsize=14, weight='bold')
plt.tight_layout()
plt.show()