<a href="https://colab.research.google.com/github/byeori-jang/Deep-learning/blob/main/2025-09-30/DL_diabates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===== 0) 설치/임포트 =====
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, roc_curve

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# ===== 1) Load data =====
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/diabetes.csv")

# Auto-detect the target: use 'Outcome' when present, otherwise the last column.
target_col = 'Outcome' if 'Outcome' in df.columns else df.columns[-1]
X = df.drop(columns=[target_col]).copy()
y = df[target_col].astype(int).copy()

# (Optional) Pima dataset convention: a 0 in these columns is treated as a
# missing value (Glucose, BloodPressure, SkinThickness, Insulin, BMI).
# Here the zeros are only marked as NaN; the median imputation happens after
# the split so that the medians come from the training rows alone.
zero_invalid_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
present_zero_cols = [c for c in zero_invalid_cols if c in X.columns]
for c in present_zero_cols:
    # mask() builds a new column instead of writing NaN into an integer
    # column in place, which avoids the incompatible-dtype assignment.
    X[c] = X[c].mask(X[c] == 0)

# ===== 2) Train/Val/Test split =====
# First split into train (80%) and temp (20%), then split temp in half into
# validation and test. The split depends only on y and random_state, so the
# row membership of each subset is independent of the imputation step.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)
# Ratios: train 80%, val 10%, test 10%

# Median imputation fitted on the training split only (no information from
# validation/test rows reaches the training features).
if present_zero_cols:
    train_medians = X_train[present_zero_cols].median()
    X_train = X_train.fillna(train_medians)
    X_temp = X_temp.fillna(train_medians)
    X_val = X_val.fillna(train_medians)
    X_test = X_test.fillna(train_medians)
    # Keep the full feature table NaN-free as well, using the same
    # training-derived medians.
    X = X.fillna(train_medians)

# ===== 3) Scaling (fit on the training split only) =====
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_val_sc   = scaler.transform(X_val)
X_test_sc  = scaler.transform(X_test)

# ===== 4) Class imbalance handling: class_weight =====
# (Helps when the 0/1 label proportions differ noticeably.)
class_counts = np.bincount(y_train)
neg, pos = class_counts          # counts of label 0 and label 1
total = class_counts.sum()
# Inverse-frequency weights, total / (2 * count), passed to Keras as a dict
# keyed by class label.
class_weight = dict(zip((0, 1), total / (2.0 * class_counts)))
print("class_weight:", class_weight)

# ===== 5) Model definition (small MLP) =====
n_features = X_train_sc.shape[1]


def build_model():
    """Build and compile the binary-classification MLP.

    The network reads ``n_features`` inputs (module-level value taken from the
    scaled training matrix) and stacks two hidden stages, each made of
    Dense(relu) -> BatchNormalization -> Dropout, followed by a single
    sigmoid output unit.

    Returns:
        A compiled ``keras.Model`` using Adam (learning rate 1e-3), binary
        cross-entropy loss, and the metrics ``acc``, ``auc`` (ROC) and
        ``auprc`` (PR).
    """
    inputs = keras.Input(shape=(n_features,))

    # (units, dropout rate) for each hidden stage, in order.
    hidden = inputs
    for units, drop_rate in ((64, 0.3), (32, 0.2)):
        hidden = layers.Dense(units, activation="relu")(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(drop_rate)(hidden)

    # Single sigmoid unit for binary classification.
    outputs = layers.Dense(1, activation="sigmoid")(hidden)
    model = keras.Model(inputs, outputs)

    optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    tracked_metrics = [
        keras.metrics.BinaryAccuracy(name="acc"),
        keras.metrics.AUC(curve="ROC", name="auc"),
        keras.metrics.AUC(curve="PR", name="auprc"),
    ]
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return model


model = build_model()
model.summary()

# ===== 6) Callbacks (early stopping, checkpoint, learning-rate reduction) =====
# All three callbacks watch the validation ROC AUC and treat higher as better.
monitor_cfg = {"monitor": "val_auc", "mode": "max"}

# Stop after 20 epochs without improvement and restore the best weights.
es = callbacks.EarlyStopping(
    patience=20, restore_best_weights=True, **monitor_cfg
)
# Save only the best model seen so far.
ckpt = callbacks.ModelCheckpoint(
    "/content/best_diabetes_mlp.keras", save_best_only=True, **monitor_cfg
)
# Halve the learning rate after 5 epochs without improvement, down to 1e-5.
rlr = callbacks.ReduceLROnPlateau(
    factor=0.5, patience=5, min_lr=1e-5, verbose=1, **monitor_cfg
)

# ===== 7) Training =====
fit_options = {
    "validation_data": (X_val_sc, y_val),
    "epochs": 200,
    "batch_size": 32,
    "class_weight": class_weight,   # steadier training when classes are imbalanced
    "callbacks": [es, ckpt, rlr],
    "verbose": 0,
}
history = model.fit(X_train_sc, y_train, **fit_options)

# ===== 8) Threshold tuning (pick the best threshold from the validation ROC) =====
val_proba = model.predict(X_val_sc).ravel()
fpr, tpr, thr = roc_curve(y_val, val_proba)
# Choose the threshold that maximises Youden's J = TPR - FPR.
# roc_curve prepends a sentinel threshold (thr[0]) that lies above every
# score and corresponds to "predict nothing as positive" (J = 0). Skip it so
# that best_thr is always a threshold that can actually yield positives;
# whenever the best J is positive this selects the same index as a plain
# argmax over the whole curve.
youden_j = tpr - fpr
best_idx = np.argmax(youden_j[1:]) + 1
best_thr = thr[best_idx]
print(f"Best threshold from validation: {best_thr:.4f}")

# ===== 9) Test-set evaluation =====
test_proba = model.predict(X_test_sc).ravel()
# Label a sample positive when its probability reaches the tuned threshold.
above_threshold = test_proba >= best_thr
test_pred = above_threshold.astype(int)

# Threshold-independent ranking metric (uses the raw probabilities).
auc = roc_auc_score(y_test, test_proba)
# Threshold-dependent metrics (use the hard predictions).
acc = accuracy_score(y_test, test_pred)
cm = confusion_matrix(y_test, test_pred)
rep = classification_report(y_test, test_pred, digits=4)

print("\n=== Test Metrics (threshold tuned) ===")
print(f"Accuracy : {acc:.4f}")
print(f"ROC AUC  : {auc:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", rep)


class_weight: {0: np.float64(0.7675), 1: np.float64(1.4345794392523366)}



Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 34: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 39: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 44: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Best threshold from validation: 0.4202
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

=== Test Metrics (threshold tuned) ===
Accuracy : 0.7273
ROC AUC  : 0.8370
Confusion Matrix:
 [[35 15]
 [ 6 21]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8537    0.7000    0.7692        50
           1     0.5833    0.7778    0.6667        27

    accuracy                         0.7273        77
   macro avg     0.7185    0.7389    0.7179        77
weighted avg     