In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
import pickle
import os
import pymysql
from sqlalchemy import create_engine
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# ✅ DB 접속 설정
DB_CONFIG = {
    "user": "root",
    "password": "1234",
    "host": "localhost",
    "database": "finfit"
}
engine = create_engine(f"mysql+pymysql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}/{DB_CONFIG['database']}")

# ✅ MySQL에서 데이터 로드
query = "SELECT * FROM medica"
df = pd.read_sql(query, engine.connect())

# 기본 + 추가 feature 목록
basic_features = [
    'sex', 'age', 'height', 'weight', 'bmi', 'alchol', 'smoking_history',
    'chol_ldl', 'glycated_hemoglobin', 'sbp_average'
]
optional_features = [
    'sleep', 'phq_total', 'dyslipidemia_status', 'chol_total', 'chol_hdl', 'chol_tg',
    'diabetes', 'fasting_blood_sugar', 'high_blood_pressure', 'dbp_average',
    'cancer_diagnosis_fathers', 'cancer_diagnosis_mother', 'cancer_diagnosis_sibling',
    'white_blood_cell_count', 'red_blood_cell_count', 'alchol_1year', 'stress'
]

# 🔹 PHQ-9 문항 추가
phq_features = [f'phq_{i}' for i in range(1, 10)]
available_phq_features = [col for col in phq_features if col in df.columns]

# phq_total 생성 및 타겟 컬럼 설정
if all(col in df.columns for col in phq_features):
    df['phq_total'] = df[phq_features].sum(axis=1)
    bins = [0, 4, 9, 19, 27]
    labels = ['우울증 없음', '가벼운 우울증', '중간 정도 우울증', '심한 우울증']
    df['depression_level'] = pd.cut(df['phq_total'], bins=bins, labels=labels, include_lowest=True)
    df = df.dropna(subset=['depression_level'])
else:
    raise ValueError("PHQ-9 관련 컬럼이 데이터에 존재하지 않습니다.")

target = 'depression_level'
optional_features.remove('phq_total')

available_features = [col for col in (basic_features + optional_features) if col in df.columns]
available_features += available_phq_features

# 결측값 처리
df.fillna(df.mean(numeric_only=True), inplace=True)
mode_df = df.mode()
if not mode_df.empty:
    df.fillna(mode_df.iloc[0], inplace=True)

# 스트레스 점수 방향 맞추기
if 'stress' in df.columns:
    df['stress'] = df['stress'].apply(lambda x: x if pd.isna(x) else int(x))

# 수면 위험도 계산 함수
def calculate_sleep_risk(sleep):
    if sleep == 0:
        return 0.0
    if sleep < 5:
        return 3.74
    elif sleep > 9:
        return 2.53
    else:
        return 1.0

if 'sleep' in df.columns:
    df['sleep_risk'] = df['sleep'].apply(calculate_sleep_risk)
    if 'sleep' in available_features:
        available_features.remove('sleep')
    available_features.append('sleep_risk')

# 'stress' 값이 NaN인 경우 처리
if 'stress' in df.columns:
    df['stress'] = df['stress'].apply(lambda x: x if pd.notna(x) else 0)

# 성별 인코딩
if df['sex'].dtype == 'object':
    df['sex'] = LabelEncoder().fit_transform(df['sex'])

X = df[available_features]
y = df[target]

# 타겟 인코딩
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# SMOTE로 클래스 불균형 처리
if len(np.unique(y_encoded)) > 1:
    smote = SMOTE(random_state=42, k_neighbors=1)
    X_smote, y_smote = smote.fit_resample(X, y_encoded)
else:
    print("⚠️ 클래스가 하나뿐이므로 SMOTE 생략")
    X_smote, y_smote = X, y_encoded

# 분할
if X_smote.shape[0] == 0:
    raise ValueError("❌ 사용할 수 있는 학습 데이터가 없습니다. medica 테이블을 확인하세요.")

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42, stratify=y_smote)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 모델 학습 (XGBoost)
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)
model.fit(X_train_scaled, y_train)

# 평가 지표 출력
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"✅ Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
print("\n[분류 리포트]")
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

# 저장
model_data = {
    "model": model,
    "scaler": scaler,
    "features": available_features,
    "encoder": encoder,
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1_score": f1
}

os.makedirs("models", exist_ok=True)
with open("models/depression_model.pkl", "wb") as f:
    pickle.dump(model_data, f)

print("✅ 모델이 models/depression_model.pkl에 저장되었습니다.")

# ✅ 샘플 예측 테스트 (누락 feature 포함 수정)
sample_templates = {
    "우울증 없음": {
        "sex": 1, "age": 30, "height": 170, "weight": 68, "bmi": 23.5, "alchol": 0, "smoking_history": 0,
        "chol_ldl": 90, "glycated_hemoglobin": 5.4, "sbp_average": 115, "sleep_risk": 1.0, "stress": 3,
        "phq_1": 0, "phq_2": 0, "phq_3": 0, "phq_4": 0, "phq_5": 0, "phq_6": 0, "phq_7": 0, "phq_8": 0, "phq_9": 0
    },
    "가벼운 우울증": {
        "sex": 0, "age": 25, "height": 160, "weight": 52, "bmi": 20.3, "alchol": 1, "smoking_history": 0,
        "chol_ldl": 110, "glycated_hemoglobin": 5.8, "sbp_average": 120, "sleep_risk": 1.0, "stress": 2,
        "phq_1": 1, "phq_2": 1, "phq_3": 0, "phq_4": 1, "phq_5": 0, "phq_6": 0, "phq_7": 0, "phq_8": 1, "phq_9": 0
    },
    "중간 정도 우울증": {
        "sex": 1, "age": 40, "height": 175, "weight": 85, "bmi": 27.8, "alchol": 2, "smoking_history": 1,
        "chol_ldl": 135, "glycated_hemoglobin": 6.2, "sbp_average": 130, "sleep_risk": 3.74, "stress": 2,
        "phq_1": 2, "phq_2": 2, "phq_3": 1, "phq_4": 1, "phq_5": 2, "phq_6": 1, "phq_7": 1, "phq_8": 2, "phq_9": 1
    },
    "심한 우울증": {
        "sex": 0, "age": 55, "height": 158, "weight": 90, "bmi": 36.0, "alchol": 3, "smoking_history": 2,
        "chol_ldl": 160, "glycated_hemoglobin": 7.5, "sbp_average": 145, "sleep_risk": 2.53, "stress": 1,
        "phq_1": 3, "phq_2": 3, "phq_3": 3, "phq_4": 2, "phq_5": 2, "phq_6": 3, "phq_7": 2, "phq_8": 3, "phq_9": 2
    }
}

# 누락된 feature를 기본값으로 보완
def fill_missing_features(sample, required_features):
    for col in required_features:
        if col not in sample:
            sample[col] = 0
    return sample

print("\n[샘플 예측 결과]")
for label, sample in sample_templates.items():
    sample = fill_missing_features(sample, available_features)
    sample_df = pd.DataFrame([sample])
    sample_df = sample_df[available_features]
    sample_scaled = scaler.transform(sample_df)
    pred = model.predict(sample_scaled)
    pred_label = encoder.inverse_transform(pred)[0]
    print(f"입력: {label} ➤ 예측 결과: {pred_label}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Accuracy: 0.9956, Precision: 0.9956, Recall: 0.9956, F1-score: 0.9956

[분류 리포트]
              precision    recall  f1-score   support

     가벼운 우울증       0.99      1.00      0.99       795
      심한 우울증       1.00      1.00      1.00       794
      우울증 없음       1.00      0.99      0.99       795
   중간 정도 우울증       1.00      1.00      1.00       794

    accuracy                           1.00      3178
   macro avg       1.00      1.00      1.00      3178
weighted avg       1.00      1.00      1.00      3178

✅ 모델이 models/depression_model.pkl에 저장되었습니다.

[샘플 예측 결과]
입력: 우울증 없음 ➤ 예측 결과: 우울증 없음
입력: 가벼운 우울증 ➤ 예측 결과: 우울증 없음
입력: 중간 정도 우울증 ➤ 예측 결과: 중간 정도 우울증
입력: 심한 우울증 ➤ 예측 결과: 심한 우울증
