In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support


In [90]:
# 1. 데이터 불러오기
data = pd.read_csv('azdigar nav sartirovka+0 delete.csv')

In [91]:
columns = [
    ('w08chronic_a', '고혈압'), 
    ('w08chronic_b', '당뇨병'),
    ('w08chronic_c', '암 및 악성종양'),
    ('w08chronic_d', '만성 폐질환'),
    ('w08chronic_e', '간질환'),
    ('w08chronic_f', '심장질환'),
    ('w08chronic_g', '뇌혈관질환'),
    ('w08chronic_h', '정신과적 질환'),
    ('w08chronic_i', '관절염 또는 류마티스'),
    ('w08chronic_k', '소화기계 질환'),
    ('w08chronic_l', '디스크 진단')
    ('w08chronic_m', '지매')

]


In [92]:
# Target 컬럼명만 추출
target_columns = [col[0] for col in columns]

In [93]:
# 3. 데이터 클리닝 - 'w08chronic_m'에서 값이 3인 행 제거
cleaned_data = data[data['w08chronic_m'] != 3]


In [94]:
# 4. 인코딩 처리 (1 -> 0 (no), 5 -> 1 (yes))
cleaned_data[target_columns] = cleaned_data[target_columns].replace({1: 0, 5: 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data[target_columns] = cleaned_data[target_columns].replace({1: 0, 5: 1})


In [95]:
# 5. 피처와 타겟 정의
X = cleaned_data.drop(columns=target_columns)
y_all = cleaned_data[target_columns]

In [96]:
# 6. 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [97]:
# 7. KFold 교차 검증 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [98]:
# 8. 모델 정의
random_forest = MultiOutputClassifier(RandomForestClassifier(random_state=42))
decision_tree = MultiOutputClassifier(DecisionTreeClassifier(random_state=42))


In [99]:
# 9. 모델별 예측값 얻기
rf_pred = cross_val_predict(random_forest, X_scaled, y_all, cv=kf)
dt_pred = cross_val_predict(decision_tree, X_scaled, y_all, cv=kf)

In [100]:
# 10. 전체 정확도 계산
rf_accuracy = accuracy_score(y_all, rf_pred)
dt_accuracy = accuracy_score(y_all, dt_pred)

# 11. Precision, Recall, F1-Score 계산 (Random Forest & Decision Tree)
def calculate_metrics(y_true, y_pred, model_name):
    results = []
    for i, (col, desc) in enumerate(columns):
        precision, recall, f1, _ = precision_recall_fscore_support(y_true[col], y_pred[:, i], average='binary')
        accuracy = accuracy_score(y_true[col], y_pred[:, i])  # 정확도 계산
        results.append({
            'Target': col,
            'Description': desc,
            'Model': model_name,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
             'Accuracy': accuracy  # 정확도 추가
        })
    return results

rf_results = calculate_metrics(y_all, rf_pred, 'RandomForest')
dt_results = calculate_metrics(y_all, dt_pred, 'DecisionTree')

In [101]:
# 12. Micro F1, Macro F1, Weighted F1 계산
rf_micro_f1 = f1_score(y_all, rf_pred, average='micro')
rf_macro_f1 = f1_score(y_all, rf_pred, average='macro')
rf_weighted_f1 = f1_score(y_all, rf_pred, average='weighted')

dt_micro_f1 = f1_score(y_all, dt_pred, average='micro')
dt_macro_f1 = f1_score(y_all, dt_pred, average='macro')
dt_weighted_f1 = f1_score(y_all, dt_pred, average='weighted')

In [102]:
# 13. 결과 DataFrame으로 변환 및 출력
rf_result_table = pd.DataFrame(rf_results)
dt_result_table = pd.DataFrame(dt_results)

rf_result_table = rf_result_table.style.set_properties(**{'text-align': 'left'})
rf_result_table = rf_result_table.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

dt_result_table = dt_result_table.style.set_properties(**{'text-align': 'left'})
dt_result_table = dt_result_table.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])


print("RandomForest Results")
display(rf_result_table)
print("DecisionTree Results")
display(dt_result_table)

RandomForest Results


Unnamed: 0,Target,Description,Model,Precision,Recall,F1-Score,Accuracy
0,w08chronic_a,고혈압,RandomForest,0.78733,0.663841,0.720331,0.731508
1,w08chronic_b,당뇨병,RandomForest,0.781843,0.990393,0.873847,0.776551
2,w08chronic_c,암 및 악성종양,RandomForest,0.921837,1.0,0.959329,0.921837
3,w08chronic_d,만성 폐질환,RandomForest,0.969088,1.0,0.984301,0.969088
4,w08chronic_e,간질환,RandomForest,0.970192,1.0,0.984871,0.970192
5,w08chronic_f,심장질환,RandomForest,0.901744,1.0,0.948334,0.901744
6,w08chronic_g,뇌혈관질환,RandomForest,0.94496,0.999299,0.97137,0.944359
7,w08chronic_h,정신과적 질환,RandomForest,0.954295,1.0,0.976613,0.954295
8,w08chronic_i,관절염 또는 류마티스,RandomForest,0.804517,0.926181,0.861073,0.779201
9,w08chronic_k,소화기계 질환,RandomForest,0.981232,1.0,0.990527,0.981232


DecisionTree Results


Unnamed: 0,Target,Description,Model,Precision,Recall,F1-Score,Accuracy
0,w08chronic_a,고혈압,DecisionTree,0.676433,0.675286,0.675859,0.662619
1,w08chronic_b,당뇨병,DecisionTree,0.804498,0.788358,0.796347,0.684919
2,w08chronic_c,암 및 악성종양,DecisionTree,0.92598,0.916886,0.921411,0.855818
3,w08chronic_d,만성 폐질환,DecisionTree,0.969662,0.961267,0.965446,0.933319
4,w08chronic_e,간질환,DecisionTree,0.970948,0.958352,0.964609,0.931773
5,w08chronic_f,심장질환,DecisionTree,0.905871,0.895446,0.900628,0.821815
6,w08chronic_g,뇌혈관질환,DecisionTree,0.949528,0.941094,0.945292,0.897108
7,w08chronic_h,정신과적 질환,DecisionTree,0.959126,0.944702,0.951859,0.90881
8,w08chronic_i,관절염 또는 류마티스,DecisionTree,0.800961,0.79737,0.799161,0.703908
9,w08chronic_k,소화기계 질환,DecisionTree,0.981876,0.975248,0.97855,0.958048


In [103]:
# 14. 전체 결과 출력
print(f"RandomForest Overall Accuracy: {rf_accuracy}")
print(f"RandomForest Micro F1: {rf_micro_f1}, Macro F1: {rf_macro_f1}, Weighted F1: {rf_weighted_f1}")
print(f"DecisionTree Overall Accuracy: {dt_accuracy}")
print(f"DecisionTree Micro F1: {dt_micro_f1}, Macro F1: {dt_macro_f1}, Weighted F1: {dt_weighted_f1}")

RandomForest Overall Accuracy: 0.3634356370059616
RandomForest Micro F1: 0.945454948023337, Macro F1: 0.9328702355667225, Weighted F1: 0.945337357893712
DecisionTree Overall Accuracy: 0.22676087436520204
DecisionTree Micro F1: 0.9125848036458393, Macro F1: 0.8980870089328307, Weighted F1: 0.9126091796761371
