In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import graphviz
import gc
import itertools
import matplotlib.pyplot as plt
import xgboost as xgb
import time
from sklearn.tree import DecisionTreeClassifier

## SEEDFIX 및 지표함수 정의

In [2]:
def print_results(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print("Accuracy: {:.2f}".format(accuracy))
    print("Recall: {:.2f}".format(recall))
    print("Precision: {:.2f}".format(precision))
    print("F1-Score: {:.2f}".format(f1))
    print("Confusion Matrix:\n", matrix)
    print("\n")

In [3]:
df = pd.read_csv('./drop_data.csv')

In [4]:
seed = 21

In [5]:
x = df.drop(['hospital_expire_flag'], axis=1)
y = df['hospital_expire_flag']

## XGBoost 추론시간 측정 및 총 노드 수 계산

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=seed)

# no oversampling
xgb_clf = xgb.XGBClassifier(random_state=seed)

# 모델 학습
xgb_clf.fit(x_train, y_train)

# 테스트 시작 시간 기록
start_time = time.time()

# 테스트 데이터에 대한 예측 수행
y_pred = xgb_clf.predict(x_test)

# 테스트 종료 시간 기록
end_time = time.time()

# 테스트 시간 계산
testing_time = end_time - start_time
print(f"Testing Time: {testing_time} seconds")

# 결과 출력
print_results(y_test, y_pred)


Testing Time: 0.44086289405822754 seconds
Accuracy: 0.99
Recall: 0.79
Precision: 0.96
F1-Score: 0.87
Confusion Matrix:
 [[319898    373]
 [  2763  10207]]




In [7]:
total_nodes = sum(len(tree.split('\n')) for tree in xgb_clf.get_booster().get_dump())
print(f"Total number of nodes in XGBoost model: {total_nodes}")

Total number of nodes in XGBoost model: 9766


## DT 추론시간 측정 및 총 노드 수 계산

In [8]:
# 데이터 분할 (x, y는 실제 데이터셋으로 대체)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=seed)

# Decision Tree 모델 초기화 및 학습
dt_clf = DecisionTreeClassifier(random_state=seed, max_depth=20)
dt_clf.fit(x_train, y_train)

# 테스트 데이터에 대한 예측 수행 및 시간 측정
start_time = time.time()
y_pred = dt_clf.predict(x_test)
end_time = time.time()

# 총 노드 수 계산 및 출력
total_nodes = dt_clf.tree_.node_count
print(f"Total number of nodes in Decision Tree model: {total_nodes}")


Total number of nodes in Decision Tree model: 3351


In [9]:
y_pred = xgb_clf.predict(x_test)
y_pred_proba = xgb_clf.predict_proba(x_test)[:, 1]  # 클래스 1에 대한 확률

## 시나리오 분석

TOP 5 Features

In [10]:
discrete_vars = ["drg_severity"]
continuous_vars = ["Lactate", "Bicarbonate", "pCO2","White Blood Cells"]

In [11]:
# 확률이 0.5를 넘지 않는 가장 큰 확률을 가진 데이터의 인덱스 및 데이터 찾기
max_proba_under_50_index, max_proba_under_50_value = max(((i, p) for i, p in enumerate(y_pred_proba) if p < 0.5), key=lambda x: x[1])
max_proba_under_50_data = x_test.iloc[max_proba_under_50_index]

# 결과 출력
print(f"Index with the highest probability under 0.5: {max_proba_under_50_index}")
print(f"Highest probability under 0.5: {max_proba_under_50_value}")
print("Data with the highest probability under 0.5:")
print(max_proba_under_50_data)

# discrete_vars와 continuous_vars에 해당하는 변수들의 값을 출력
selected_vars = discrete_vars + continuous_vars
selected_data = max_proba_under_50_data[selected_vars]
print("\nSelected variables from the data with the highest probability under 0.5:")
print(selected_data)

Index with the highest probability under 0.5: 67893
Highest probability under 0.5: 0.4998077154159546
Data with the highest probability under 0.5:
hadm_id             22951112.0
anchor_age                74.0
drg_severity               4.0
% Hemoglobin A1c           0.0
24 hr Creatinine           0.0
                       ...    
admission_los_4            0.0
edr_los_1                  1.0
edr_los_2                  0.0
edr_los_3                  0.0
edr_los_4                  0.0
Name: 892005, Length: 340, dtype: float64

Selected variables from the data with the highest probability under 0.5:
drg_severity          4.000000
Lactate               1.020000
Bicarbonate          26.733333
pCO2                 40.409091
White Blood Cells    15.122222
Name: 892005, dtype: float64


## 

연속형 변수는 Train dataset의 최소 최대를 이용, 이산형 변수는 모든 값의 경우의 수를 조합하여 시나리오 데이터 생성

In [20]:
import numpy as np
import pandas as pd
import itertools

# 인덱스(특정 환자) 선택
i = 67893
data_point = x_test.iloc[i]

# 이산형 변수의 모든 가능한 조합 생성
discrete_combinations = list(itertools.product(*[x_train[var].unique() for var in discrete_vars]))

# 연속형 변수에 대한 새로운 값 생성 
continuous_values = {}
for var in continuous_vars:
    qmin = x_train[var].quantile(0.1)
    qmax = x_train[var].quantile(0.9)
    continuous_values[var] = np.random.uniform(qmin, qmax, 5)

new_dataset = []

for combo in discrete_combinations:
    for values in itertools.product(*continuous_values.values()):
        new_point = data_point.copy()
        for var_index, var_name in enumerate(discrete_vars):
            new_point[var_name] = combo[var_index]
        for var_index, var_name in enumerate(continuous_vars):
            new_point[var_name] = values[var_index]
        new_dataset.append(new_point)

scenario_df = pd.DataFrame(new_dataset)


In [21]:
# scenario_df에 대한 예측 확률 뽑기
y_pred_proba = xgb_clf.predict_proba(scenario_df)

# 예측 확률 출력
print(y_pred_proba)

[[0.26470363 0.73529637]
 [0.22192818 0.7780718 ]
 [0.19990951 0.8000905 ]
 ...
 [0.91410303 0.08589697]
 [0.9589868  0.0410132 ]
 [0.9419863  0.05801365]]


In [22]:
proba_df = pd.DataFrame(y_pred_proba, columns=['Prob_Class_0', 'Prob_Class_1'])
# 인덱스 리셋
scenario_df = scenario_df.reset_index(drop=True)
proba_df = proba_df.reset_index(drop=True)

# 병합
merged_df = pd.concat([scenario_df, proba_df], axis=1)

import pandas as pd

# 확률 데이터를 DataFrame으로 변환
# 클래스별 확률을 나타내는 컬럼 이름 지정
proba_df = pd.DataFrame(y_pred_proba, columns=['Prob_Class_0', 'Prob_Class_1'])

# 원래의 scenario_df와 확률 데이터를 병합
merged_df = pd.concat([scenario_df, proba_df], axis=1)


In [23]:
# Prob_Class_1에 따라 내림차순 정렬
merged_df_sorted = merged_df.sort_values(by='Prob_Class_1', ascending=False)

# 가장 높은 확률과 낮은 확률을 가진 행 추출
highest_proba_row = merged_df_sorted.iloc[0]
lowest_proba_row = merged_df_sorted.iloc[-1]

# discrete_vars와 continuous_vars 정의
discrete_vars = ["drg_severity"]
continuous_vars = ["Lactate", "Bicarbonate", "pCO2", "White Blood Cells"]

# 가장 높은 확률과 낮은 확률을 가진 데이터에 대한 선택된 변수들의 값 출력
print("Highest Probability Row Selected Variables:")
print(highest_proba_row[discrete_vars + continuous_vars])

print("\nLowest Probability Row Selected Variables:")
print(lowest_proba_row[discrete_vars + continuous_vars])



Highest Probability Row Selected Variables:
drg_severity          4.000000
Lactate               1.786472
Bicarbonate          19.460826
pCO2                 42.038008
White Blood Cells    13.594768
Name: 147, dtype: float64

Lowest Probability Row Selected Variables:
drg_severity          1.000000
Lactate               0.559693
Bicarbonate          29.027624
pCO2                 20.516497
White Blood Cells     5.329167
Name: 1903, dtype: float64


사망확률이 가장 높은 시나리오

In [24]:
highest_proba_row

hadm_id             2.295111e+07
anchor_age          7.400000e+01
drg_severity        4.000000e+00
% Hemoglobin A1c    0.000000e+00
24 hr Creatinine    0.000000e+00
                        ...     
edr_los_2           0.000000e+00
edr_los_3           0.000000e+00
edr_los_4           0.000000e+00
Prob_Class_0        6.976664e-02
Prob_Class_1        9.302334e-01
Name: 147, Length: 342, dtype: float64

사망확률이 가장 높은 시나리오

In [19]:
lowest_proba_row

hadm_id             2.295111e+07
anchor_age          7.400000e+01
drg_severity        1.000000e+00
% Hemoglobin A1c    0.000000e+00
24 hr Creatinine    0.000000e+00
                        ...     
edr_los_2           0.000000e+00
edr_los_3           0.000000e+00
edr_los_4           0.000000e+00
Prob_Class_0        9.570877e-01
Prob_Class_1        4.291230e-02
Name: 2012, Length: 342, dtype: float64