In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the heart-disease dataset from the local data directory.
heart = pd.read_csv('./data/heart.csv')

# Split the feature names into categorical and continuous groups.
cat_col = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
# NOTE(review): 'chol' appears in neither list, so it is passed through
# unencoded and unscaled -- confirm this is intentional.
con_col = ['age', 'trtbps', 'thalachh', 'oldpeak']

heart_raw = heart.copy()  # untouched copy of the data as loaded

# One-hot encode the categorical variables with get_dummies()
# (drop_first=True drops the first level of each variable).
heart = pd.get_dummies(heart, columns=cat_col, drop_first=True)

# Separate features and target.
X = heart.drop(['output'], axis=1)
# Cast the continuous columns to float before splitting: the scaled values
# written back below are floats, and assigning floats into integer columns
# through .loc relies on silent upcasting, which recent pandas deprecates.
X = X.astype({col: 'float64' for col in con_col})
# y is deliberately kept as a single-column DataFrame (not a Series).
y = heart[['output']]

# Train / test / validation split: 60% train, then the remaining 40% is
# halved into test and validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Take explicit copies so the in-place column assignments below operate on
# independent frames rather than on objects derived from X.
X_train, X_test, X_val = X_train.copy(), X_test.copy(), X_val.copy()

# Scale the continuous variables; the scaler is fitted on the training
# split only and then applied to all three splits.
scaler = StandardScaler()
scaler.fit(X_train.loc[:, con_col])

X_train.loc[:, con_col] = scaler.transform(X_train.loc[:, con_col])
X_test.loc[:, con_col] = scaler.transform(X_test.loc[:, con_col])
X_val.loc[:, con_col] = scaler.transform(X_val.loc[:, con_col])

In [17]:
# Display the one-hot-encoded frame (it still contains the 'output' target column).
heart

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,output,sex_1,cp_1,cp_2,cp_3,...,exng_1,slp_1,slp_2,caa_1,caa_2,caa_3,caa_4,thall_1,thall_2,thall_3
0,63,145,233,150,2.3,1,True,False,False,True,...,False,False,False,False,False,False,False,True,False,False
1,37,130,250,187,3.5,1,True,False,True,False,...,False,False,False,False,False,False,False,False,True,False
2,41,130,204,172,1.4,1,False,True,False,False,...,False,False,True,False,False,False,False,False,True,False
3,56,120,236,178,0.8,1,True,True,False,False,...,False,False,True,False,False,False,False,False,True,False
4,57,120,354,163,0.6,1,False,False,False,False,...,True,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,0,False,False,False,False,...,True,True,False,False,False,False,False,False,False,True
299,45,110,264,132,1.2,0,True,False,False,True,...,False,True,False,False,False,False,False,False,False,True
300,68,144,193,141,3.4,0,True,False,False,False,...,False,True,False,False,True,False,False,False,False,True
301,57,130,131,115,1.2,0,True,False,False,False,...,True,True,False,True,False,False,False,False,False,True


In [19]:
import lightgbm as lgb

# Baseline model: scikit-learn style LightGBM classifier; only num_leaves and
# objective are set explicitly, everything else is left at library defaults.
lgb_clf = lgb.LGBMClassifier(num_leaves=31, objective='binary')
# y_train is a single-column DataFrame; flatten it to 1-D so fit() does not
# have to convert a column vector (the source of the column_or_1d warnings).
lgb_clf.fit(X_train, np.ravel(y_train))
y_pred = lgb_clf.predict(X_val)
# Accuracy on the validation split (displayed as the cell output).
accuracy_score(y_val, y_pred)

[LightGBM] [Info] Number of positive: 95, number of negative: 86
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 181, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524862 -> initscore=0.099530
[LightGBM] [Info] Start training from score 0.099530


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


0.7377049180327869

num_iterations : 반복 수행하려는 트리의 개수 (너무 크면 오버피팅 발생) <br/>
objective : 수치예측이면 regression, 이진분류면 binary<br/>
learning_rate : 각 부스팅 단계에서 적용하는 학습률 (작을수록 더 많은 num_iterations가 필요)<br/>
max_depth : 트리의 최대 깊이 (0 이하이면 제한 없음, 오버피팅 제어)<br/>

num_leaves : 하나의 트리가 가질 수 있는 최대 리프 개수<br/>
boosting : 부스팅 방법(gbdt : Gradient Boosting Decision Tree / rf : RandomForest)<br/>
bagging_fraction : 데이터 샘플링 비율, 오버피팅 제어<br/>
feature_fraction : 개별 트리 학습 시 무작위로 선택하는 feature 비율<br/>

In [21]:
# Parameters for the native LightGBM training API.
# NOTE(review): no validation set is passed to lgb.train below, so the 'auc'
# metric presumably is never reported during training -- confirm or add one.
params = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': 'auc',
}

# y_train is a single-column DataFrame; pass the label as a flat 1-D array
# instead of an (n, 1) column vector.
train_data = lgb.Dataset(X_train, label=np.ravel(y_train))

# Number of boosting rounds.
num_round = 10

bst = lgb.train(params, train_data, num_boost_round=num_round)

# For the binary objective the recorded output shows values in (0, 1),
# i.e. predicted probabilities rather than class labels.
y_pred_val = bst.predict(X_val)
y_pred_val


[LightGBM] [Info] Number of positive: 95, number of negative: 86
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000547 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 181, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524862 -> initscore=0.099530
[LightGBM] [Info] Start training from score 0.099530




array([0.20890107, 0.45017168, 0.77123713, 0.35603908, 0.70002735,
       0.82582974, 0.20890107, 0.60121618, 0.20753224, 0.301988  ,
       0.70002735, 0.41447514, 0.58466583, 0.71112315, 0.22345288,
       0.301988  , 0.54115463, 0.38349527, 0.50697945, 0.29529759,
       0.57860863, 0.5395771 , 0.49696849, 0.76304332, 0.72702221,
       0.57043289, 0.78208072, 0.76013972, 0.74891211, 0.46161715,
       0.75056153, 0.83365445, 0.46554801, 0.54888557, 0.34590778,
       0.80047785, 0.66180092, 0.4128855 , 0.50503087, 0.32155372,
       0.73640383, 0.54904846, 0.55252206, 0.68819842, 0.25529351,
       0.19819232, 0.77744254, 0.76722272, 0.62191243, 0.77796071,
       0.76361612, 0.27210352, 0.32673485, 0.76304332, 0.42700139,
       0.55315744, 0.53217611, 0.18927561, 0.78079699, 0.76304332,
       0.62191243])

In [23]:
# Convert predicted probabilities to class labels: 1 where p >= 0.5, else 0.
y_pred_val = pd.Series(y_pred_val).ge(0.5).astype('int64')

# Evaluate accuracy on the validation split.
accuracy_score(y_val, y_pred_val)


0.7868852459016393