# 데이터셋 출처
   + https://www.kaggle.com/uciml/pima-indians-diabetes-database

## 데이터 구성

   + Pregnancies : 임신 횟수
   + Glucose : 2시간 동안의 경구 포도당 내성 검사에서 혈장 포도당 농도
   + BloodPressure : 이완기 혈압 (mm Hg)
   + SkinThickness : 삼두근 피부 주름 두께 (mm), 체지방을 추정하는데 사용되는 값
   + Insulin : 2시간 혈청 인슐린 (mu U / ml)
   + BMI : 체질량 지수 (체중kg / 키(m)^2)
   + DiabetesPedigreeFunction : 당뇨병 가족력 함수 (가족력을 바탕으로 당뇨병 발병 가능성을 점수화한 값)
   + Age : 나이
   + Outcome : 당뇨병 여부를 나타내는 클래스 변수(0 또는 1). 768개 중 268개가 1이고 나머지 500개는 0입니다.


# 필요한 라이브러리 로드

In [23]:
# 데이터 분석을 위한 pandas, 수치계산을 위한 numpy
# 시각화를 위한 seaborn, matplotlib.pyplot 을 로드합니다.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 데이터셋 로드

In [24]:
# Load the feature table from "diabetes_feature.csv" into a DataFrame.
df = pd.read_csv("diabetes_feature.csv")
# Display (rows, columns) as the cell output to confirm the load.
df.shape

(768, 16)

# 학습,예측 데이터 만들기

In [25]:
# List the available column names before choosing the model inputs.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_Insulin'],
      dtype='object')

In [26]:
# Feature matrix: the subset of columns fed to the model.
# The target column "Outcome" is deliberately left out.
X = df[
    [
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
        "Pregnancies_high",
        "Insulin_nan",
        "low_glu_Insulin",
    ]
]
# Display (rows, columns) of the feature matrix.
X.shape

(768, 9)

In [27]:
# Target vector: the "Outcome" column (0 or 1).
y = df["Outcome"]
# Display the number of labels.
y.shape

(768,)

In [28]:
# Build the train/test split with scikit-learn's model_selection.train_test_split.

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Check the number of rows in the training features and training labels.

X_train.shape, y_train.shape

((614, 9), (614,))

In [30]:
# Check the number of rows in the test features and test labels.

X_test.shape, y_test.shape

((154, 9), (154,))

# 학습과 예측하기

In [62]:
# Import the decision tree classifier.

from sklearn.tree import DecisionTreeClassifier

# Create an unfitted tree limited to depth 5, with a fixed seed.
model = DecisionTreeClassifier(max_depth=5, random_state=42)
# Display the estimator as the cell output.
model

DecisionTreeClassifier(max_depth=5, random_state=42)

## 최적의 max_depth 값 찾기

In [75]:
# accuracy_score is imported here because this cell uses it before the later
# cell that imports it; without this import, running the file from top to
# bottom raises NameError.
from sklearn.metrics import accuracy_score

# Try each tree depth from 3 to 11 and print the test accuracy (in percent).
# After the loop, `model` and `y_predict` hold the results of the last depth.
for max_depth in range(3, 12):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score = accuracy_score(y_test, y_predict) * 100
    print(max_depth, score)

3 85.06493506493507
4 87.66233766233766
5 85.71428571428571
6 81.81818181818183
7 81.81818181818183
8 81.81818181818183
9 83.76623376623377
10 79.22077922077922
11 81.81818181818183


# Grid Search

   + https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
   + https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [79]:
from sklearn.model_selection import GridSearchCV

# Base estimator; the grid below supplies max_depth and max_features.
model = DecisionTreeClassifier(random_state=42)

# 9 depths x 5 feature settings = 45 candidates.
# NOTE(review): in scikit-learn an integer max_features of 1 means "one feature
# per split", whereas the float 1.0 means "all features" — confirm which was
# intended for the last entry.
param_grid = {
    "max_depth": range(3, 12),
    "max_features": [0.3, 0.5, 0.7, 0.9, 1],
}

# 5-fold cross-validated search using all available cores, with progress output.
clf = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:    4.8s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(3, 12),
                         'max_features': [0.3, 0.5, 0.7, 0.9, 1]},
             verbose=1)

In [80]:
# Show the parameter combination selected by the grid search.
clf.best_params_

{'max_depth': 5, 'max_features': 0.7}

In [81]:
# Show the estimator built with the selected parameters.
clf.best_estimator_

DecisionTreeClassifier(max_depth=5, max_features=0.7, random_state=42)

In [82]:
# Show the mean cross-validation score of the selected candidate.
clf.best_score_

0.8664934026389444

In [85]:
# Tabulate the per-candidate cross-validation results, best rank first.
pd.DataFrame(clf.cv_results_).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,0.005784,0.000977,0.003791,0.002308776,5,0.7,"{'max_depth': 5, 'max_features': 0.7}",0.878049,0.910569,0.813008,0.837398,0.893443,0.866493,0.036082,1
7,0.004987,0.00063,0.002793,0.000399185,4,0.7,"{'max_depth': 4, 'max_features': 0.7}",0.813008,0.886179,0.829268,0.861789,0.918033,0.861655,0.037935,2
8,0.008179,0.002779,0.003789,0.001829661,4,0.9,"{'max_depth': 4, 'max_features': 0.9}",0.821138,0.886179,0.853659,0.853659,0.893443,0.861615,0.026005,3
18,0.006185,0.002396,0.001992,0.0006260873,6,0.9,"{'max_depth': 6, 'max_features': 0.9}",0.829268,0.894309,0.821138,0.878049,0.877049,0.859963,0.029149,4
27,0.004588,0.000799,0.002793,0.002129808,8,0.7,"{'max_depth': 8, 'max_features': 0.7}",0.861789,0.878049,0.837398,0.853659,0.860656,0.85831,0.013162,5
17,0.004987,0.00126,0.002594,0.0008010788,6,0.7,"{'max_depth': 6, 'max_features': 0.7}",0.861789,0.869919,0.804878,0.894309,0.860656,0.85831,0.029337,5
31,0.004188,0.000746,0.002394,0.0007974628,9,0.5,"{'max_depth': 9, 'max_features': 0.5}",0.853659,0.902439,0.780488,0.861789,0.885246,0.856724,0.041834,7
11,0.004188,0.000747,0.002593,0.001739665,5,0.5,"{'max_depth': 5, 'max_features': 0.5}",0.837398,0.878049,0.821138,0.878049,0.868852,0.856697,0.023222,8
1,0.006485,0.001483,0.003195,0.0004050973,3,0.5,"{'max_depth': 3, 'max_features': 0.5}",0.780488,0.910569,0.813008,0.853659,0.918033,0.855151,0.053625,9
6,0.005986,0.001669,0.002792,0.0003986307,4,0.5,"{'max_depth': 4, 'max_features': 0.5}",0.788618,0.886179,0.804878,0.878049,0.918033,0.855151,0.04979,9


In [86]:
# Predict labels for the test split with the fitted grid search.
clf.predict(X_test)

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
      dtype=int64)

In [88]:
# Evaluate the fitted grid search on the test split.
clf.score(X_test, y_test)

0.8701298701298701

# 정확도(Accuracy)측정하기

In [67]:
# Count the test samples that were predicted differently from the true label
# and assign the count to diff_count. Labels are 0 or 1, so each mismatch
# contributes exactly 1 to the sum of absolute differences.

diff_count = abs(y_predict - y_test).sum()
# Display the count as the cell output.
diff_count

22

In [68]:
# Compute the accuracy score, expressed as a percentage.

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict) * 100

85.71428571428571

# 머신러닝 알고리즘 가져오기

In [52]:
from sklearn.tree import DecisionTreeClassifier

# The original call had an empty first argument (a SyntaxError). max_depth=3
# restores the value shown in this cell's recorded output.
model = DecisionTreeClassifier(max_depth=3, random_state=42)
# Display the estimator as the cell output.
model

DecisionTreeClassifier(max_depth=3, random_state=42)

# 학습(훈련)
   + 시험을 볼 때 기출문제(X_train)와 정답(y_train)을 보고 공부하는 과정과 유사합니다.

In [None]:
# Fit the tree on the training features and labels.
model.fit(X_train, y_train)

# 예측
   + 실전 시험문제(X_test)라고 보면 됩니다. 우리가 정답을 직접 예측합니다.

In [None]:
# Predict labels for the test features.
y_predict = model.predict(X_test)
# Preview the first five predictions.
y_predict[:5]

# 트리 알고리즘 분석하기
   + 의사결정나무를 시각화합니다.

In [None]:
from sklearn.tree import plot_tree

# feature_names is not defined in any preceding cell, so take it from the
# training columns; this keeps the names in the same order the model was
# fitted with. The visualisation cells that follow reuse this variable.
feature_names = X_train.columns.tolist()

# Draw the fitted decision tree on a large canvas so node labels stay legible.
plt.figure(figsize=(20, 20))
tree = plot_tree(model,
                 feature_names=feature_names,
                 filled=True,
                 fontsize=10)

In [None]:
# Visualize the tree with graphviz.
# graphviz requires a separate installation.
# Two things must be installed: graphviz itself and the tool that lets Python use graphviz.
import graphviz
from sklearn.tree import export_graphviz

# Export the fitted tree as DOT source text, with filled nodes.
dot_tree = export_graphviz(model,
                            feature_names = feature_names,
                            filled=True)
# Render the DOT source as the cell output.
graphviz.Source(dot_tree)

In [None]:
# Extract the feature importances from the fitted model.

model.feature_importances_

In [None]:
# Visualize the feature importances as a horizontal bar chart
# (importance on the x axis, feature name on the y axis).

sns.barplot(x=model.feature_importances_, y = feature_names)

In [None]:
# Subtracting the predicted value from the actual value gives 0 where they match.
# Entries whose absolute difference is 1 are the ones predicted incorrectly.
# Counts noted by the author across earlier experiments:
# 44 => 39 (Pregnancies_high) => 49 (age split at 25) => 55 (age split at 30)
# => 23 (missing Insulin values replaced with the mean) => 16 (replaced with the median)
# => 15 (added derived variable low_glu_insulin)
# => 15 (removed Insulin values of 600 or more)
# => 15 (scaled Glucose, DiabetesPedigreeFunction)
diff_count = abs(y_test - y_predict).sum() 
diff_count

In [None]:
# Compute the prediction accuracy by hand: the share of test samples
# predicted correctly, as a score out of 100.

(len(y_test) - diff_count) / len(y_test) * 100

In [None]:
# The accuracy can be computed by hand as above, but here the ready-made
# implementation is imported and used instead.

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict) * 100

In [None]:
# Compute the score with the model's own score method.

model.score(X_test,y_test)