# 피마인디언 당뇨병 예측하기

In [100]:
import numpy as np
import pandas as pd

### 1. 데이터 전처리

In [101]:
# Load the raw CSV. header=None means no row is used as column names, so
# pandas labels the columns with integers (0..8 in the output below).
# skiprows=9 drops the first 9 lines of the file -- presumably a descriptive
# preamble; verify against the actual file.
df = pd.read_csv('pima-indians-diabetes.csv', sep=',', skiprows=9, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [102]:
# Work on an independent copy: a plain `pdf = df` only creates a second name
# for the same object, so renaming columns on `pdf` would rename them on the
# raw frame `df` as well.
pdf = df.copy()
# Short labels for the 8 feature columns plus the target column 'Class'.
# NOTE(review): the abbreviations presumably follow the dataset's documented
# field order (pregnancies, glucose, blood pressure, ...) -- confirm.
pdf.columns = ['P','G','BP','ST','I','BMI','DPF','A','Class']
pdf.head()

Unnamed: 0,P,G,BP,ST,I,BMI,DPF,A,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [103]:
# Features: every column except the last one; target: the 'Class' column.
# At this point X is a DataFrame and y is a Series (pandas objects).
X = pdf.iloc[:,:-1]
y = pdf['Class']
X.shape, y.shape

((768, 8), (768,))

In [104]:
type(y)     # Series: the original index labels are kept (later positional access must use iloc)

pandas.core.series.Series

In [105]:
# Same selection as before, but .values converts to plain NumPy arrays,
# which carry no pandas index or column labels.
X = pdf.iloc[:,:-1].values
y = pdf['Class'].values
X.shape, y.shape

((768, 8), (768,))

In [106]:
type(y)     # ndarray: the original index is gone; elements are addressed by fresh 0-based positions

numpy.ndarray

In [107]:
# Distinct class labels and how many samples carry each one (class balance check).
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([500, 268], dtype=int64))

### 2. Train/Test dataset 분리

In [108]:
# Hold out 20% of the samples as a test set. stratify=y keeps the class
# proportions equal in both splits; the fixed random_state makes the split
# reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2021
)

In [85]:
# X, y변수를 만들기 생략
# Alternative that skips building separate X, y variables and splits straight
# from the DataFrame. Stratification uses the label column itself, so this
# cell does not depend on a `y` left over from an earlier cell.
# NOTE: the results here are a DataFrame / Series that keep their original row
# labels, so any later positional lookup on them needs .iloc.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    pdf.iloc[:,:-1], pdf['Class'], stratify=pdf['Class'], test_size=0.2, random_state=2021
)

In [109]:
# Class counts in the training split; they should mirror the full-data ratio
# because the split was stratified.
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

### 3. Model 생성 및 학습

In [110]:
# Decision tree with default hyperparameters; random_state is fixed so that
# repeated runs build the same tree.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2021)

In [111]:
# Train the tree on the training split only.
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

### 4. 예측 및 평가

In [112]:
# Predicted class labels for the held-out test features.
pred = dtc.predict(X_test)

In [113]:
# Accuracy: compares the true test labels with the predictions.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7077922077922078

In [114]:
# Shortcut for predict + accuracy_score; yields the same value as the cell above.
dtc.score(X_test, y_test)

0.7077922077922078

### 5. 최적의 하이퍼파라미터 도출 및 교차 검증

max_depth - 트리의 최대 깊이. 이진 분할을 계속하다 보면 이상치까지 하나씩 분리할 수 있어서 학습 데이터에 대해서는 정확해지지만 과적합이 된다. 따라서 max_depth로 지정한 깊이에서 분할을 멈춘다.

In [115]:
# Coarse search grid: candidate values tried for each tree hyperparameter.
params = dict(
    max_depth=[2, 4, 6],
    min_samples_split=[2, 4, 6],
)

In [116]:
# Evaluate every combination in `params` with 3-fold cross-validation on the
# training split, comparing candidates by accuracy.
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=3)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [117]:
# Parameter combination selected by the coarse search.
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [118]:
# Finer search grid around the low end of the previous one.
params = dict(
    max_depth=[2, 3, 4],
    min_samples_split=[2, 3, 4],
)

In [119]:
# Repeat the 3-fold grid search with the current (narrower) `params`.
grid_dt = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=3)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [120]:
# Parameter combination selected by the finer search.
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [121]:
# Take the estimator chosen by the grid search and check its accuracy on the
# held-out test split (data the search never saw).
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.7142857142857143

### 실제 값 하나가 주어졌을 때 당뇨병 여부를 확인하는 법

In [122]:
# True label of one test sample.
# NOTE(review): assumes y_test is the NumPy array from the array-based split,
# so 33 is a position, not a pandas index label.
y_test[33]

0

In [124]:
# Feature vector of the same test sample (1-D when X_test is a NumPy array).
test_data = X_test[33]

In [127]:
# predict() expects a 2-D array of shape (n_samples, n_features), so the single
# 1-D sample has to be reshaped first (a frequent source of errors). Passing -1
# lets NumPy infer the feature count instead of hard-coding 8.
result = best_dt.predict(test_data.reshape(1, -1))[0]
# Label 0 is reported as negative, any other label as positive.
print('음성' if result == 0 else '양성')

음성
