#### 피마 인디언 당뇨병 예측

In [106]:
import numpy as np
import pandas as pd

In [107]:
df = pd.read_csv('pima-indians-diabetes.csv', skiprows=9, header=None)
df.columns = ['P','G','BP','S','I','BMI','D','Age','Target']
df.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age,Target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [108]:
# X, y데이터를 ndarray로 추출- 대부분의 CSV에 적용 가능(일반적 방법)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

``` 
1. 각 Feature의 최소값이 0, 최대값이 1이 되도록 MinMaxScaler를 사용하여 변환하세요.(10)
```

In [109]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [110]:
df_mm = MinMaxScaler().fit_transform(X)

for i in range(4):
  print(df_mm[:,i].min(), df_mm[:,i].max())

0.0 1.0
0.0 1.0
0.0 1.0
0.0 1.0


```
2. SVC, KNN, Logistic Regression을 소프트 보팅 방식으로 앙상블 학습을 하되,   SVC의 C 파라메터와 LR의 C 파라메터를 최적화해서 분류를 시도하세요.(20)
```

In [111]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_mm, y, stratify=y, test_size=0.1, random_state=2023  #
)

lrc = LogisticRegression(random_state=2023)
svc = SVC(probability=True, random_state=2023).fit(X_train, y_train)
knn = KNeighborsClassifier()

#soft voting
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    estimators=[('LRC', lrc), ('SVC', svc), ('KNN', knn)],
    voting='soft'
)

voc.fit(X_train, y_train)
voc.score(X_test, y_test)

0.7532467532467533

In [112]:
voc.predict_proba(X_test[:3])

array([[0.95286143, 0.04713857],
       [0.32132646, 0.67867354],
       [0.49972367, 0.50027633]])

In [113]:
# 분류 - SVC의 C, LR의 C 파라메터를 최적화
params = {
    'LRC__C': [0.1, 1, 10],
    'SVC__C': [0.1, 1, 10]
}
from sklearn.model_selection import GridSearchCV
grid_voc = GridSearchCV(voc, params, scoring='accuracy', cv=5)
grid_voc.fit(X_train, y_train)
grid_voc.best_params_

{'LRC__C': 1, 'SVC__C': 1}

In [114]:
grid_voc.best_estimator_.score(X_test, y_test)

0.7532467532467533

```
3. 결정트리를 기반으로 하는 앙상블 학습기 3종류를 이용하여, 
정확도 / 정밀도 / 재현율을 구하세요.(20)
```

In [115]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
import warnings
warnings.filterwarnings(('ignore'))

In [116]:
# Bagging (Bootstrap Aggregating)
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2023)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)
pred_rfc = rfc.predict(X_test)

In [117]:
rfc.predict_proba(X_test[:3])

array([[0.97, 0.03],
       [0.22, 0.78],
       [0.45, 0.55]])

In [118]:
accuracy_score(y_test, pred_rfc), precision_score(y_test, pred_rfc), recall_score(y_test, pred_rfc)

(0.7272727272727273, 0.65, 0.48148148148148145)

In [119]:
# Boosting:  XGBoost
from sklearn.preprocessing import StandardScaler
df_std = StandardScaler().fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_std, y, stratify=y, test_size=0.2, random_state=2023
)

import xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
xgb.score(X_test, y_test)

0.7402597402597403

In [120]:
xgb.predict_proba(X_test[:3])

array([[0.98096657, 0.01903344],
       [0.02669376, 0.97330624],
       [0.88944924, 0.11055077]], dtype=float32)

In [121]:
accuracy_score(y_test, pred_xgb), precision_score(y_test, pred_xgb), recall_score(y_test, pred_xgb)

(0.7402597402597403, 0.6458333333333334, 0.5740740740740741)

In [122]:
# lightgbm
import lightgbm
from lightgbm import LGBMClassifier
lgb = LGBMClassifier()

In [123]:
evals = [(X_test, y_test)]
lgb.fit(X_train, y_train, eval_set=evals, eval_metric='logloss', verbose=True)

[1]	valid_0's binary_logloss: 0.615506
[2]	valid_0's binary_logloss: 0.5971
[3]	valid_0's binary_logloss: 0.578737
[4]	valid_0's binary_logloss: 0.563357
[5]	valid_0's binary_logloss: 0.547289
[6]	valid_0's binary_logloss: 0.536826
[7]	valid_0's binary_logloss: 0.526478
[8]	valid_0's binary_logloss: 0.5196
[9]	valid_0's binary_logloss: 0.509382
[10]	valid_0's binary_logloss: 0.502299
[11]	valid_0's binary_logloss: 0.499422
[12]	valid_0's binary_logloss: 0.49336
[13]	valid_0's binary_logloss: 0.48997
[14]	valid_0's binary_logloss: 0.489997
[15]	valid_0's binary_logloss: 0.489147
[16]	valid_0's binary_logloss: 0.488191
[17]	valid_0's binary_logloss: 0.488111
[18]	valid_0's binary_logloss: 0.488663
[19]	valid_0's binary_logloss: 0.486831
[20]	valid_0's binary_logloss: 0.485895
[21]	valid_0's binary_logloss: 0.486498
[22]	valid_0's binary_logloss: 0.486552
[23]	valid_0's binary_logloss: 0.489444
[24]	valid_0's binary_logloss: 0.489001
[25]	valid_0's binary_logloss: 0.487257
[26]	valid_0's 

In [124]:
lgb.score(X_test, y_test)

0.7142857142857143

In [125]:
pred_lgb = lgb.predict(X_test)

In [126]:
accuracy_score(y_test, pred_lgb), precision_score(y_test, pred_lgb), recall_score(y_test, pred_lgb)

(0.7142857142857143, 0.6, 0.5555555555555556)