In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [5]:
pima = pd.read_csv('pima-indians-diabetes.csv', skiprows=9, encoding = 'utf-8', header=None)
pima.columns = ['임신횟수', '2h 포도당 농도', '이완시 혈압', '피부두께', '인슐린농도', 'BMI', '당뇨유전가능성', '나이', '당뇨여부']
pima

Unnamed: 0,임신횟수,2h 포도당 농도,이완시 혈압,피부두께,인슐린농도,BMI,당뇨유전가능성,나이,당뇨여부
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# 1. 데이터 전처리

* 결측치 확인

In [6]:
pima.isna().sum()

임신횟수         0
2h 포도당 농도    0
이완시 혈압       0
피부두께         0
인슐린농도        0
BMI          0
당뇨유전가능성      0
나이           0
당뇨여부         0
dtype: int64

* 카테고리 값이 없으므로 인코딩은 생략

* X데이터와 y데이터를 생성

In [13]:
X = pima.iloc[:, :-1].values
y = pima['당뇨여부'].values
X.shape, y.shape

((768, 8), (768,))

# 2. Train/Test 데이터셋으로 분리

In [14]:
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([500, 268], dtype=int64))

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [17]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

# 3. RandomForest 모델로 학습

In [18]:
from sklearn.ensemble import RandomForestClassifier

* RandomForestClassifier의 객체 생성

In [21]:
rfc = RandomForestClassifier()
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [23]:
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [24]:
rfc.score(X_test, y_test)

0.7857142857142857

* 최적 파라미터 튜닝하기

In [22]:
params = {
    'max_depth' : [2, 4, 6, 8],
    'min_samples_split' : [2, 4, 6]
}

In [25]:
from sklearn.model_selection import GridSearchCV
grid_rfc = GridSearchCV(rfc, param_grid=params, scoring = 'accuracy', cv = 5)

In [26]:
grid_rfc.fit(X_train, y_train)
grid_rfc.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [27]:
params = {
    'max_depth' : [3, 4, 5],
    'min_samples_split' : [2, 3, 4]
}

grid_rfc = GridSearchCV(rfc, param_grid=params, scoring = 'accuracy', cv = 5)
grid_rfc.fit(X_train, y_train)
grid_rfc.best_params_

{'max_depth': 4, 'min_samples_split': 3}

In [30]:
# Train set에 대한 교차검증에서의 best score

grid_rfc.best_score_

0.7523790483806477

In [29]:
# Test set에 대한 score

best_rfc = grid_rfc.best_estimator_
best_rfc.score(X_test, y_test)

0.8311688311688312

# restart

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pima = pd.read_csv('pima-indians-diabetes.csv', skiprows=9, encoding = 'utf-8', header=None)
pima.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
pima

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [6]:
np.unique(pima.Outcome, return_counts=True)

(array([0, 1], dtype=int64), array([500, 268], dtype=int64))

In [7]:
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
X = pima.iloc[:, :-1]
y = pima.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=156
)

# 학습
lr_clf = LogisticRegression(max_iter=500)
lr_clf.fit(X_train, y_train)

# 예측
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)

In [9]:
pred_proba()

TypeError: 'numpy.ndarray' object is not callable