In [1]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset bundled with scikit-learn and keep the
# feature matrix and the class labels under separate names.
data = load_breast_cancer()
x_data, y_label = data.data, data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_label,
    test_size=0.2,
    random_state=0,
)

In [2]:
# Base learners for the stacking ensemble. Every estimator that draws random
# numbers gets an explicit seed so that Restart & Run All reproduces the same
# metrics (the tree and AdaBoost models were previously unseeded).
knn = KNeighborsClassifier(n_neighbors=4)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
dt = DecisionTreeClassifier(random_state=0)
ada = AdaBoostClassifier(n_estimators=100, random_state=0)

# Meta-learner; it is only constructed here and is fitted in later cells on
# the base learners' predictions.
lr = LogisticRegression(C=10)

# Fit each base learner on the full training split.
knn.fit(x_train, y_train)
rf.fit(x_train, y_train)
dt.fit(x_train, y_train)
ada.fit(x_train, y_train)

AdaBoostClassifier(n_estimators=100)

In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels. Required despite the ``None`` default, which
        is kept only so existing call signatures keep working.
    pred_proba : array-like, optional
        Scores for the positive class, passed to ``roc_auc_score``. When
        omitted, the AUC part of the summary line is left out.

    Raises
    ------
    ValueError
        If ``pred`` is not supplied.
    """
    if pred is None:
        # Fail early with a clear message instead of an opaque error raised
        # from inside the metric functions.
        raise ValueError('pred (predicted class labels) is required')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC is computed before anything is printed so that a failure here
    # does not leave partial output behind.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)
    summary = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f} F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        # Append the ROC-AUC figure only when scores were provided.
        summary += ', AUC:{0:.4f}'.format(roc_auc)
    print(summary)

In [4]:
# Hard class predictions from every base learner on the held-out test split.
knn_pred, rf_pred, dt_pred, ada_pred = (
    clf.predict(x_test) for clf in (knn, rf, dt, ada)
)

# Column 1 of predict_proba is kept as the score handed to ROC-AUC.
knn_proba, rf_proba, dt_proba, ada_proba = (
    clf.predict_proba(x_test)[:, 1] for clf in (knn, rf, dt, ada)
)

# k-nearest neighbours: hold-out metrics.
get_clf_eval(y_test, pred=knn_pred, pred_proba=knn_proba)

오차 행렬
[[44  3]
 [ 6 61]]
정확도: 0.9211, 정밀도: 0.9531, 재현율: 0.9104 F1: 0.9313, AUC:0.9568


In [5]:
# Random forest: hold-out metrics.
get_clf_eval(y_test, pred=rf_pred, pred_proba=rf_proba)

오차 행렬
[[46  1]
 [ 3 64]]
정확도: 0.9649, 정밀도: 0.9846, 재현율: 0.9552 F1: 0.9697, AUC:0.9965


In [6]:
# Decision tree: hold-out metrics.
get_clf_eval(y_test, pred=dt_pred, pred_proba=dt_proba)

오차 행렬
[[43  4]
 [ 8 59]]
정확도: 0.8947, 정밀도: 0.9365, 재현율: 0.8806 F1: 0.9077, AUC:0.8977


In [7]:
# AdaBoost: hold-out metrics.
get_clf_eval(y_test, pred=ada_pred, pred_proba=ada_proba)

오차 행렬
[[44  3]
 [ 2 65]]
정확도: 0.9561, 정밀도: 0.9559, 재현율: 0.9701 F1: 0.9630, AUC:0.9968


In [8]:
# Stack the four prediction vectors and flip the axes so that each row is one
# test sample and each column is one base learner (knn, rf, dt, ada).
preds = np.array([knn_pred, rf_pred, dt_pred, ada_pred]).T
preds

array([[0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 1, 0, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 1, 1],
       [0, 0, 1, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 1, 0, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 1,

In [9]:
# Check the dimensions of the stacked prediction matrix built from the four
# base learners' test-set predictions.
preds.shape

(114, 4)

In [11]:
# NOTE: the meta-learner is fitted on the test-set predictions and labels and
# is then scored on those very same rows, so the metrics printed below are
# measured on data the model has already seen.
lr.fit(preds, y_test)
lr_pred, lr_prob = lr.predict(preds), lr.predict_proba(preds)[:, 1]
get_clf_eval(y_test, pred=lr_pred, pred_proba=lr_prob)

오차 행렬
[[46  1]
 [ 2 65]]
정확도: 0.9737, 정밀도: 0.9848, 재현율: 0.9701 F1: 0.9774, AUC:0.9790


In [12]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, x_train_n, y_train_n, x_test_n, n_folds):
    """Build cross-validated meta-features for one base model.

    The training data is split into ``n_folds`` consecutive folds. For each
    fold the model is fitted on the remaining folds and then predicts
    (a) the held-out fold, which fills the matching rows of the training
    meta-feature, and (b) the whole test set, which fills one column of a
    per-fold test prediction matrix.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so after the call it holds the fit from the
        last fold.
    x_train_n, y_train_n : array-like
        Training features and labels; indexed with integer index arrays.
    x_test_n : array-like
        Test features.
    n_folds : int
        Number of folds.

    Returns
    -------
    train_pred : ndarray of shape (n_train, 1)
        Out-of-fold predictions for every training row.
    test_pred_mean : ndarray of shape (n_test, 1)
        Test-set predictions averaged over the folds.
    """
    # Without shuffling the split is already deterministic. Combining
    # shuffle=False with a random_state is rejected with a ValueError by
    # recent scikit-learn releases, so no seed is passed.
    kf = KFold(n_splits=n_folds, shuffle=False)
    train_pred = np.zeros((x_train_n.shape[0], 1))
    test_pred = np.zeros((x_test_n.shape[0], n_folds))
    print(model.__class__.__name__, ' 시작')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(x_train_n)):
        print('\t 폴드 세트: ', folder_counter, ' 시작')
        x_tr = x_train_n[train_index]
        y_tr = y_train_n[train_index]
        x_te = x_train_n[valid_index]

        model.fit(x_tr, y_tr)
        # Held-out rows receive predictions from a model that never saw them.
        train_pred[valid_index, :] = model.predict(x_te).reshape(-1, 1)
        # Each fold's model also predicts the full test set; one column per fold.
        test_pred[:, folder_counter] = model.predict(x_test_n)

    # Collapse the per-fold test predictions into a single column.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    return train_pred, test_pred_mean

In [13]:
# Generate the meta-features of every base learner with 7-fold splitting.
knn_train, knn_test = get_stacking_base_datasets(
    knn, x_train, y_train, x_test, n_folds=7)
rf_train, rf_test = get_stacking_base_datasets(
    rf, x_train, y_train, x_test, n_folds=7)
dt_train, dt_test = get_stacking_base_datasets(
    dt, x_train, y_train, x_test, n_folds=7)
ada_train, ada_test = get_stacking_base_datasets(
    ada, x_train, y_train, x_test, n_folds=7)



KNeighborsClassifier  시작
	 폴드 세트:  0  시작
	 폴드 세트:  1  시작
	 폴드 세트:  2  시작
	 폴드 세트:  3  시작
	 폴드 세트:  4  시작
	 폴드 세트:  5  시작
	 폴드 세트:  6  시작
RandomForestClassifier  시작
	 폴드 세트:  0  시작
	 폴드 세트:  1  시작
	 폴드 세트:  2  시작
	 폴드 세트:  3  시작
	 폴드 세트:  4  시작
	 폴드 세트:  5  시작
	 폴드 세트:  6  시작




DecisionTreeClassifier  시작
	 폴드 세트:  0  시작
	 폴드 세트:  1  시작
	 폴드 세트:  2  시작
	 폴드 세트:  3  시작
	 폴드 세트:  4  시작
	 폴드 세트:  5  시작
	 폴드 세트:  6  시작
AdaBoostClassifier  시작
	 폴드 세트:  0  시작
	 폴드 세트:  1  시작
	 폴드 세트:  2  시작
	 폴드 세트:  3  시작
	 폴드 세트:  4  시작
	 폴드 세트:  5  시작
	 폴드 세트:  6  시작


In [14]:
# Place the four models' single-column meta-features side by side, giving one
# column per base learner for both the training and the test rows.
stack_train = np.hstack([knn_train, rf_train, dt_train, ada_train])
stack_test = np.hstack([knn_test, rf_test, dt_test, ada_test])

In [15]:
# Compare the original feature matrices with the stacked meta-feature matrices.
print(f"{x_train.shape} {x_test.shape}")
print(f"{stack_train.shape} {stack_test.shape}")

(455, 30) (114, 30)
(455, 4) (114, 4)


In [17]:
# Train the meta-learner on the out-of-fold predictions for the training rows,
# then score it on the fold-averaged predictions for the held-out test rows.
lr.fit(stack_train, y_train)
final_pred, final_prob = lr.predict(stack_test), lr.predict_proba(stack_test)[:, 1]
get_clf_eval(y_test, pred=final_pred, pred_proba=final_prob)

오차 행렬
[[46  1]
 [ 1 66]]
정확도: 0.9825, 정밀도: 0.9851, 재현율: 0.9851 F1: 0.9851, AUC:0.9975


In [18]:
# Accuracy of the stacked model on the held-out test split, unrounded.
accuracy_score(y_true=y_test, y_pred=final_pred)

0.9824561403508771