## 공통 라이브러리

In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics

## 위스콘신 유방암 데이터셋 설명
- 위스콘신 대학교에서 제공한 유방암 진단결과 데이터
- 레코드 개수 : 569개
- 컬럼 개수 : 32개 (원본 데이터 기준: ID, 진단결과, 속성 30개 / scikit-learn 제공 데이터는 속성 30개 + 클래스)

In [2]:
# Load the breast cancer dataset bundled with scikit-learn.
data = datasets.load_breast_cancer()
# Feature matrix (X) and class labels (y).
X, y = data.data, data.target
print(data.DESCR)

# Hold out 20% of the samples for testing; the remaining 80% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=156,
)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [42]:
def get_clf_eval(y_test, y_pred, y_score=None):
    """Print the confusion matrix and common binary-classification metrics.

    Parameters
    ----------
    y_test : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels; used for the confusion matrix, accuracy,
        precision, recall and F1.
    y_score : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]`` or ``decision_function`` output).
        When given, ROC AUC is computed from these scores. When omitted,
        ROC AUC falls back to ``y_pred`` (the previous behavior), which
        evaluates the ROC curve at a single operating point only.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    confusion = metrics.confusion_matrix(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred)
    # ROC AUC is a ranking metric: prefer continuous scores when the caller
    # supplies them, and keep the label-based fallback for existing callers.
    auc_input = y_pred if y_score is None else y_score
    AUC = metrics.roc_auc_score(y_test, auc_input)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))
    

In [27]:
def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):
    """Generate stacking (meta-model) features from one base model via K-fold CV.

    For every fold the model is fit on that fold's training rows and then used
    to predict (a) the held-out validation rows, which fill the out-of-fold
    training feature, and (b) the full ``X_test``; the per-fold test
    predictions are averaged afterwards.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place
        on every fold, so on return it holds whatever state the last ``fit``
        call left behind.
    X_train, y_train : array-like
        Training features and labels. They are indexed with integer index
        arrays (``X_train[train_index]``), so they must support that kind of
        indexing (e.g. NumPy arrays).
    X_test : array-like
        Test features; predicted once per fold.
    n_folds : int, default=5
        Number of K-fold splits.

    Returns
    -------
    train_fold_predict : numpy.ndarray of shape (X_train.shape[0], 1)
        Out-of-fold predictions for the training rows.
    test_predict_mean : numpy.ndarray of shape (X_test.shape[0], 1)
        Mean of the per-fold predictions on ``X_test``.
    """
    # Folds are taken without shuffling, so the split is already deterministic
    # and needs no seed. Passing random_state together with shuffle=False is
    # rejected with a ValueError by scikit-learn >= 0.24 (earlier releases
    # only warned), so it is omitted here; the resulting folds are unchanged.
    kfold = KFold(n_splits=n_folds)

    # Placeholder for the meta-model's training feature: one prediction per
    # training row, filled in fold by fold as each row becomes validation data.
    train_fold_predict = np.zeros((X_train.shape[0], 1))
    # X_test is predicted once per fold, so keep one column per fold and
    # average the columns after the loop.
    test_predict = np.zeros((X_test.shape[0], n_folds))
    print("model : ", model.__class__.__name__)

    for cnt, (train_index, valid_index) in enumerate(kfold.split(X_train)):
        X_train_ = X_train[train_index]
        y_train_ = y_train[train_index]
        X_validation = X_train[valid_index]

        # Fit on this fold's training portion.
        model.fit(X_train_, y_train_)

        # Predict the held-out rows and store them at their original positions.
        train_fold_predict[valid_index, :] = model.predict(X_validation).reshape(-1, 1)

        # Predict the original test set with this fold's model.
        test_predict[:, cnt] = model.predict(X_test)

    # Collapse the per-fold test predictions into a single averaged column.
    test_predict_mean = np.mean(test_predict, axis=1).reshape(-1, 1)

    return train_fold_predict, test_predict_mean

## 학습모델
- SVM
- RandomForestClassifier
- LogisticRegression

In [25]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [36]:
# Build out-of-fold (train) and fold-averaged (test) predictions for each base model.
svm_train, svm_test = get_stacking_data(SVC(kernel='linear'), X_train, y_train, X_test)
# The random forest is stochastic; a fixed seed keeps the stacking features and
# the downstream metrics reproducible across Restart & Run All.
rf_train, rf_test = get_stacking_data(RandomForestClassifier(random_state=0), X_train, y_train, X_test)
# max_iter is raised well above the library default for the logistic-regression solver.
lr_train, lr_test = get_stacking_data(LogisticRegression(max_iter=5000), X_train, y_train, X_test)



model :  SVC




model :  RandomForestClassifier




model :  LogisticRegression


In [37]:
# Join the base-model prediction arrays along the column axis to form the
# meta-model's train and test feature matrices.
new_X_train, new_X_test = (
    np.concatenate(parts, axis=1)
    for parts in (
        (svm_train, rf_train, lr_train),
        (svm_test, rf_test, lr_test),
    )
)

In [38]:
from lightgbm import LGBMClassifier

In [39]:
# Meta-model: train LightGBM on the stacked base-model predictions
# (fit returns the fitted estimator, so construction and fitting are chained).
lgbm = LGBMClassifier(n_estimators=400).fit(new_X_train, y_train)
# Final stacked prediction for the held-out test set.
stack_pred = lgbm.predict(new_X_test)

In [43]:
# Report the stacked model's metrics on the held-out test set.
get_clf_eval(y_test=y_test, y_pred=stack_pred)

오차행렬:
 [[33  4]
 [ 0 77]]

정확도: 0.9649
정밀도: 0.9506
재현율: 1.0000
F1: 0.9747
AUC: 0.9459
