In [1]:
import pandas as pd

from sklearn.ensemble import VotingClassifier  # voting 분류기 가져오기
from sklearn.linear_model import LogisticRegression  # 로지스틱회귀(분류모델임!!) 가져오기
from sklearn.neighbors import KNeighborsClassifier  # KNN 모델 가져오기
from sklearn.datasets import load_breast_cancer  # 유방암 데이터셋 가져오기 -> 사이킷런에서 제공하는 기본 데이터셋 중 하나임
from sklearn.model_selection import train_test_split  # 테스트, 트레인 셋 분류
from sklearn.metrics import accuracy_score  # 정확도 가져오기

# Load scikit-learn's bundled breast-cancer dataset.
cancer = load_breast_cancer()

# Wrap the feature matrix in a DataFrame, labelled with the dataset's feature names,
# so it can be inspected in the following cells.
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

In [2]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# The task is to determine whether a tumor is malignant or benign; the features
# (i.e. the columns) describe the tumor's form, such as its size and shape.
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [7]:
# Hold out 20% of the samples for evaluation; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=156,
)

# Base estimators for the ensemble.
# NOTE(review): the features are passed in unscaled and KNN is distance-based;
# consider standardising them — this would change the reported accuracies.
lr = LogisticRegression(max_iter=5000)  # raised iteration cap for the solver
# n_neighbors=8 means each prediction consults the 8 nearest training samples
# (it is the neighbourhood size, not the number of classes).
knn = KNeighborsClassifier(n_neighbors=8)

# Soft-voting classifier combining logistic regression and KNN.
# `estimators` takes the models to ensemble as (name, estimator) tuples.
# `voting` defaults to 'hard', so 'soft' has to be requested explicitly.
vot = VotingClassifier(
    estimators=[('LR', lr), ('KNN', knn)],
    voting='soft',
)

In [8]:
# Train the voting ensemble on the training split, then predict the held-out labels.
# fit() returns the fitted estimator, so predict() can be chained onto it.
pred = vot.fit(X_train, y_train).predict(X_test)

# Report the ensemble's accuracy on the test split, to four decimal places.
print(
    'Voting Classifier accuracy: {0:.4f}'.format(
        accuracy_score(y_test, pred)
    )
)

Voting Classifier accuracy: 0.9474


In [9]:
# Measure the accuracy of each model used in the ensemble on its own, so the
# voting result can be compared against its individual members.
classifier = [lr, knn]

for model in classifier:
    # Each base model is fitted here directly on the training split.
    model.fit(X_train, y_train)
    # Use a dedicated name so the ensemble's `pred` from the previous cell is
    # not overwritten by a single model's predictions.
    model_pred = model.predict(X_test)
    class_name = model.__class__.__name__
    print('{0} 정확도 : {1:.4f}'.format(class_name, accuracy_score(y_test, model_pred)))

LogisticRegression 정확도 : 0.9649
KNeighborsClassifier 정확도 : 0.9386
