In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

cancer = load_breast_cancer()

cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
cancer_df["target"] = cancer.target
print(cancer_df)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [5]:
# 입력(특성)데이터와 타겟데이터를 각각 넘파이 배열로 반환
cancer_input = cancer_df[cancer.feature_names].to_numpy()
cancer_target = cancer_df['target'].to_numpy()

# 스케일링 전처리 : 평균이 0, 분산이 1인 데이터 분포가 되도록 변환
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
cancer_input = ss.fit_transform(cancer_input)

[[ 1.09706398 -2.07333501  1.26993369 ...  2.29607613  2.75062224
   1.93701461]
 [ 1.82982061 -0.35363241  1.68595471 ...  1.0870843  -0.24388967
   0.28118999]
 [ 1.57988811  0.45618695  1.56650313 ...  1.95500035  1.152255
   0.20139121]
 ...
 [ 0.70228425  2.0455738   0.67267578 ...  0.41406869 -1.10454895
  -0.31840916]
 [ 1.83834103  2.33645719  1.98252415 ...  2.28998549  1.91908301
   2.21963528]
 [-1.80840125  1.22179204 -1.81438851 ... -1.74506282 -0.04813821
  -0.75120669]]


In [11]:
from sklearn.model_selection import train_test_split

# 훈련 세트와 검증 세트 7:3 으로 분할
train_input, test_input, train_target, test_target = \
train_test_split(cancer_input, cancer_target, test_size=0.3, random_state=23)

from sklearn.linear_model import LogisticRegression
import numpy as np

lr = LogisticRegression()
lr.fit(train_input, train_target)

print(lr.classes_)
print(cancer.target_names) # 0일 때가 악성종양, 1일 때가 양성종양 : 즉 0일 때가 암

[0 1]
['malignant' 'benign']


In [12]:
print(lr.score(train_input, train_target))
print(lr.score(test_input, test_target))

0.9899497487437185
0.9766081871345029


In [13]:
print(lr.predict(test_input[:10]))
proba = lr.predict_proba(test_input[:10])
print(np.round(proba, decimals=3))

[1 1 0 1 0 0 0 1 1 0]
[[0.007 0.993]
 [0.467 0.533]
 [1.    0.   ]
 [0.03  0.97 ]
 [0.998 0.002]
 [1.    0.   ]
 [1.    0.   ]
 [0.089 0.911]
 [0.005 0.995]
 [0.999 0.001]]


In [14]:
# KNN(K최근접이웃) 분류 모델로 하면 어떨지?
from sklearn.neighbors import KNeighborsClassifier

# 근접 이웃 수에 따라 확률이 단순하게 정해지므로 신뢰도가 높지 않다
kn = KNeighborsClassifier(n_neighbors=5)
kn.fit(train_input, train_target)

print(kn.score(train_input, train_target))
print(kn.score(test_input, test_target))

print(kn.predict(test_input[:10]))
proba = kn.predict_proba(test_input[:10])
print(np.round(proba, decimals=3))

0.9698492462311558
0.9766081871345029
[1 0 0 1 0 0 0 1 1 0]
[[0.  1. ]
 [0.6 0.4]
 [1.  0. ]
 [0.  1. ]
 [1.  0. ]
 [1.  0. ]
 [1.  0. ]
 [0.2 0.8]
 [0.  1. ]
 [1.  0. ]]
