In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. 데이터 준비

In [2]:
# Load the breast cancer dataset that ships with scikit-learn.
breast_cancer = load_breast_cancer()

# 2. 데이터 이해하기

In [3]:
# Pull the feature matrix and the label vector out of the loaded dataset.
breast_cancer_data, breast_cancer_label = breast_cancer.data, breast_cancer.target
# Show the description text bundled with the dataset.
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## 2.1 breast cancer에 어떤 정보가 있는가?

In [4]:
# List the fields available on the loaded dataset object.
print(breast_cancer.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


## 2.2 breast cancer의 입력 데이터는 어떤 구조인가?

In [5]:
# Shape of the feature array: (number of samples, number of features).
print(breast_cancer_data.shape)

(569, 30)


## 2.3 breast cancer의 출력 데이터는 어떤 구조인가?

In [6]:
# Shape of the label array: one label per sample.
print(breast_cancer_label.shape)

(569,)


In [7]:
# Print the class names of the target.
# NOTE(review): the position of each name presumably matches the integer label
# (0, 1) shown in the classification reports — confirm against the dataset docs.
print(breast_cancer.target_names)

['malignant' 'benign']


# 3. 데이터 전처리

In [8]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data,
    breast_cancer_label,
    test_size=0.2,
    random_state=7,
)

# 4. 여러 모델로 학습 해보기

## 4.1 Decision Tree 모델 학습 및 예측

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Build the tree with a fixed seed and train it in one chained call
# (fit returns the estimator itself).
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)

# Predict on the held-out split and print per-class precision / recall / F1.
y_pred = decision_tree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114



## 4.2 Random Forest 모델 학습 및 예측

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest, trained in a single chained expression (fit returns the estimator).
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)

# Evaluate on the test split and print the per-class report.
y_pred = random_forest.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114



## 4.3 SVM 모델 학습 및 예측

In [11]:
from sklearn import svm

# Support vector classifier with library-default settings, fitted on the training split.
svm_model = svm.SVC().fit(X_train, y_train)

# Evaluate on the test split and print the per-class report.
y_pred = svm_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



## 4.4 SGD Classifier 모델 학습 및 예측

In [12]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training samples while fitting, so an unseeded model gives a
# different report on every run. Pin the seed (same value the tree models in
# this notebook use) so Restart & Run All reproduces the result.
sgd_model = SGDClassifier(random_state=32)
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

# NOTE(review): the features are passed in unscaled; SGD is sensitive to feature
# scale, so consider standardizing before comparing this score with the others.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.95      0.81        40
           1       0.97      0.78      0.87        74

    accuracy                           0.84       114
   macro avg       0.84      0.87      0.84       114
weighted avg       0.87      0.84      0.85       114



## 4.5 Logistic Regression 모델 학습 및 예측

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the solver's iteration cap raised to 10000,
# fitted in one chained call (fit returns the estimator).
logistic_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Evaluate on the test split and print the per-class report.
y_pred = logistic_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



# 5. 모델 평가

### Breast cancer dataset에서 사용한 여러 모델 중 정확도와 precision, recall 모든 지표가 좋은 모델은 Decision Tree, Random Forest, Logistic Regression 모델이었다.
### 위 3가지 모델 중 Random Forest는 과적합 때문에 100%의 정확도가 나왔다고 생각한다.
### 그렇기 때문에 Random Forest는 모델 후보군에서 제외하고자 한다.
# 
### Decision Tree와 Logistic Regression은 좋은 정확도와 적절한 precision, recall을 보여주었다.
### 유방암 환자 선별 모델은 실제 암 환자를 절대 놓쳐서는 안 되므로, recall 지표가 가장 중요하다고 생각한다.
### 두 후보군 중 Logistic Regression의 recall이 가장 좋은 결과를 나타냈으므로 Logistic Regression을 사용하고자 한다.
# 
### sklearn.metrics 모듈에서는 분류 메트릭이 적합하다고 생각한다.
### 훈련 데이터가 암 여부를 나타내는 범주형 데이터이기 때문이다.