# 1. Module Import

In [21]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# 2. Load Data

In [22]:
# Load the breast-cancer dataset container; return_X_y=False (the default)
# keeps the full container object rather than a bare (data, target) pair.
cancer = load_breast_cancer(return_X_y=False)

In [23]:
# Collect the names that `dir` reports for the dataset object, then print them.
attribute_names = dir(cancer)
print(attribute_names)

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


# 3. EDA

- Feature Data
- Label Data
- Print Target Names
- Data description

In [24]:
# Bind the feature data and the label data in a single tuple assignment.
cancer_data, cancer_label = cancer.data, cancer.target

In [25]:
# Display the class names stored on the dataset (key access is equivalent to
# attribute access on this dict-like container).
cancer["target_names"]

array(['malignant', 'benign'], dtype='<U9')

In [26]:
# Print the description text bundled with the dataset.
dataset_description = cancer["DESCR"]
print(dataset_description)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [27]:
# Show every key available on the dataset container.
dataset_keys = cancer.keys()
dataset_keys

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [28]:
# Re-bind the feature data and report its shape.
cancer_data = cancer["data"]

data_shape = cancer_data.shape
print(data_shape)

(569, 30)


# 4. Split Train and Test Data Sets

In [29]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(
    cancer_data,
    cancer_label,
    test_size=0.2,
    random_state=7,
)
X_train, X_test, y_train, y_test = split_parts

# 5. Model training

In [30]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# `fit` returns the estimator itself, so construction and training are chained.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)

In [31]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Seeded so the ensemble is reproducible; `fit` returns the fitted estimator.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)

In [32]:
# SVM
from sklearn import svm

# NOTE(review): the features reach SVC unscaled; kernel SVMs are sensitive to
# feature scale, so standardizing first may be worth trying — confirm.
svm_model = svm.SVC().fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

In [33]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data while fitting, so without a seed every run can
# yield a different model (and a different report). Pin the same seed that the
# tree-based models in this notebook use so the result is reproducible.
sgd_model = SGDClassifier(random_state=32)

# NOTE(review): the features are fed in unscaled; SGD is sensitive to feature
# scale, so standardizing them first may be worth trying — confirm.
sgd_model.fit(X_train, y_train)
y_pred_sgd = sgd_model.predict(X_test)

In [34]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 5000 to resolve the
# "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT." error.
logistic_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)
y_pred_lp = logistic_model.predict(X_test)

# 6. Model Evaluation

In [35]:
# Decision tree: build the text report for the test-set predictions, then print it.
report_dt = classification_report(y_true=y_test, y_pred=y_pred_dt)
print(report_dt)

              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114



In [36]:
# Random Forest: build the text report for the test-set predictions, then print it.
report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
print(report_rf)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114



In [37]:
# SVM: build the text report for the test-set predictions, then print it.
report_svm = classification_report(y_true=y_test, y_pred=y_pred_svm)
print(report_svm)

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



In [38]:
# SGD Classifier
# zero_division=0 was added to resolve an error raised while building this report.
report_sgd = classification_report(y_true=y_test, y_pred=y_pred_sgd, zero_division=0)
print(report_sgd)

              precision    recall  f1-score   support

           0       1.00      0.53      0.69        40
           1       0.80      1.00      0.89        74

    accuracy                           0.83       114
   macro avg       0.90      0.76      0.79       114
weighted avg       0.87      0.83      0.82       114



In [39]:
# Logistic Regression: build the text report for the test-set predictions, then print it.
report_lr = classification_report(y_true=y_test, y_pred=y_pred_lp)
print(report_lr)

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



# Description

1. How is Random Forest's score 1.00?
2. I recommend Logistic Regression because of the RF overfitting issue.