# 3가지 데이터셋으로 모델 평가해보기 

## 프로젝트 공통

In [4]:
# 필요한 모듈 import
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [38]:
# 공통적으로 사용하게 될 모델의 모듈 import 및 생성 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

# 의사결정나무 
decision_tree = DecisionTreeClassifier(random_state=32)
# 랜덤포레스트
random_forest = RandomForestClassifier(random_state=32)
# SVM
svm_model = svm.SVC(random_state=32)
# SGD Classifier
sgd_model = SGDClassifier(random_state=32)
# Logistic Regression
logistic_model = LogisticRegression(max_iter=4000,random_state=32)

In [31]:
?classification_report

## 프로젝트 1 : load_digits : 손글씨 분류

`목표 : 손글씨 이미지를 0~9까지 열가지 카테고리로 분류` 

In [13]:
from sklearn.datasets import load_digits

# (1-1) 데이터 준비
digits = load_digits()
digits_data = digits.data
digits_label = digits.target

# (1-2) 데이터 이해하기
print(digits_data.shape)
print(digits_label.shape)
print(digits.target_names)
#print(digits.DESCR)

(1797, 64)
(1797,)
[0 1 2 3 4 5 6 7 8 9]


In [41]:
# (1-3) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(digits_data, 
                                                    digits_label, 
                                                    test_size=0.2, 
                                                    random_state=7)

# (1-4) 모델별 훈련, 예측, 평가 
for model in (decision_tree, random_forest, svm_model, sgd_model, logistic_model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.__class__.__name__, accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

DecisionTreeClassifier 0.8555555555555555
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.81      0.81      0.81        42
           2       0.79      0.82      0.80        40
           3       0.79      0.91      0.85        34
           4       0.83      0.95      0.89        37
           5       0.90      0.96      0.93        28
           6       0.84      0.93      0.88        28
           7       0.96      0.82      0.89        33
           8       0.88      0.65      0.75        43
           9       0.78      0.78      0.78        32

    accuracy                           0.86       360
   macro avg       0.86      0.86      0.86       360
weighted avg       0.86      0.86      0.85       360

RandomForestClassifier 0.9638888888888889
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.93      1.00      0.97     

## 프로젝트 2 : load_wine : 와인 분류
`목표 : 와인의 어떤 특징으로 와인의 종류를 분류`

In [40]:
from sklearn.datasets import load_wine
# (2-1) 데이터 준비
wine = load_wine()
wine_data = wine.data
wine_label = wine.target

# (2-2) 데이터 이해하기
print(wine_data.shape)
print(wine_label.shape)
print(wine.target_names)
#print(digits.DESCR)

(178, 13)
(178,)
['class_0' 'class_1' 'class_2']


In [43]:
# (2-3) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2, 
                                                    random_state=7)

# (2-4) 모델별 훈련, 예측, 평가 
for model in (decision_tree, random_forest, svm_model, sgd_model, logistic_model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.__class__.__name__, accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred,zero_division=0))

DecisionTreeClassifier 0.9444444444444444
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.89      1.00      0.94        17
           2       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36

RandomForestClassifier 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

SVC 0.6111111111111112
              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.58      0.88    

## 프로젝트 3 : load_cancer : 유방암 여부 진단
`목표 : 환자의 유방암 여부를 분류` 

In [10]:
from sklearn.datasets import load_breast_cancer
# (3-1) 데이터 준비
cancer = load_breast_cancer()
cancer_data = cancer.data
cancer_label = cancer.target

# (3-2) 데이터 이해하기
print(cancer_data.shape)
print(cancer_data.shape)
print(cancer.target_names)
#print(digits.DESCR)

(569, 30)
(569, 30)
['malignant' 'benign']


In [44]:
# (3-3) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(cancer_data, 
                                                    cancer_label, 
                                                    test_size=0.2, 
                                                    random_state=7)

# (1-4) 모델별 훈련, 예측, 평가 
for model in (decision_tree, random_forest, svm_model, sgd_model, logistic_model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.__class__.__name__, accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

DecisionTreeClassifier 0.9122807017543859
              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114

RandomForestClassifier 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114

SVC 0.9035087719298246
              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86   

## 느낀점

`여러가지 모델로 여러가지 테이터셋을 실험해 볼 수 있는 좋은 기회였다. 
차후에 모델을 평가한 결과값으로 비교 그래프를 그려봐도 좋을 거 같다.` 