# wine 분류하기

### (1) 필요한 모듈 import

In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
import numpy as np

### (2) 데이터 준비

In [2]:
wines = load_wine()
wines_data = wines.data
wines_label = wines.target

In [3]:
print(wines_data.shape)

(178, 13)


In [4]:
print(wines.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [5]:
print(wines.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [6]:
print(wines.target_names)

['class_0' 'class_1' 'class_2']


In [7]:
print(wines.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

### (3) train, test 데이터 분리

In [8]:
x_train, x_test, y_train, y_test = train_test_split(wines_data, 
                                                    wines_label, 
                                                    test_size=0.2, 
                                                    random_state=7)

In [9]:
x_train.shape, y_train.shape

((142, 13), (142,))

In [10]:
# 정규화 여기선 불필요

x_train_norm, x_test_norm = x_train, x_test

### (4) 모델 학습 및 예측

In [11]:
# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(x_train_norm, y_train)
decision_tree_y_pred = decision_tree.predict(x_test_norm)

print(classification_report(y_test, decision_tree_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.89      1.00      0.94        17
           2       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36



In [12]:
# RandomForest
random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(x_train_norm, y_train)
random_forest_y_pred = random_forest.predict(x_test_norm)

print(classification_report(y_test, random_forest_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [13]:
# SVM
svm_model = svm.SVC()
svm_model.fit(x_train_norm, y_train)
svm_y_pred = svm_model.predict(x_test_norm)

print(classification_report(y_test, svm_y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.58      0.88      0.70        17
           2       0.33      0.08      0.13        12

    accuracy                           0.61        36
   macro avg       0.59      0.61      0.56        36
weighted avg       0.55      0.61      0.54        36



In [14]:
# SGD
sgd_model = SGDClassifier()
sgd_model.fit(x_train_norm, y_train)
sgd_y_pred = sgd_model.predict(x_test_norm)

print(classification_report(y_test, sgd_y_pred))

              precision    recall  f1-score   support

           0       0.35      1.00      0.52         7
           1       0.75      0.71      0.73        17
           2       0.00      0.00      0.00        12

    accuracy                           0.53        36
   macro avg       0.37      0.57      0.42        36
weighted avg       0.42      0.53      0.44        36



  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Logistic Regression
logistic_model = LogisticRegression(max_iter=4096)
logistic_model.fit(x_train_norm, y_train)
logistic_y_pred = logistic_model.predict(x_test_norm)

print(classification_report(y_test, logistic_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.94      1.00      0.97        17
           2       1.00      0.92      0.96        12

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36



In [16]:
# 종합 평가
# 와인은 판단 기준이 와인의 종류를 정확하게 알아맞추는 것이 중요하다고 생각하기 때문에
# Recall 이 중요하다고 생각되어 Recall 의 평균 값만 출력해보았다.
# wine의 경우에는 feature 의 갯수가 많아서 Decision Tree 또는 Random Forest가
# 우수한 성능을 나타낼 것으로 예상된다.

print('Decision Tree       : {}'.format(recall_score(y_test, decision_tree_y_pred, average='weighted')))
print('Random Forest       : {}'.format(recall_score(y_test, random_forest_y_pred, average='weighted')))
print('SVM                 : {}'.format(recall_score(y_test, svm_y_pred, average='weighted')))
print('SGD                 : {}'.format(recall_score(y_test, sgd_y_pred, average='weighted')))
print('Logistic Regression : {}'.format(recall_score(y_test, logistic_y_pred, average='weighted')))

Decision Tree       : 0.9444444444444444
Random Forest       : 1.0
SVM                 : 0.6111111111111112
SGD                 : 0.5277777777777778
Logistic Regression : 0.9722222222222222


In [17]:
# 결론 : 예상과 같이 Random Forest 모델이 가장 우수한 것으로 나타났다.
# 아무래도 wine이라면 feature의 수에 따라서 어떤 와인인지 확실하게 알 수 있기 때문인 것 같다.