# 와인 분류

In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

## 데이터 준비

In [2]:
# Load the bundled wine dataset and pull out the feature matrix and the
# integer class labels in one step.
wines = load_wine()
wines_data, wines_label = wines.data, wines.target

# There is one label per sample, so the label count is the dataset size.
print("전체 데이터:", len(wines_label))

전체 데이터: 178


In [3]:
import pandas as pd

# Tabular view of the features, with the class label appended as the last
# column, for inspection.
df = pd.DataFrame(wines_data, columns=wines.feature_names).assign(
    label=wines.target
)
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [4]:
# Show how many samples belong to each class label.
print("Label 별 데이터 갯수")
df["label"].value_counts()

Label 별 데이터 갯수


1    71
0    59
2    48
Name: label, dtype: int64

In [5]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (71/59/48) - consider stratify=wines_label. Doing so changes
# the split and therefore every metric computed from it.
X_train, X_test, y_train, y_test = train_test_split(
    wines_data,
    wines_label,
    test_size=0.2,  # fraction of samples reserved for testing
    random_state=7,
)

# Report the size of each partition.
print("학습용:", X_train.shape[0])
print("테스트용:", X_test.shape[0])

학습용: 142
테스트용: 36


## 5개 모델 학습

### 1. Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Train a single decision tree on the training split (fixed seed for
# reproducibility) and predict the held-out samples.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# Per-class precision/recall/F1 first, then the raw confusion counts.
print(
    "[Report]\n",
    classification_report(y_true=y_test, y_pred=y_pred, zero_division=0),
)
print("[Confusion Matrix]\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

[Report]
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.89      1.00      0.94        17
           2       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36

[Confusion Matrix]
 [[ 7  0  0]
 [ 0 17  0]
 [ 0  2 10]]


### 2. Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the training split (fixed seed for
# reproducibility) and predict the held-out samples.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Per-class precision/recall/F1 first, then the raw confusion counts.
print(
    "[Report]\n",
    classification_report(y_true=y_test, y_pred=y_pred, zero_division=0),
)
print("[Confusion Matrix]\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

[Report]
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

[Confusion Matrix]
 [[ 7  0  0]
 [ 0 17  0]
 [ 0  0 12]]


### 3. SVM

In [8]:
from sklearn import svm

# Train a support vector classifier with library-default settings on the
# raw training features and predict the held-out samples.
# NOTE(review): the features are on very different scales (proline is in
# the hundreds to thousands, hue is around 1) and scikit-learn documents
# SVMs as not scale invariant - consider standardizing the features first;
# that will change the metrics this cell prints.
# NOTE(review): per the scikit-learn docs, random_state only affects SVC
# when probability estimates are enabled - confirm whether it is needed.
svm_model = svm.SVC(random_state=32).fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Per-class precision/recall/F1 first, then the raw confusion counts.
print(
    "[Report]\n",
    classification_report(y_true=y_test, y_pred=y_pred, zero_division=0),
)
print("[Confusion Matrix]\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

[Report]
               precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.58      0.88      0.70        17
           2       0.33      0.08      0.13        12

    accuracy                           0.61        36
   macro avg       0.59      0.61      0.56        36
weighted avg       0.55      0.61      0.54        36

[Confusion Matrix]
 [[ 6  0  1]
 [ 1 15  1]
 [ 0 11  1]]


### 4. SGD Classifier

In [9]:
from sklearn.linear_model import SGDClassifier

# Train a linear classifier with stochastic gradient descent on the raw
# training features (fixed seed for reproducibility) and predict the
# held-out samples.
# NOTE(review): scikit-learn documents SGD as sensitive to feature scaling
# and the features here are unscaled - consider standardizing them first;
# that will change the metrics this cell prints.
sgd_model = SGDClassifier(random_state=32).fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

# zero_division=0 reports 0 instead of warning when a class has no
# predicted samples.
print(
    "[Report]\n",
    classification_report(y_true=y_test, y_pred=y_pred, zero_division=0),
)
print("[Confusion Matrix]\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

[Report]
               precision    recall  f1-score   support

           0       0.78      1.00      0.88         7
           1       0.59      0.94      0.73        17
           2       0.00      0.00      0.00        12

    accuracy                           0.64        36
   macro avg       0.46      0.65      0.53        36
weighted avg       0.43      0.64      0.51        36

[Confusion Matrix]
 [[ 7  0  0]
 [ 1 16  0]
 [ 1 11  0]]


### 5. Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier on the training split and predict
# the held-out samples.
# NOTE(review): max_iter is set to 5000 - presumably so the solver can
# converge on the unscaled features; confirm.
logistic_model = LogisticRegression(random_state=32, max_iter=5000).fit(
    X_train, y_train
)
y_pred = logistic_model.predict(X_test)

# Per-class precision/recall/F1 first, then the raw confusion counts.
print(
    "[Report]\n",
    classification_report(y_true=y_test, y_pred=y_pred, zero_division=0),
)
print("[Confusion Matrix]\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

[Report]
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.94      1.00      0.97        17
           2       1.00      0.92      0.96        12

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36

[Confusion Matrix]
 [[ 7  0  0]
 [ 0 17  0]
 [ 0  1 11]]


## 모델 평가

### Q.

학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요?

모델의 성능을 평가하는 지표로는 무엇이 좋을까요?

sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

### A.

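평가 지표로는 *Classification Report*의 **f1-score**를 선택했다. 클래스별 데이터 수가 균등하지 않으므로(71/59/48) accuracy만 보기보다 precision과 recall을 함께 반영하는 f1-score가 적절하다고 판단했기 때문이다. weighted avg f1-score 기준으로 성능이 가장 좋은 모델은 **Random Forest**(1.00, 즉 100%)이고, 성능이 가장 낮은 모델은 **SGD Classifier**(0.51, 즉 51%)이다.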

*Confusion Matrix*를 보면 모델이 어떤 클래스의 와인을 혼동하는지 확인할 수 있는데, 대체로 **Class-2 와인의 식별률이 떨어지는** 것을 확인할 수 있다. 특히 SVM과 SGD Classifier는 Class-2 와인 12개 중 11개를 Class-1로 잘못 예측했다.