In [74]:
# (1) 필요한 모듈 import
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [75]:
# (2) 데이터 준비
wine = load_wine()
wine_data = wine.data
wine_label = wine.target

In [76]:
# (3) 데이터 살펴보기
wine.keys()
wine_data = wine.data
print(wine_data.shape) 
wine_data[0]
dir(wine)
print(wine_label.shape)
wine_label
wine.target_names
print(wine.DESCR)
wine.feature_names

(178, 13)
(178,)
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [77]:
# (3) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2,
                                                    stratify=wine_label, 
                                                    random_state=7)

In [78]:
# (4-1) 모델 학습 및 예측 : Decision Tree
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print("*** Decision Tree ***")

print("(1) acc 결과: {:.2f} ".format(accuracy_score(y_test , y_pred)))
print("(2) cr 결과: \n ", classification_report(y_test, y_pred))
if pd.Series(y_test).nunique() >=3:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred, average='macro')))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred, average='macro')))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred, average='macro')))
else:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred)))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred)))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred)))

*** Decision Tree ***
(1) acc 결과: 0.94 
(2) cr 결과: 
                precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.93      0.93      0.93        14
           2       0.91      1.00      0.95        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.95      0.94      0.94        36

(3) f1 결과: 0.95 
(4) recall 결과: 0.95 
(5) precision 결과: 0.95 


In [79]:
# (4-2) 모델 학습 및 예측 : Random Forest
random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

print("*** Random Forest ***")

print("(1) acc 결과: {:.2f} ".format(accuracy_score(y_test , y_pred)))
print("(2) cr 결과: \n ", classification_report(y_test, y_pred))
if pd.Series(y_test).nunique() >=3:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred, average='macro')))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred, average='macro')))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred, average='macro')))
else:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred)))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred)))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred)))

*** Random Forest ***
(1) acc 결과: 0.97 
(2) cr 결과: 
                precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      0.93      0.96        14
           2       0.91      1.00      0.95        10

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36

(3) f1 결과: 0.97 
(4) recall 결과: 0.98 
(5) precision 결과: 0.97 


In [80]:
# (4-3) 모델 학습 및 예측 : SVM
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

print("*** SVM ***")

print("(1) acc 결과: {:.2f} ".format(accuracy_score(y_test , y_pred)))
print("(2) cr 결과: \n ", classification_report(y_test, y_pred))
if pd.Series(y_test).nunique() >=3:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred, average='macro')))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred, average='macro')))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred, average='macro')))
else:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred)))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred)))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred)))

*** SVM ***
(1) acc 결과: 0.64 
(2) cr 결과: 
                precision    recall  f1-score   support

           0       1.00      0.75      0.86        12
           1       0.60      0.86      0.71        14
           2       0.29      0.20      0.24        10

    accuracy                           0.64        36
   macro avg       0.63      0.60      0.60        36
weighted avg       0.65      0.64      0.63        36

(3) f1 결과: 0.60 
(4) recall 결과: 0.60 
(5) precision 결과: 0.63 


In [81]:
# (4-4) 모델 학습 및 예측 : SGDClassifier
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

print("*** SGDClassifier ***")

print("(1) acc 결과: {:.2f} ".format(accuracy_score(y_test , y_pred)))
print("(2) cr 결과: \n ", classification_report(y_test, y_pred))
if pd.Series(y_test).nunique() >=3:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred, average='macro')))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred, average='macro')))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred, average='macro')))
else:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred)))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred)))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred)))

*** SGDClassifier ***
(1) acc 결과: 0.58 
(2) cr 결과: 
                precision    recall  f1-score   support

           0       0.80      0.67      0.73        12
           1       0.50      0.93      0.65        14
           2       0.00      0.00      0.00        10

    accuracy                           0.58        36
   macro avg       0.43      0.53      0.46        36
weighted avg       0.46      0.58      0.50        36

(3) f1 결과: 0.46 
(4) recall 결과: 0.53 
(5) precision 결과: 0.43 


In [82]:
# (4-5) 모델 학습 및 예측 : LogisticRegression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

print("*** LogisticRegression ***")

print("(1) acc 결과: {:.2f} ".format(accuracy_score(y_test , y_pred)))
print("(2) cr 결과: \n ", classification_report(y_test, y_pred))
if pd.Series(y_test).nunique() >=3:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred, average='macro')))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred, average='macro')))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred, average='macro')))
else:
    print("(3) f1 결과: {:.2f} ".format(f1_score(y_test, y_pred)))
    print("(4) recall 결과: {:.2f} ".format(recall_score(y_test, y_pred)))
    print("(5) precision 결과: {:.2f} ".format(precision_score(y_test, y_pred)))

*** LogisticRegression ***
(1) acc 결과: 0.89 
(2) cr 결과: 
                precision    recall  f1-score   support

           0       1.00      0.83      0.91        12
           1       0.81      0.93      0.87        14
           2       0.90      0.90      0.90        10

    accuracy                           0.89        36
   macro avg       0.90      0.89      0.89        36
weighted avg       0.90      0.89      0.89        36

(3) f1 결과: 0.89 
(4) recall 결과: 0.89 
(5) precision 결과: 0.90 


In [None]:
### 어떤 점수를 기반으로 모델을 선택할 것인가?
와인 추천은 Precision 점수를 기반으로 선택하는 것이 만족도가 높다고 생각한다. 
- Precision : 그 특성을 갖는 와인이 아닌데, 그 특성인 와인으로 판단하는 경우, 더 중요함
- Recall : 그 특성을 갖는 와인인데, 그 특성을 갖지 않은 와인으로 판단하는 경우 
그래서 Precision 점수가 가장 높은 0.97인 RandomForest 모델을 선택하기로 했다. 