In [10]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pandas as pd

#load_data
wines = load_wine()
wines_data = wines.data
wines_label = wines.target

#check_data
print('data shape :', wines_data.shape)
print(wines_label.shape)
print('data[0] : \n', wines_data[0])
print('label :', wines_label)
print('target_names :', wines.target_names)
print('keys : ', wines.keys())

wines_df = pd.DataFrame(data = wines_data, columns = wines.feature_names)
wines_df["label"] = wines.target
wines_df

data shape : (178, 13)
(178,)
data[0] : 
 [1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
 2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
label : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
target_names : ['class_0' 'class_1' 'class_2']
keys :  dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [2]:
print(wines.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [3]:
#train, test data separate

X_train, X_test, y_train, y_test = train_test_split(wines_data, 
                                                    wines_label, 
                                                    test_size=0.2, 
                                                    random_state=21)

print('X_train : ', len(X_train), ', X_test : ', len(X_test))
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

X_train :  142 , X_test :  36
(142, 13) (142,)
(36, 13) (36,)


In [4]:
#Decision Tree Model

from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)

print('confusion matrix : \n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       0.80      0.80      0.80        10
           2       0.89      0.73      0.80        11

    accuracy                           0.86        36
   macro avg       0.86      0.84      0.85        36
weighted avg       0.86      0.86      0.86        36

accuracy :  0.8611111111111112
confusion matrix : 
 [[15  0  0]
 [ 1  8  1]
 [ 1  2  8]]


In [5]:
#Random Forest Model

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)

print('confusion matrix : \n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        11

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

accuracy :  1.0
confusion matrix : 
 [[15  0  0]
 [ 0 10  0]
 [ 0  0 11]]


In [6]:
#SVM Model
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)

print('confusion matrix : \n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.45      1.00      0.62        10
           2       0.00      0.00      0.00        11

    accuracy                           0.61        36
   macro avg       0.44      0.60      0.48        36
weighted avg       0.48      0.61      0.52        36

accuracy :  0.6111111111111112
confusion matrix : 
 [[12  3  0]
 [ 0 10  0]
 [ 2  9  0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
#SGD Classifier Model

from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)

print('confusion matrix : \n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80        15
           1       0.38      1.00      0.56        10
           2       0.00      0.00      0.00        11

    accuracy                           0.56        36
   macro avg       0.46      0.56      0.45        36
weighted avg       0.52      0.56      0.49        36

accuracy :  0.5555555555555556
confusion matrix : 
 [[10  5  0]
 [ 0 10  0]
 [ 0 11  0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
#Logistic Regression Model

from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)

print('confusion matrix : \n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        11

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

accuracy :  1.0
confusion matrix : 
 [[15  0  0]
 [ 0 10  0]
 [ 0  0 11]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# 와인 데이터
-----
## 데이터 정보
* 총 178개   
* feature는 총 13개로, Alcohol, Malic acid, Color intensity 등 와인의 특성값   
* label은 class 0, 1, 2의 세 가지 카테고리   

## 모델 평가
* Random Forest Model과 Logistic Regression Model은 정확도가 1.0이 나옴.   
* Decision Tree Model은 정확도가 위의 2개의 모델이 비하면 낮으나 나머지 2개의 모델의 정확도에 비하면 높음.   
 - Decision Tree Model accuracy :  0.8611111111111112
 - SVM Model accuracy :  0.6111111111111112
 - SGD Classifier Model accuracy :  0.5555555555555556   
* 정확도가 낮은 SVM, SGD Model은 적합하지 않다고 판단.      
* Decision Tree 모델이 가장 적합하다고 생각했는데 metrics를 보면 또 가장 적합한 모델이라고 판단할 순 없을 것 같음.
* 때문에 정확도가 1.0이 나온 두 모델을이 가장 적합하다고 판단.

## 평가 지표
* class0,1과 비교하면  class2의 수가 적어 불균형 데이터라고 판단함.
* 때문에 정확도가 좋은 평가 지표는 아니라고 판단함.