In [1]:
from sklearn.datasets import load_wine
import matplotlib.pyplot as plt
import numpy as np

# Load the bundled wine dataset and print the description text that ships with it.
wine = load_wine()
print(wine['DESCR'])

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [2]:
# Pull the feature matrix and the label vector out of the dataset container.
X, y = wine.data, wine.target

In [3]:
# Display the dimensions of the feature matrix and of the label vector.
np.shape(X), np.shape(y)

((178, 13), (178,))

In [5]:
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from sklearn import ensemble

In [6]:
# Reload the dataset through the `datasets` namespace and list its attributes.
wine = datasets.load_wine()
print('Dataset structure= ', dir(wine))

# One row per sample, one column per feature name supplied by the dataset.
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df['target'] = wine.target
# Each numeric label is used as an index into `target_names` to get the
# corresponding class name.
df['wine_class'] = df['target'].map(lambda label: wine.target_names[label])

print('Unique target values=', df['target'].unique())

df.head()

Dataset structure=  ['DESCR', 'data', 'feature_names', 'target', 'target_names']
Unique target values= [0 1 2]


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,wine_class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0,class_0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0,class_0


In [10]:
# First three rows whose target label is 0 (class_0)
df.loc[df['target'] == 0].head(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,wine_class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0,class_0


In [11]:
# First three rows whose target label is 1 (class_1)
df.loc[df['target'] == 1].head(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,wine_class
59,12.37,0.94,1.36,10.6,88.0,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520.0,1,class_1
60,12.33,1.1,2.28,16.0,101.0,2.05,1.09,0.63,0.41,3.27,1.25,1.67,680.0,1,class_1
61,12.64,1.36,2.02,16.8,100.0,2.02,1.41,0.53,0.62,5.75,0.98,1.59,450.0,1,class_1


In [12]:
# First three rows whose target label is 2 (class_2)
df.loc[df['target'] == 2].head(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,wine_class
130,12.86,1.35,2.32,18.0,122.0,1.51,1.25,0.21,0.94,4.1,0.76,1.29,630.0,2,class_2
131,12.88,2.99,2.4,20.0,104.0,1.3,1.22,0.24,0.83,5.4,0.74,1.42,530.0,2,class_2
132,12.81,2.31,2.4,24.0,98.0,1.15,1.09,0.27,0.83,5.7,0.66,1.36,560.0,2,class_2


In [13]:
# Feature table: the 13 measurement columns, selected explicitly by name.
X = df.loc[:, [
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280/od315_of_diluted_wines',
    'proline',
]]
# Labels are kept as a single-column DataFrame, so the reported shape is 2-D.
y = df.loc[:, ['target']]

print('X shape=', X.shape)
print('y shape=', y.shape)

X shape= (178, 13)
y shape= (178, 1)


In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Report the shape of every piece of the split.
print('X_train dimension= ', X_train.shape)
print('X_test dimension= ', X_test.shape)
print('y_train dimension= ', y_train.shape)
# This line reports the held-out labels, so it is labelled y_test.
print('y_test dimension= ', y_test.shape)

X_train dimension=  (142, 13)
X_test dimension=  (36, 13)
y_train dimension=  (142, 1)
y_train dimension=  (36, 1)


In [16]:
# Random forest with a fixed seed; every other hyper-parameter is left at its default.
rfc = ensemble.RandomForestClassifier(random_state=1)
# y_train is a single-column DataFrame, so flatten it to a 1-D array for fitting.
rfc.fit(X_train, y_train.to_numpy().ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [17]:
# Spot-check the fitted model on two individual rows of the test split.
# NOTE: `iloc` positions are zero-based, so position 10 is the 11th row of
# X_test; the printed wording ("10th", "30th") is kept as it was.
for position in (10, 30):
    # y_test has a single column, so (position, 0) yields the scalar label.
    actual_name = wine.target_names[y_test.iloc[position, 0]]
    # Selecting with a list keeps a one-row DataFrame, so the classifier
    # receives the same column names it was fitted with.
    predicted_label = rfc.predict(X_test.iloc[[position]])[0]
    predicted_name = wine.target_names[predicted_label]
    print(f'Actual Wine type for {position}th test data sample= ', actual_name)
    print(f'Wine type prediction for {position}th test data sample= ', predicted_name)

Actual Wine type for 10th test data sample=  class_0
Wine type prediction for 10th test data sample=  class_0
Actual Wine type for 30th test data sample=  class_1
Wine type prediction for 30th test data sample=  class_1


In [18]:
# Score the fitted classifier on the held-out test split
# (for scikit-learn classifiers `score` reports mean accuracy).
rfc.score(X_test, y_test)

0.9722222222222222

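# The goal of this assignment is to classify wines into three classes based on the available data.
# Classification uses 13 measurements of various constituents found in the three kinds of wine.
# The dataset has 178 samples in total (59, 71 and 48 per class), split into 142 training and 36 test samples.
# The model was trained with the Random Forest classification algorithm.
# The wine class was predicted for the test samples at positions 10 and 30.
# The random forest achieved high accuracy (about 0.97 on the test set).
# Another advantage is that no iterative training was required.
# Good results can be obtained with only a minimal amount of code.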