# Wine dataset with unsupervised learning

#### Balazs Balogh - 2019

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Source of the wine data: a CSV hosted in the budapestpy-workshops sample-files repo.
WINE_CSV_URL = "https://raw.githubusercontent.com/budapestpy-workshops/sample_files/master/wine.csv"
wine_dataset = pd.read_csv(WINE_CSV_URL)

# Preview five randomly chosen rows.
wine_dataset.sample(n=5)

Unnamed: 0,class_label,class_name,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
128,2,Grignolino,12.37,1.63,2.3,24.5,88,2.22,2.45,0.4,1.9,2.12,0.89,2.78,342
158,3,Barbera,14.34,1.68,2.7,25.0,98,2.8,1.31,0.53,2.7,13.0,0.57,1.96,660
36,1,Barolo,13.28,1.64,2.84,15.5,110,2.6,2.68,0.34,1.36,4.6,1.09,2.78,880
156,3,Barbera,13.84,4.12,2.38,19.5,89,1.8,0.83,0.48,1.56,9.01,0.57,1.64,480
131,3,Barbera,12.88,2.99,2.4,20.0,104,1.3,1.22,0.24,0.83,5.4,0.74,1.42,530


In [3]:
# Split the dataset into the 13 numeric feature columns and the two label columns.
# Columns are selected by name rather than by position so the split does not
# silently break if the column order of the CSV ever changes.
LABEL_COLUMNS = ['class_label', 'class_name']

wine = wine_dataset.drop(columns=LABEL_COLUMNS)
wine_labels_num = wine_dataset[['class_label']]
wine_labels_name = wine_dataset[['class_name']]

# DataFrame.info() prints its summary itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None"). A bare print() adds the
# blank separator line between the summaries.
wine.info()
print()
wine_labels_num.info()
print()
wine_labels_name.info()
print()
print(wine_dataset['class_name'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
alcohol                 178 non-null float64
malic_acid              178 non-null float64
ash                     178 non-null float64
alcalinity_of_ash       178 non-null float64
magnesium               178 non-null int64
total_phenols           178 non-null float64
flavanoids              178 non-null float64
nonflavanoid_phenols    178 non-null float64
proanthocyanins         178 non-null float64
color_intensity         178 non-null float64
hue                     178 non-null float64
od280                   178 non-null float64
proline                 178 non-null int64
dtypes: float64(11), int64(2)
memory usage: 18.2 KB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 1 columns):
class_label    178 non-null int64
dtypes: int64(1)
memory usage: 1.5 KB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177


In [4]:
# Cluster the raw (unscaled) features into three groups.
# random_state makes the centroid initialisation - and therefore the cluster ids
# and assignments - reproducible across "Restart & Run All"; n_init is pinned
# explicitly because its default differs between scikit-learn versions.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = model.fit_predict(wine)  # one cluster id (0, 1 or 2) per row, 178 rows

labels

array([1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 0, 0, 2,
       0, 0, 2, 2, 2, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 0, 0, 2,
       2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0])

In [5]:
# crosstab is a kind of pivot table. `labels` is a plain numpy array, so the
# counterpart it is tabulated against is simply the class_name column of the
# original CSV.
#
# In the recorded output, cluster 1 maps almost perfectly onto a single wine
# class, while clusters 0 and 2 mix several classes. (Cluster ids are arbitrary
# and may be numbered differently in another run.)

ct = pd.crosstab(index=labels, columns=wine_dataset['class_name'])

ct

class_name,Barbera,Barolo,Grignolino
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,19,0,50
1,0,46,1
2,29,13,20


In [6]:
# The reason is the huge difference in variance between the features,
# see: magnesium, proline, alcalinity_of_ash.
# The features need to be scaled.

wine.var(axis=0, ddof=1)

alcohol                     0.659062
malic_acid                  1.248015
ash                         0.075265
alcalinity_of_ash          11.152686
magnesium                 203.989335
total_phenols               0.391690
flavanoids                  0.997719
nonflavanoid_phenols        0.015489
proanthocyanins             0.327595
color_intensity             5.374449
hue                         0.052245
od280                       0.504086
proline                 99166.717355
dtype: float64

In [7]:
# StandardScaler standardises every feature to zero mean and unit variance, so
# that all features are on a comparable scale. (It does NOT squeeze the values
# into the 0-1 range; that is what MinMaxScaler does.)

scaler = StandardScaler()

# fit_transform learns the per-feature mean/std and applies them in one step;
# `scaler` is left fitted exactly as with a separate fit() + transform().
samples_scaled = scaler.fit_transform(wine)

In [8]:
# A pipeline chains scaling and clustering into a single estimator, which
# simplifies the workflow.
# Fresh estimators are created here instead of reusing the already fitted
# `scaler` and `model`: Pipeline.fit_predict refits its steps in place, which
# would silently overwrite `model`, so it would no longer correspond to the
# `labels` computed on the unscaled data.
# random_state / n_init are fixed so the clustering is reproducible on re-run.

pipeline = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=3, n_init=10, random_state=42),
)

labels_pipeline = pipeline.fit_predict(wine)

In [9]:
# The comparison shows how much a plain StandardScaler - i.e. bringing all
# features onto a common scale - improves the clustering.

ct_scaled = pd.crosstab(index=labels_pipeline, columns=wine_dataset['class_name'])

# First the table for the unscaled features, then the one for the scaled features.
print('Scale nélkül:', '\n', ct, '\n')
print('StandardScale:', '\n', ct_scaled)

Scale nélkül: 
 class_name  Barbera  Barolo  Grignolino
row_0                                  
0                19       0          50
1                 0      46           1
2                29      13          20 

StandardScale: 
 class_name  Barbera  Barolo  Grignolino
row_0                                  
0                 0      59           3
1                48       0           3
2                 0       0          65
