# Wine dataset with unsupervised learning

#### Balazs Balogh - 2019

In [1]:
# Import the necessary packages

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Load the wine data straight from the workshop repository and preview five random rows.
# The "class_label" and "class_name" columns hold the true wine variety; they are set
# aside right after loading so that the task becomes an unsupervised learning problem.

WINE_CSV_URL = "https://raw.githubusercontent.com/budapestpy-workshops/sample_files/master/wine.csv"
wine_dataset = pd.read_csv(WINE_CSV_URL)

wine_dataset.sample(5)

Unnamed: 0,class_label,class_name,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
65,2,Grignolino,12.37,1.21,2.56,18.1,98,2.42,2.65,0.37,2.08,4.6,1.19,2.3,678
135,3,Barbera,12.6,2.46,2.2,18.5,94,1.62,0.66,0.63,0.94,7.1,0.73,1.58,695
56,1,Barolo,14.22,1.7,2.3,16.3,118,3.2,3.0,0.26,2.03,6.38,0.94,3.31,970
80,2,Grignolino,12.0,0.92,2.0,19.0,86,2.42,2.26,0.3,1.43,2.5,1.38,3.12,278
104,2,Grignolino,12.51,1.73,1.98,20.5,85,2.2,1.92,0.32,1.48,2.94,1.04,3.57,672


In [3]:
# Split the ground-truth columns (class_label, class_name) away from the 13 numeric
# features. Columns are selected by name rather than by position, so the split keeps
# working even if the column order of the CSV ever changes.

wine = wine_dataset.drop(columns=['class_label', 'class_name'])
wine_labels_num = wine_dataset[['class_label']]    # one-column DataFrame
wine_labels_name = wine_dataset[['class_name']]    # one-column DataFrame

# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print('Wine dataset without the class_name and class_label columns:\n')
wine.info()
print()

print('class_labels and class_name columns, excluded from the original dataset:\n')
wine_labels_num.info()
print()
wine_labels_name.info()
print()

print('The three classes of wine:')
print(wine_dataset['class_name'].unique())

Wine dataset without the class_name and class_label columns:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
alcohol                 178 non-null float64
malic_acid              178 non-null float64
ash                     178 non-null float64
alcalinity_of_ash       178 non-null float64
magnesium               178 non-null int64
total_phenols           178 non-null float64
flavanoids              178 non-null float64
nonflavanoid_phenols    178 non-null float64
proanthocyanins         178 non-null float64
color_intensity         178 non-null float64
hue                     178 non-null float64
od280                   178 non-null float64
proline                 178 non-null int64
dtypes: float64(11), int64(2)
memory usage: 18.2 KB
None 

class_labels and class_name columns, excluded from the original dataset:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 1 columns):
class_label    

In [4]:
# Cluster the raw (unscaled) features with KMeans. The number of clusters has to be
# given up front; 3 is used because the dataset contains three classes of wine.
# K-means starts from randomly chosen centroids, so random_state is pinned to make the
# cluster ids identical on every "Restart & Run All". n_init is set explicitly because
# its default value differs between scikit-learn releases.

model = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = model.fit_predict(wine)  # one cluster id (0, 1 or 2) per row -> 178 values

labels

array([2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0,
       2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0,
       0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 2, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1])

In [5]:
# Compare the cluster ids with the true wine names using pandas' crosstab, which by
# default computes a frequency table of two (or more) factors: one row per cluster id,
# one column per class_name, each cell counting the wines in that combination.
# Cluster ids are arbitrary numbers, so read the table row by row: in the recorded
# output only one cluster is nearly pure (almost all Barolo), while the other two
# clusters mix several varieties.

ct = pd.crosstab(index=labels, columns=wine_dataset['class_name'])

ct

class_name,Barbera,Barolo,Grignolino
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,29,13,20
1,19,0,50
2,0,46,1


In [6]:
# The poor result comes from the big differences in variance between the features:
# compare proline (~99,000) or magnesium (~200) with hue or ash (below 0.1).
# K-means groups samples by distance, so the large-valued columns dominate the outcome.
# The dataset needs to be scaled before clustering.

wine.var()

alcohol                     0.659062
malic_acid                  1.248015
ash                         0.075265
alcalinity_of_ash          11.152686
magnesium                 203.989335
total_phenols               0.391690
flavanoids                  0.997719
nonflavanoid_phenols        0.015489
proanthocyanins             0.327595
color_intensity             5.374449
hue                         0.052245
od280                       0.504086
proline                 99166.717355
dtype: float64

In [7]:
# StandardScaler standardizes each feature: it subtracts the feature's mean and then
# divides by its standard deviation, which scales the feature to unit variance.
# fit_transform learns the per-column statistics and applies them in a single call.

scaler = StandardScaler()
samples_scaled = scaler.fit_transform(wine)

In [8]:
# Wrap the scaled values in a DataFrame that reuses the original row index and column
# names, so the new values can be inspected side by side with the unscaled ones.

scaled_features_df = pd.DataFrame(
    data=samples_scaled,
    columns=wine.columns,
    index=wine.index,
)

scaled_features_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


In [9]:
# For comparison with the scaled table: the first rows of the original, unscaled features.

wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [10]:
# Practice the pipeline: first scale the values, then let KMeans do the clustering.
# The pipeline gets its own, fresh StandardScaler and KMeans instances. make_pipeline
# does not copy the estimators it is given, so passing the already fitted `scaler` and
# `model` objects would refit them in place and silently overwrite the KMeans fit on
# the unscaled data. random_state keeps the cluster ids reproducible between runs, and
# n_init is set explicitly because its default differs between scikit-learn releases.

pipeline = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=3, n_init=10, random_state=42),
)

labels_pipeline = pipeline.fit_predict(wine)

In [11]:
# Cross-tabulate the pipeline's cluster ids against the true wine names and show both
# tables. In the recorded output, with StandardScaler each cluster lines up with a
# single variety almost perfectly, a clear improvement over the unscaled run.
# Each table is printed with its own print() call: passing the title, '\n' and the
# table to a single call makes print() insert its separator space after the newline,
# which shifts the table's header row one column to the right.

ct_scaled = pd.crosstab(labels_pipeline, wine_dataset['class_name'])

print('Without scaling:')
print(ct)
print()
print('With StandardScaler:')
print(ct_scaled)

Without scaling: 
 class_name  Barbera  Barolo  Grignolino
row_0                                  
0                29      13          20
1                19       0          50
2                 0      46           1 

With StandardScaler: 
 class_name  Barbera  Barolo  Grignolino
row_0                                  
0                 0       0          65
1                 0      59           3
2                48       0           3
