In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# Input CSV locations, one file per wine colour.
red_path = "wine-red.csv"
white_path = "wine-white.csv"

# Read each file and attach a placeholder 'label' column holding the string
# 'default' in every row.
red_wine = pd.read_csv(red_path).assign(label='default')
white_wine = pd.read_csv(white_path).assign(label='default')


In [3]:
def labelWine(threshhold, wine_type, data):
    """Write a combined colour/quality class into ``data['label']``.

    Label encoding:
        0 -- red,   quality below ``threshhold``
        1 -- red,   quality not below ``threshhold``
        2 -- white, quality below ``threshhold``
        3 -- white, quality not below ``threshhold``

    Parameters
    ----------
    threshhold : number
        Quality cut-off. Rows with ``quality < threshhold`` receive the lower
        label of the pair, every other row the higher one.
    wine_type : str
        ``"red"`` or ``"white"``. Any other value falls back to a base of -1
        (labels -1 / 0), so a label of 0 is then indistinguishable from the
        "red, below threshold" class.
    data : pandas.DataFrame
        Frame with a ``'quality'`` column. It is modified in place: the
        ``'label'`` column is created or overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``data``.
    """
    base_label = {"red": 0, "white": 2}.get(wine_type, -1)

    # One vectorised comparison instead of a Python-level loop over rows.
    # Negating "<" (rather than using ">=") keeps rows whose quality is NaN
    # in the upper class, as a row-by-row `quality < threshhold` test would.
    below_threshold = data['quality'] < threshhold

    # Replacing the whole column gives it an integer dtype regardless of what
    # placeholder values it held before.
    data['label'] = (~below_threshold).astype(int) + base_label
    return data


# Tag every row of both datasets, using a quality cut-off of 6.
red_wine = labelWine(6, "red", red_wine)
white_wine = labelWine(6, "white", white_wine)

# Stack the red rows on top of the white rows under a fresh 0..n-1 index,
# then discard the raw 'quality' column.
wine = pd.concat([red_wine, white_wine], ignore_index=True)
wine = wine.drop(columns=['quality'])
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,label
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,3
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,2
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,3
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,3


# Clustering the dataset into 4 clusters
- The goal is to have the data group itself into good white wine, good red wine, bad white wine and bad red wine
- good and bad wine will be determined by its quality rating

## Agglomerative Clustering

In [20]:
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
from sklearn.preprocessing import scale

# Separate the target column from the predictor columns.
features = wine.drop(columns=['label'])
labels = wine['label']
print(features.shape)


def scale_features(features):
    """Scale ``features`` row by row, i.e. on a per-sample basis.

    Passing ``axis=1`` to ``sklearn.preprocessing.scale`` makes it work along
    each row (one sample) instead of along each feature column.

    Returns the result of ``scale`` for the given input.
    """
    return scale(features, axis=1)


def _build_agglomerative(n_clust, distance, linkage_mode):
    """Create an AgglomerativeClustering estimator on old and new scikit-learn."""
    try:
        # Current keyword; `affinity` was deprecated in scikit-learn 1.2 and
        # removed in 1.4.
        return AgglomerativeClustering(n_clusters=n_clust, metric=distance,
                                       linkage=linkage_mode)
    except TypeError:
        # Releases that predate `metric` reject it as an unexpected keyword.
        return AgglomerativeClustering(n_clusters=n_clust, affinity=distance,
                                       linkage=linkage_mode)


def aggclust_initial(features, labels, n_clust):
    """Sweep linkage/distance combinations and score each clustering by ARI.

    Every pairing of linkage ('ward', 'complete', 'average', 'single') and
    distance ('euclidean', 'l1', 'l2') is fitted on ``features``. Ward linkage
    is only run with 'euclidean'; the other two pairings are skipped with a
    printed notice. One result line is printed per fitted model.

    Parameters
    ----------
    features : array-like
        Data handed to ``AgglomerativeClustering.fit``.
    labels : array-like
        Reference labels compared against the cluster assignments with
        ``metrics.adjusted_rand_score``.
    n_clust : int
        Number of clusters requested from each model.

    Returns
    -------
    dict
        Maps each linkage name, and each "<linkage> <distance>" pair, to a
        list of the ARI scores collected for it.
    """
    scores = {name: [] for name in ('ward', 'complete', 'average', 'single', 'ward euclidean',
                                    'complete euclidean', 'average euclidean', 'single euclidean',
                                    'complete l1', 'average l1', 'single l1',
                                    'complete l2', 'average l2', 'single l2')}
    for linkage_mode in ('ward', 'complete', 'average', 'single'):
        for affinity in ('euclidean', 'l1', 'l2'):
            if linkage_mode == 'ward' and affinity != 'euclidean':
                print('skip l1 and l2 for ward')
                continue
            clust = _build_agglomerative(n_clust, affinity, linkage_mode).fit(features)
            ARI_score = metrics.adjusted_rand_score(labels, clust.labels_)
            # Record the score under the specific pair and under the linkage
            # as a whole.
            scores[linkage_mode + ' ' + affinity].append(ARI_score)
            scores[linkage_mode].append(ARI_score)
            print(f'AHC Affinity={affinity:9} Linkage mode={linkage_mode:8} ARI score = {ARI_score:.5}')
    return scores
# Baseline sweep on the raw (unscaled) feature values.
scores = aggclust_initial(features, labels, n_clust=4)

# Repeat the same sweep on the scaled features.
sc_features = scale_features(features)
print("Now Scaled")
sc_scores = aggclust_initial(sc_features, labels, n_clust=4)


(6497, 11)
AHC Affinity=euclidean Linkage mode=ward     ARI score = 0.16255
skip l1 and l2 for ward
skip l1 and l2 for ward
AHC Affinity=euclidean Linkage mode=complete ARI score = 0.24452
AHC Affinity=l1        Linkage mode=complete ARI score = -0.013737
AHC Affinity=l2        Linkage mode=complete ARI score = 0.24452
AHC Affinity=euclidean Linkage mode=average  ARI score = 0.30738
AHC Affinity=l1        Linkage mode=average  ARI score = 0.00052832
AHC Affinity=l2        Linkage mode=average  ARI score = 0.30738
AHC Affinity=euclidean Linkage mode=single   ARI score = 0.0001167
AHC Affinity=l1        Linkage mode=single   ARI score = 0.0001167
AHC Affinity=l2        Linkage mode=single   ARI score = 0.0001167
Now Scaled
AHC Affinity=euclidean Linkage mode=ward     ARI score = 0.17357
skip l1 and l2 for ward
skip l1 and l2 for ward
AHC Affinity=euclidean Linkage mode=complete ARI score = 0.11276
AHC Affinity=l1        Linkage mode=complete ARI score = 0.22895
AHC Affinity=l2        Lin

### Results
- Scaling improved some linkage/affinity combinations (for example 'complete' linkage with 'l1' affinity), but the best overall result came from the non-scaled data using 'average' linkage with 'euclidean' (or, equivalently, 'l2') affinity.
- With the scaled data the best results were with 'complete' linkage and 'l1' affinity. 

The results can be found in the boxplot below

In [21]:
# Print every score bucket from the unscaled run: the key, then its ARI values.
for bucket_name, ari_values in scores.items():
    print(bucket_name, ari_values)

ward [0.16255084342472914]
complete [0.24452349182144464, -0.013737148766553683, 0.24452349182144464]
average [0.30737634461812696, 0.0005283195865163993, 0.30737634461812696]
single [0.00011669883762438619, 0.00011669883762438619, 0.00011669883762438619]
ward euclidean [0.16255084342472914]
complete euclidean [0.24452349182144464]
average euclidean [0.30737634461812696]
single euclidean [0.00011669883762438619]
complete l1 [-0.013737148766553683]
average l1 [0.0005283195865163993]
single l1 [0.00011669883762438619]
complete l2 [0.24452349182144464]
average l2 [0.30737634461812696]
single l2 [0.00011669883762438619]
