In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# Locations of the raw red and white wine datasets.
red_path = "wine-red.csv"
white_path = "wine-white.csv"

# Read each dataset and give it a placeholder 'label' column;
# the real labels are written later by labelWine.
red_wine = pd.read_csv(red_path).assign(label='default')
white_wine = pd.read_csv(white_path).assign(label='default')


In [2]:
def labelWine(threshhold, wine_type, data):
    """Label every wine in ``data`` as good or bad for its colour.

    The value written to the ``label`` column encodes colour and quality:

    * ``"red"``:   0 when ``quality < threshhold``, otherwise 1
    * ``"white"``: 2 when ``quality < threshhold``, otherwise 3

    Parameters
    ----------
    threshhold : number
        Quality value at or above which a wine gets the higher ("good") label.
    wine_type : str
        Either ``"red"`` or ``"white"``.
    data : pandas.DataFrame
        Frame with a ``quality`` column. It is modified in place: its
        ``label`` column is created or overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with an integer ``label`` column.

    Raises
    ------
    ValueError
        If ``wine_type`` is neither ``"red"`` nor ``"white"``. Without this
        check an unknown type would produce labels -1/0 and silently collide
        with the red-wine labels.
    """
    base_labels = {"red": 0, "white": 2}
    if wine_type not in base_labels:
        raise ValueError(
            f"wine_type must be 'red' or 'white', got {wine_type!r}"
        )
    base = base_labels[wine_type]

    # Vectorised replacement for the row-by-row loop. Negating "< threshhold"
    # (instead of testing ">=") keeps rows whose quality is missing on the
    # higher label, exactly as the original if/else did.
    below = data['quality'] < threshhold
    data['label'] = base + (~below).astype(int)

    return data


# Tag every red and every white wine as good or bad, using 6 as the quality threshold.
red_wine = labelWine(threshhold=6, wine_type="red", data=red_wine)
white_wine = labelWine(threshhold=6, wine_type="white", data=white_wine)

# Stack the two labelled datasets into one frame with a fresh consecutive index.
wine = pd.concat(objs=[red_wine, white_wine], ignore_index=True)

wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,label
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,3
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,2
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,3
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,3


# Clustering the dataset into 4 clusters  (KMeans)
- The goal is to have the data group itself into good white wine, good red wine, bad white wine and bad red wine
- good and bad wine will be determined by its quality rating

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn import metrics
from IPython.display import Markdown

from sklearn.preprocessing import scale

# Split the combined frame into ground-truth labels and clustering inputs.
# 'quality' is removed from the inputs (the label was derived from it).
# The keyword form of drop() is used because pandas >= 2.0 no longer accepts
# the axis as a positional argument.
temp = wine.drop(columns='quality')
labels = temp['label']
feature = temp.drop(columns='label')
feature_scaled = scale(feature)  # standardised copy of the feature columns

# NOTE(review): 'auto' and 'full' are only accepted by older scikit-learn
# releases; newer ones name the classic algorithm 'lloyd'. Confirm against
# the installed version before re-running.
algorithms = ('auto', 'full', 'elkan')

# ARI scores per algorithm, one entry per seed, for raw and scaled features.
scores = {name: [] for name in algorithms}
scores_scaled = {name: [] for name in algorithms}

# Fit KMeans with every algorithm/seed combination on both feature sets and
# compare the resulting clusters with the true labels.
for algorithm in algorithms:
    for seed in np.arange(0, 10):
        kmeans = KMeans(n_clusters=4, random_state=seed, algorithm=algorithm)
        kmeans_scaled = KMeans(n_clusters=4, random_state=seed, algorithm=algorithm)

        kmeans.fit(feature)
        kmeans_scaled.fit(feature_scaled)

        ARI_score = metrics.adjusted_rand_score(kmeans.labels_, labels)
        ARI_score_scaled = metrics.adjusted_rand_score(kmeans_scaled.labels_, labels)

        scores[algorithm].append(ARI_score)
        scores_scaled[algorithm].append(ARI_score_scaled)

        # Only the unscaled score is printed per run; the scaled scores are
        # summarised in the second table below.
        print(algorithm, seed, ARI_score)

# Cluster assignments of the last unscaled model fitted above ('elkan', seed 9).
# Kept under this name because the bar-chart cell further down reads `array`.
array = kmeans.labels_

# Median ARI per algorithm on the unscaled features.
median_auto = np.median(scores['auto'])
median_full = np.median(scores['full'])
median_elkan = np.median(scores['elkan'])

# Median ARI per algorithm on the scaled features.
median_auto_scaled = np.median(scores_scaled['auto'])
median_full_scaled = np.median(scores_scaled['full'])
median_elkan_scaled = np.median(scores_scaled['elkan'])

# Render both summaries as markdown tables.
display(Markdown(
f'|[]()|Median scores|\n{"|---"*2}|\n'
f'|**auto**|{median_auto:.4}|\n'
f'|**full**|{median_full:.4}|\n'
f'|**elkan**|{median_elkan:.4}|\n'))

display(Markdown(
f'|[]()|Median scores scaled|\n{"|---"*2}|\n'
f'|**auto**|{median_auto_scaled:.4}|\n'
f'|**full**|{median_full_scaled:.4}|\n'
f'|**elkan**|{median_elkan_scaled:.4}|\n'))
        
        
        


auto 0 0.1790886139178354
auto 1 0.17916415799812466
auto 2 0.1792577499620072
auto 3 0.17901233887814302
auto 4 0.17902706339533264
auto 5 0.17919798644423574
auto 6 0.17892223680057928
auto 7 0.17919798644423574
auto 8 0.1795126908651767
auto 9 0.1792795183615587
full 0 0.1790886139178354
full 1 0.17916415799812466
full 2 0.1792577499620072


## results
- the algorithm selected for KMeans does not appear to have an impact on the result, as shown by the tables above
- the medians of the ARI scores over seeds 0-9 are 0.1792 when the features are unscaled
- after scaling the features we achieved a better median ARI score of 0.3223

In [None]:
# Bar chart of how many points fall into each of the four clusters.

# Number of entries in `array` equal to each cluster id 0..3.
zero, one, two, three = (
    np.count_nonzero(array == cluster_id) for cluster_id in range(4)
)

clusters = ['1', '2', '3', '4']
count = [zero, one, two, three]

# Single set of axes spanning the whole figure, one bar per cluster.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(clusters, count)

plt.show()


## bar graph
- this is a bar graph of the distribution of points across the clusters
- the clusters are supposed to represent good white wines, bad white wines, good red wines and bad red wines
- from the bar graph and data alone, however, we do not know what each cluster represents


# DBSCAN
- test different eps values with different min_samples values and plot the results
- find the combination that provides the highest score 
- use scaled data to improve results

In [1]:
from sklearn.cluster import DBSCAN


# Coordinates of every tested combination, for the 3D scatter plot:
# xs = eps, ys = min_samples, zs = resulting ARI score.
xs = []
ys = []
zs = []
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Best combination seen so far. Starting from -inf (rather than 0) guarantees
# that a real eps/min_samples pair is reported even if no score is positive.
max_value = float("-inf")
max_eps = -5
samples_ = -5

# Feature columns handed to DBSCAN, built once instead of on every iteration.
# The keyword form of drop() is used because pandas >= 2.0 no longer accepts
# the axis as a positional argument.
# NOTE(review): these are the unscaled features, although the surrounding
# text talks about scaled data. Confirm which one is intended.
dbscan_features = temp.drop(columns='label')

# Try every eps / min_samples combination and remember the best ARI score.
for eps in range(2, 16, 2):
    for samples in range(150, 250, 10):

        xs.append(eps)
        ys.append(samples)

        db = DBSCAN(eps=eps, min_samples=samples)
        db.fit(dbscan_features)

        ARI_score = metrics.adjusted_rand_score(db.labels_, labels)
        print(ARI_score)
        zs.append(ARI_score)
        if ARI_score > max_value:
            max_value = ARI_score
            max_eps = eps
            samples_ = samples


print("max value is:", max_value)
print("eps: ", max_eps)
print("min_samples: ", samples_)


# 3D scatter of the ARI score for each eps / min_samples combination.
ax.scatter(xs, ys, zs)
ax.set_xlabel('eps')
ax.set_ylabel('min_samples')
ax.set_zlabel('score')

plt.show()


NameError: name 'plt' is not defined

## Results
- the data was tested using multiple eps and min_samples values
- the highest ARI score we achieved was 0.235 with eps = 10 and min_samples = 200
- the test was meant to use scaled feature values; note that the code above passes the unscaled feature columns to DBSCAN