In [1]:
import numpy as np

import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import datasets

In [2]:
wine_ds = datasets.load_wine()
# print(wine_ds.DESCR)

In [3]:
X = wine_ds.data
y = wine_ds.target

unique, counts = np.unique(y, return_counts=True)

print("X.shape: ", X.shape)
print("No of classes in y: ", set(y))
print("Each class label counts: ", dict(zip(unique, counts)))

X.shape:  (178, 13)
No of classes in y:  {0, 1, 2}
Each class label counts:  {0: 59, 1: 71, 2: 48}


## Clustering


### 1. K-means 

In [4]:
from sklearn.cluster import KMeans

wine_km = KMeans(n_clusters=3, 
            init='random', 
            n_init=10, 
            max_iter=300,
            tol=1e-04,
            random_state=42)

wine_km_y_pred = wine_km.fit_predict(X)
wine_km_y_pred

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 2, 0, 2, 2, 0,
       2, 2, 0, 0, 0, 2, 2, 1, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2,
       0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 0,
       0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0,
       0, 2], dtype=int32)

In [5]:
wine_km_confMatrix = confusion_matrix(y_true = y, y_pred = wine_km_y_pred)
wine_km_confMatrix

array([[13, 46,  0],
       [20,  1, 50],
       [29,  0, 19]])

In [6]:
print('Precision: %.3f' % precision_score(y_true=y, y_pred=wine_km_y_pred, average = "weighted"))
print('Recall: %.3f' % recall_score(y_true=y, y_pred=wine_km_y_pred, average = "weighted"))
print('F1: %.3f' % f1_score(y_true=y, y_pred=wine_km_y_pred, average = "weighted"))

Precision: 0.152
Recall: 0.185
F1: 0.166


### 2. Agglomerative clustering 

In [7]:
from sklearn.cluster import AgglomerativeClustering

wine_ac = AgglomerativeClustering(n_clusters=3, 
                             affinity='euclidean', 
                             linkage='complete')
wine_ac_y_pred = wine_ac.fit_predict(X)
wine_ac_y_pred

array([0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2,
       0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2,
       2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 2, 1, 1, 1, 1, 2,
       1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2,
       1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2,
       2, 1])

In [8]:
wine_ac_confMatrix = confusion_matrix(y_true = y, y_pred = wine_ac_y_pred)
wine_ac_confMatrix

array([[43,  0, 16],
       [ 0, 56, 15],
       [ 0, 27, 21]])

In [9]:
print('Precision: %.3f' % precision_score(y_true = y, y_pred = wine_ac_y_pred, average = "weighted"))
print('Recall: %.3f' % recall_score(y_true=y, y_pred=wine_ac_y_pred, average = "weighted"))
print('F1: %.3f' % f1_score(y_true=y, y_pred=wine_ac_y_pred, average = "weighted"))

Precision: 0.709
Recall: 0.674
F1: 0.683


### Comparison with classification model:  Logistic regression 

In [10]:
from sklearn.linear_model import LogisticRegression

wine_lr = LogisticRegression(random_state = 42, solver = "lbfgs")

wine_lr.fit(X,y)
wine_lr_y_pred = wine_lr.predict(X)
wine_lr_y_pred



array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [11]:
wine_lr_confMatrix = confusion_matrix(y_true = y, y_pred = wine_lr_y_pred)
wine_lr_confMatrix

array([[57,  2,  0],
       [ 0, 69,  2],
       [ 0,  1, 47]])

In [12]:
print('Precision: %.3f' % precision_score(y_true = y, y_pred = wine_lr_y_pred, average = "weighted"))
print('Recall: %.3f' % recall_score(y_true=y, y_pred=wine_lr_y_pred, average = "weighted"))
print('F1: %.3f' % f1_score(y_true=y, y_pred=wine_lr_y_pred, average = "weighted"))

Precision: 0.972
Recall: 0.972
F1: 0.972
