In [None]:
import numpy as np

import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import datasets

In [None]:
# Load scikit-learn's bundled wine dataset. Later cells read its `.DESCR`,
# `.data`, `.target` and `.feature_names` attributes.
wine_ds = datasets.load_wine()

In [None]:
# Show the description text that ships with the dataset object.
print(wine_ds.DESCR)

In [None]:
# Feature matrix and class labels taken from the loaded dataset object.
X, y = wine_ds.data, wine_ds.target

# Distinct label values together with the number of samples carrying each.
unique, counts = np.unique(y, return_counts=True)

print("X.shape: ", X.shape)
print("No of classes in y: ", set(y))
print(
    "Each class label counts: ",
    {label: count for label, count in zip(unique, counts)},
)

## Clustering


### 1. K-means 

In [None]:
from sklearn.cluster import KMeans

# Three clusters, randomly initialised centroids, n_init=10 initialisations;
# the fixed random_state keeps the run repeatable.
wine_km = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=42,
)

# Fit on the feature matrix and keep the cluster id assigned to every sample.
wine_km_y_pred = wine_km.fit(X).labels_
wine_km_y_pred

In [None]:
# Cross-tabulate the true classes (rows) against the K-means cluster ids (columns).
wine_km_confMatrix = confusion_matrix(y, wine_km_y_pred)
wine_km_confMatrix

In [None]:
from scipy.optimize import linear_sum_assignment

# K-means cluster ids are arbitrary (cluster 0 need not correspond to class 0),
# so scoring them directly against y makes the metrics depend on whichever
# numbering the algorithm happened to produce. Align the ids with the true
# classes first: build the contingency table and pick the one-to-one
# cluster -> class assignment that maximises the number of agreeing samples.
km_label_values = np.unique(np.concatenate([y, wine_km_y_pred]))
km_contingency = confusion_matrix(y_true=y, y_pred=wine_km_y_pred, labels=km_label_values)
km_class_rows, km_cluster_cols = linear_sum_assignment(km_contingency, maximize=True)
km_cluster_to_class = dict(zip(km_label_values[km_cluster_cols], km_label_values[km_class_rows]))

# Cluster ids rewritten as the class each cluster was matched to; the original
# wine_km_y_pred is left untouched so this cell stays re-runnable.
wine_km_y_aligned = np.array([km_cluster_to_class[cluster_id] for cluster_id in wine_km_y_pred])

for metric_name, metric_fn in (('Precision', precision_score),
                               ('Recall', recall_score),
                               ('F1', f1_score)):
    print('%s: %.3f' % (metric_name, metric_fn(y_true=y, y_pred=wine_km_y_aligned, average="weighted")))

### 2. Agglomerative clustering 

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Complete-linkage agglomerative clustering into three clusters.
# The distance is deliberately left at its default (Euclidean): the old
# `affinity=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# in favour of `metric=`, so naming either keyword fails on some versions,
# whereas relying on the default works on all of them.
wine_ac = AgglomerativeClustering(n_clusters=3, linkage='complete')

# Fit on the feature matrix and keep the cluster id assigned to every sample.
wine_ac_y_pred = wine_ac.fit_predict(X)
wine_ac_y_pred

In [None]:
# Cross-tabulate the true classes (rows) against the agglomerative cluster ids (columns).
wine_ac_confMatrix = confusion_matrix(y, wine_ac_y_pred)
wine_ac_confMatrix

In [None]:
from scipy.optimize import linear_sum_assignment

# Cluster ids from agglomerative clustering are arbitrary (cluster 0 need not
# correspond to class 0), so scoring them directly against y makes the metrics
# depend on the numbering the algorithm happened to produce. Align the ids
# with the true classes first: build the contingency table and pick the
# one-to-one cluster -> class assignment that maximises agreeing samples.
ac_label_values = np.unique(np.concatenate([y, wine_ac_y_pred]))
ac_contingency = confusion_matrix(y_true=y, y_pred=wine_ac_y_pred, labels=ac_label_values)
ac_class_rows, ac_cluster_cols = linear_sum_assignment(ac_contingency, maximize=True)
ac_cluster_to_class = dict(zip(ac_label_values[ac_cluster_cols], ac_label_values[ac_class_rows]))

# Cluster ids rewritten as the class each cluster was matched to; the original
# wine_ac_y_pred is left untouched so this cell stays re-runnable.
wine_ac_y_aligned = np.array([ac_cluster_to_class[cluster_id] for cluster_id in wine_ac_y_pred])

for metric_name, metric_fn in (('Precision', precision_score),
                               ('Recall', recall_score),
                               ('F1', f1_score)):
    print('%s: %.3f' % (metric_name, metric_fn(y_true=y, y_pred=wine_ac_y_aligned, average="weighted")))

### 3. Hierarchical clustering (dendrogram)

#### SciPy method

In [None]:
# One string identifier per sample row: id_0, id_1, ...
labels = ['id_%d' % row for row in range(X.shape[0])]
len(labels)

In [None]:
# Wrap the feature matrix in a DataFrame with named columns and per-sample ids.
df = pd.DataFrame(
    data=X,
    index=labels,
    columns=wine_ds.feature_names,
)
df.head(5)

In [None]:
from scipy.spatial.distance import pdist, squareform

# Square, labelled matrix of pairwise Euclidean distances between samples:
# pdist yields the condensed form, squareform expands it to a full matrix.
row_dist = pd.DataFrame(
    data=squareform(pdist(df, metric='euclidean')),
    index=labels,
    columns=labels,
)
# row_dist  (uncomment to display the full matrix)

In [None]:
from scipy.cluster.hierarchy import linkage

# Complete-linkage merge history built from the condensed Euclidean distances.
row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')

# Tabulate the linkage output: one row per merge, numbered from 1.
cluster_df = pd.DataFrame(
    data=row_clusters,
    index=['cluster %d' % step for step in range(1, row_clusters.shape[0] + 1)],
    columns=['row label 1', 'row label 2', 'distance', 'no. of items in clust.'],
)
cluster_df.head()

In [None]:
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt
%matplotlib inline

row_dendr = dendrogram(row_clusters, labels= labels)

plt.tight_layout()
plt.ylabel('Euclidean distance')
plt.show()

### Classification model:  Logistic regression 

In [None]:
from sklearn.linear_model import LogisticRegression

# The features are fed in unscaled, and lbfgs with the default iteration cap
# (max_iter=100) stops before converging on them, which scikit-learn reports
# as a ConvergenceWarning. Raise the cap so the solver can run to convergence.
wine_lr = LogisticRegression(random_state=42, solver="lbfgs", max_iter=10000)

# NOTE: the model is fitted and then asked to predict the very same samples,
# so everything derived from wine_lr_y_pred is an in-sample (optimistic) result.
wine_lr.fit(X, y)
wine_lr_y_pred = wine_lr.predict(X)
wine_lr_y_pred

In [None]:
# Cross-tabulate the true classes (rows) against the logistic-regression predictions (columns).
wine_lr_confMatrix = confusion_matrix(y, wine_lr_y_pred)
wine_lr_confMatrix

In [None]:
# Support-weighted precision, recall and F1 of the logistic-regression predictions.
for metric_name, metric_fn in (('Precision', precision_score),
                               ('Recall', recall_score),
                               ('F1', f1_score)):
    print('%s: %.3f' % (metric_name, metric_fn(y_true=y, y_pred=wine_lr_y_pred, average="weighted")))