# K-Means and Hierarchical clustering

### Libraries

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.cluster.hierarchy as sch
from sklearn.cluster import KMeans

## Data

Data set: **`Animals with Attributes`** contains information about 50 animals.  
For each, it has 85 real-valued features that capture various properties of the animal: where it lives, what it eats, and so on.  
You can download the data set from:  http://attributes.kyb.tuebingen.mpg.de

In [None]:
# Locate the two input files under ../../_data. `grep -1` prints one line of
# context around each match, so neighbouring paths are listed as well.
!find ../../_data | grep -1 classes.txt
!find ../../_data | grep -1 predicate-matrix-continuous.txt

Load in the data set. The file `'classes.txt'` contains the names of the 50 animals. The file `'predicate-matrix-continuous.txt'` contains the data itself: 85 attributes per animal.

In [None]:
# Peek at the first two lines of the attribute matrix to check its layout before parsing.
!head -2 ../../_data/Animals_with_Attributes/predicate-matrix-continuous.txt

### Load txt matrix

In [None]:
# Parse the attribute file with np.loadtxt's defaults (whitespace-separated floats),
# then echo the array dimensions as the cell output.
X = np.loadtxt(fname='../../_data/Animals_with_Attributes/predicate-matrix-continuous.txt')
X.shape

In [None]:
# Preview the first lines of the class-name file (head prints ten lines by default).
!head '../../_data/Animals_with_Attributes/classes.txt'

In [None]:
# Read the tab-separated class list: there is no header row and the first field
# becomes the index, so the remaining field ends up in the column labelled 1.
df = pd.read_csv(
    '../../_data/Animals_with_Attributes/classes.txt',
    sep='\t',
    header=None,
    index_col=0,
)

In [None]:
# Pull out column 1 (the class names) as a Series; it is reused later to label
# cluster members. A random sample of five entries is shown as a sanity check.
df_classes = df.loc[:, 1]
df_classes.sample(n=5)

## K-means clustering

We now run Lloyd's algorithm to obtain a flat clustering of the data. In the code below, we ask for k=8 clusters, but you should experiment with other choices.

We ask for random initialization, which means that different runs of the algorithm will potentially have different outcomes. It is worth running the algorithm several times to see how the results change.

### Train K-means model

In [None]:
# Number of flat clusters requested from Lloyd's algorithm.
k = 8

# Build the estimator, then fit it in place. `fit` returns the estimator itself,
# so `kmeans` is the fitted model; the trailing `;` keeps the cell silent.
kmeans = KMeans(init='random', n_clusters=k)
kmeans.fit(X);

### Cluster distribution and labels

In [None]:
from collections import Counter

In [None]:
# Show both things the section heading promises. A bare expression that is not
# the last line of a cell is evaluated and thrown away, so the label array was
# never displayed; returning a tuple makes the cell render the per-sample labels
# together with the cluster-size tally.
kmeans.labels_, Counter(kmeans.labels_)

### Clusters by dictionary - cluster:[values,...]

In [None]:
# Group the class names by assigned cluster id. Keys appear in order of first
# occurrence in `kmeans.labels_`, and each value lists that cluster's members.
# A plain loop replaces the side-effect-only list comprehension, which built and
# discarded a list of None values just to call `append`.
clusters = {}
for cluster, label in zip(kmeans.labels_, df_classes):
    clusters.setdefault(cluster, []).append(label)
clusters

### Sum of squared distances to closest cluster center

In [None]:
plt.figure(figsize=(6,6))
# Global rc settings: keep only the left and bottom spines, draw x ticks in black.
mpl.rc('axes.spines', left=True, top=False, right=False, bottom=True)
mpl.rc('xtick', color='k')

# Elbow curve on the raw features: inertia of a freshly fitted model for each
# cluster count 1..17. The comprehension keeps the sweep variables local, so the
# notebook-level `k` and the fitted `kmeans` model from the earlier cell are no
# longer overwritten by the last (17-cluster) iteration.
cluster_counts = range(1, 18)
dist_cluster = [
    KMeans(n_clusters=n_clusters, init='random').fit(X).inertia_
    for n_clusters in cluster_counts
]

plt.plot(cluster_counts, dist_cluster, '-o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (sum of squared distances to closest center)');

In [None]:
# Fit two reference models on the raw features, in the same order as before:
# first 6 clusters, then 14.
km6, km14 = [KMeans(n_clusters=n, init='random').fit(X) for n in (6, 14)]

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# One panel per model. Only the first two columns of X are plotted; points are
# coloured by cluster label and the fitted centres are overlaid as black '+' marks.
panels = (
    (ax1, km6, 'K-Means Clustering Results with K=6'),
    (ax2, km14, 'K-Means Clustering Results with K=14'),
)
for axis, model, heading in panels:
    axis.scatter(X[:, 0], X[:, 1], s=40, c=model.labels_, cmap=plt.cm.prism)
    axis.set_title(heading)
    centers = model.cluster_centers_
    axis.scatter(centers[:, 0], centers[:, 1], marker='+', s=100, c='k', linewidth=2)

### K-means using PCA

In [None]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

In [None]:
# Fit a two-component PCA on the output of `scale(X)`. `fit` returns the
# estimator, so the bare `pca` on the last line displays the fitted object just
# as the original `pca.fit(...)` expression did.
pca = PCA(n_components=2).fit(scale(X))
pca

In [None]:
# Refit `pca` on `scale(X)` and project the data. The trailing `;` suppresses the
# returned array, so the only lasting effect of this cell is the refit.
pca.fit_transform(scale(X));

In [None]:
# NOTE(review): this refits `pca` on the raw X (no `scale` call here), replacing
# the fit obtained from `scale(X)` in the cells above. It also rebinds `df`,
# which until now held the table read from classes.txt.
pc_scores = pca.fit_transform(X)
df = pd.DataFrame(pc_scores, columns=['PC1', 'PC2'])
df.sample(5)

### Vector Loadings - Eigenvectors

In [None]:
# First row of the fitted component matrix, i.e. the first principal axis
# (one weight per input feature).
pca.components_[0]

In [None]:
# Transpose the component matrix so each row corresponds to an input feature and
# the two columns (V1, V2) hold its weights on the first and second component.
loadings_matrix = np.transpose(pca.components_)
pca_loadings = pd.DataFrame(loadings_matrix, columns=['V1', 'V2'])
pca_loadings.head(n=5)

In [None]:
plt.figure(figsize=(6,6))
# Global rc settings: keep only the left and bottom spines, draw x ticks in black.
mpl.rc('axes.spines', left=True, top=False, right=False, bottom=True)
mpl.rc('xtick', color='k')

# Elbow curve on the two principal-component scores in `df`: inertia of a freshly
# fitted model for each cluster count 1..17. The comprehension keeps the sweep
# variables local, so the notebook-level `k` and `kmeans` are not overwritten by
# the last (17-cluster) iteration.
cluster_counts = range(1, 18)
dist_cluster = [
    KMeans(n_clusters=n_clusters, init='random').fit(df).inertia_
    for n_clusters in cluster_counts
]

plt.plot(cluster_counts, dist_cluster, '-o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (sum of squared distances to closest center)');

In [None]:
# Fit two models on the principal-component scores, in the same order as before:
# first 4 clusters, then 8.
km4, km8 = [KMeans(n_clusters=n, init='random').fit(df) for n in (4, 8)]

In [None]:
import seaborn as sns
# https://seaborn.pydata.org/tutorial/color_palettes.html

# Ten hand-picked hex colours used to tell clusters apart.
colors = [
    "#67E568", "#257F27", "#08420D", "#FFF000", "#FFB62B",
    "#E56124", "#E53E30", "#7F2353", "#F911FF", "#9F8CA6",
]

# Wrap the list as a seaborn palette, preview it as a strip of swatches, and
# build a discrete matplotlib colormap from the same hex codes.
cpal = sns.color_palette(colors)
sns.palplot(cpal, size=1)
cmap_mpl = mpl.colors.ListedColormap(cpal.as_hex())  # discrete

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# One panel per model fitted on the PC scores: points are coloured by cluster
# label with the discrete colormap, and fitted centres are overlaid as black '+'.
panels = (
    (ax1, km4, 'K-Means Clustering Results with K=4'),
    (ax2, km8, 'K-Means Clustering Results with K=8'),
)
for axis, model, heading in panels:
    axis.scatter(df['PC1'], df['PC2'], s=40, c=model.labels_, cmap=cmap_mpl)
    axis.set_title(heading)
    centers = model.cluster_centers_
    axis.scatter(centers[:, 0], centers[:, 1], marker='+', s=100, c='k', linewidth=2)

### Compare clusters with and without PCA

In [None]:
# Group the class names by the cluster id assigned by `km8` (fitted on the PC
# scores). Keys appear in order of first occurrence in `km8.labels_`.
# A plain loop replaces the side-effect-only list comprehension, which built and
# discarded a list of None values just to call `append`.
clusters_pca = {}
for cluster, label in zip(km8.labels_, df_classes):
    clusters_pca.setdefault(cluster, []).append(label)
clusters_pca

In [None]:
# Grouping built earlier from the K-means model fitted on X, displayed again for
# side-by-side comparison with `clusters_pca`.
clusters

In [None]:
from matplotlib.pyplot import rcParams

# Global plot defaults for the annotated figure below: switch off all four
# spines and colour the x and y ticks white.
rcParams.update({
    'axes.spines.left': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.spines.bottom': False,
    'xtick.color': 'w',  # hide xticks
    'ytick.color': 'w',  # hide yticks
})

In [None]:
fig, ax1 = plt.subplots(figsize=(12, 12))
colors = ["#67E568","#257F27","#08420D","#FFF000","#FFB62B","#E56124","#E53E30","#7F2353","#F911FF","#9F8CA6"]

# Fixed viewport for the score plot.
ax1.set_xlim(-300, 200)
ax1.set_ylim(-150, 150)

# Write each class name at its (PC1, PC2) position, coloured by the cluster
# that km8 assigned to that row.
for i in df.index:
    position = (df.at[i, 'PC1'], df.at[i, 'PC2'])
    ax1.annotate(df_classes.iloc[i], position, ha='center',
                 color=colors[km8.labels_[i]], size=14, alpha=.9)

ax1.set_xlabel('Principal Component 1', size=14)
ax1.set_ylabel('Principal Component 2', size=14)

# Draw the loading vectors of rows 0 and 1 of `pca_loadings` (the first two
# input features) from the origin, stretched by `mp` so they are visible at
# the scale of the scores.
mp = 1000
for row in (0, 1):
    ax1.arrow(0, 0, pca_loadings.at[row, 'V1']*mp, pca_loadings.at[row, 'V2']*mp, color='blue')

## Hierarchical clustering

We use the built-in hierarchical clustering module of `scipy` to apply **Ward's method** to our data.

Unlike Lloyd's algorithm, which potentially returns a different solution each time it is run, Ward's method is deterministic: the same data always yields the same tree.

In [None]:
# Build the Ward linkage matrix directly from the observation matrix X. The
# metric is spelled out for clarity; euclidean is scipy's default.
z = sch.linkage(X, method='ward', metric='euclidean')

### Show dendrogram

### Set dendrogram defaults

In [None]:
from matplotlib.pyplot import rcParams

# plt.rcParams.find_all

# Global defaults for the dendrogram: a tall default figure, no spines, and
# white x ticks.
rcParams.update({
    'figure.figsize': [10., 12.],
    'axes.spines.left': False,   # hide axis/spines
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.spines.bottom': False,
    'xtick.color': 'w',          # hide xticks
})

In [None]:
plt.figure(figsize=(10, 12))

# Display dendrogram: draw the tree from linkage matrix `z` with the root on the
# left, labelling each leaf with the matching class name.
info = sch.dendrogram(
    z,
    labels=df_classes.values,
    orientation='left',
    leaf_font_size=12,
)
# 'ivl' is the list of leaf labels in the order the dendrogram reports them.
leaves_in_reverse = info['ivl']

### Caveats and questions regarding clustering

Here are some things to think about:

**Multiple runs of Lloyd's algorithm**  
Lloyd's algorithm potentially returns a different solution each time it is run.  
Is there any reason to run it more than once?  
For instance, is there a sensible way of combining the information from several runs,  
of interpreting the similarities and differences?

**Sensitivity to the choice of features**  
Both clustering methods are highly sensitive to the choice of features.  
How would you feel if the results changed dramatically when just one or two features were dropped? 

**Criteria for success**  
This is clearly an application in which we are hoping that clustering will discover 'natural groups' in the data.  
To what extent do the algorithms succeed at this? Are the clusters mostly reasonable?  
Can we, in general, hope that the clustering will perfectly capture what we want?  
Under what conditions would we be pleased with the clustering?  