# 3.2 Hierarchical Clustering: Guided Practice

In [None]:
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import pdist
import numpy as np
import pandas as pd
import os
from scipy.cluster.hierarchy import fcluster

%matplotlib inline

## Part 1: Format the data

#### 1.1 Import the iris data

In [None]:
# Read the raw iris measurements from the course assets (relative path).
iris = pd.read_csv(filepath_or_buffer='../../assets/datasets/iris.csv')

In [None]:
# Preview the first five rows to check that the file loaded as expected.
iris.head(n=5)

#### 1.2 Convert the data to a pandas dataframe and format the data

In [None]:
# Work on an independent copy holding only the columns used below.
# Selecting with [[...]] raises KeyError when the CSV header does not match,
# whereas pd.DataFrame(data=iris, columns=...) would silently create all-NaN
# columns. The explicit .copy() guarantees that the later edits to df (adding
# name_num, dropping Name) never touch the raw iris frame.
df = iris[['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name']].copy()

Next, since the species name is categorical data, let's go ahead and convert it to numeric codes

In [None]:
def name_to_numeric(x):
    """Translate an iris species name into its integer code.

    Parameters
    ----------
    x : object
        Species label; it is compared by equality against the known names.

    Returns
    -------
    int or None
        1 for 'Iris-setosa', 2 for 'Iris-virginica', 3 for 'Iris-versicolor',
        and None when ``x`` matches none of them.
    """
    species_codes = (
        ('Iris-setosa', 1),
        ('Iris-virginica', 2),
        ('Iris-versicolor', 3),
    )
    for species, code in species_codes:
        if x == species:
            return code
    # Unknown labels fall through, mirroring an implicit ``return None``.
    return None

In [None]:
# Add an integer-coded version of the species label as a new column.
df['name_num'] = df['Name'].map(name_to_numeric)

In [None]:
# Remove the text label so that only numeric columns remain.
# drop(..., errors='ignore') returns a new frame and does nothing when 'Name'
# is already gone, so re-running this cell no longer raises KeyError the way
# `del df['Name']` did.
df = df.drop(columns=['Name'], errors='ignore')

#### 1.3 Plot the data

Let's take a look at some of the clusters to get a feel for our data: 

In [None]:
# Quick look at sepal length against sepal width.
df.plot.scatter(x='SepalLength', y='SepalWidth')

Lastly, let's create a matrix to pass to the clustering algorithm 

In [None]:
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# .values yields the same NumPy array containing every column of df.
# NOTE(review): this array still includes the name_num label column as a
# feature — confirm that clustering on the label is intended.
X = df.values

## Part 2: Hierarchical Clustering

Now, let's perform the actual clustering on our set

In [None]:
# Build the agglomerative merge tree using the Ward linkage method.
Z = linkage(X, method='ward')

#### 2.1 Cophenetic Coefficient

Now, calculate the cophenetic correlation coefficient:

In [None]:
# Compare the tree's cophenetic distances with the original pairwise
# (Euclidean) distances: c is the correlation coefficient, coph_dists the
# condensed cophenetic distance matrix.
c, coph_dists = cophenet(Z, Y=pdist(X, metric='euclidean'))
c

## Part 3: Calculate the dendrogram

#### 3.1 Plot the dendrogram

In [None]:
# Full dendrogram with no truncation, drawn on a wide canvas so the rotated
# leaf labels stay legible.
plt.figure(figsize=(30, 10))
dendrogram(Z, leaf_rotation=90.0, leaf_font_size=8.0)
# Annotate after drawing; dendrogram() itself does not set a title or axis labels.
plt.title('Dendrogram')
plt.xlabel('Index Numbers')
plt.ylabel('Distance')
plt.show()

**Question:** What can we tell about the clusters by visually inspecting them?

We can see that there are two primary clusters, the green cluster and the red cluster. The green cluster contains only index values < 50, while the red cluster contains index values > 50.

Since we can't derive much information from this larger dendrogram, let's plot a truncated version of the dendrogram

In [None]:
# Condensed view: with truncate_mode='lastp' only the last p=15 merges are
# drawn and everything beneath them is contracted into leaf nodes.
dendrogram(
    Z,
    p=15,
    truncate_mode='lastp',
    show_contracted=True,     # mark the heights of the contracted merges
    show_leaf_counts=False,   # omit the "(n)" member counts on the leaf labels
    leaf_font_size=12.0,
    leaf_rotation=90.0,
)
# Annotate after drawing; dendrogram() itself does not set a title or axis labels.
plt.title('Truncated Dendrogram')
plt.xlabel('Index Numbers')
plt.ylabel('Distance')
plt.show()

#### 3.2 Calculate the cluster labels

In [None]:
# Cut the tree at a cophenetic distance threshold to obtain one flat cluster
# label per observation.
max_d = 15
clusters = fcluster(Z, t=max_d, criterion='distance')
clusters

#### 3.3 Plot the resulting clusters

In [None]:
# Columns 0 and 1 of X are SepalLength and SepalWidth (the first two columns
# of df), so label the axes accordingly; each point is coloured by its flat
# cluster id so the figure can be read on its own.
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='prism')
plt.title('Hierarchical clusters')
plt.xlabel('SepalLength')
plt.ylabel('SepalWidth')
plt.show()