In [None]:
import numpy as np
import pandas as pd
from scipy import ndimage
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
from matplotlib import pyplot as plt
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
%matplotlib inline

In [None]:
# Generate 50 two-dimensional sample points scattered around four centers.
# X1 receives the coordinates and Y1 the index of the center each point came from.
# make_blobs is reached through the imported `datasets` module: the bare name
# is never imported in this notebook and raised NameError on a fresh kernel.
# random_state pins the random draw so a re-run reproduces the same figures.
X1, Y1 = datasets.make_blobs(n_samples=50,
                             centers=[[4, 4], [-2, -1], [1, 1], [10, 4]],
                             cluster_std=0.9, random_state=0)

In [None]:
# Scatter the two columns of X1 against each other to eyeball the generated points.
plt.scatter(X1[:,0], X1[:,1], marker='x')

In [None]:
# Single-linkage agglomerative clustering of the generated points into 4 groups.
# Y1 is forwarded to fit() only as the optional `y` argument.
agglom = AgglomerativeClustering(linkage='single', n_clusters=4)
agglom.fit(X1, y=Y1)

In [None]:
# Draw every sample as the number of its generating center (Y1), coloured by
# the cluster that `agglom` assigned to it.
plt.figure(figsize=(6, 4))

# Min-max scale each column to the [0, 1] range.
# NOTE: this rebinds X1, so every later cell sees the scaled coordinates.
x_min = np.min(X1, axis=0)
x_max = np.max(X1, axis=0)
X1 = (X1 - x_min) / (x_max - x_min)

for i in range(len(X1)):
  # Cluster label mapped onto the nipy_spectral colour map.
  shade = plt.cm.nipy_spectral(agglom.labels_[i] / 10.0)
  plt.text(X1[i, 0], X1[i, 1], str(Y1[i]),
           color=shade,
           fontdict={'weight': 'bold', 'size': 9})

# Hide the tick marks on both axes.
plt.xticks([])
plt.yticks([])

# Overlay the points themselves as small dots.
plt.scatter(X1[:, 0], X1[:, 1], marker='.')

plt.show()

In [None]:
# Square matrix of pairwise distances between all rows of X1
# (X1 was min-max scaled in the plotting cell above).
dist_matrix = distance_matrix(X1,X1)
print(dist_matrix)

In [None]:
# Build three hierarchical-clustering trees over the sample points:
# Z = complete linkage, X = single linkage, Y = average linkage.
#
# hierarchy.linkage interprets any 2-D input as raw observation vectors, so the
# square matrix produced by distance_matrix has to be converted to condensed
# (1-D) form first. Passing the square matrix directly clusters the *rows of
# the distance matrix* rather than the points, and SciPy warns about it.
from scipy.spatial.distance import squareform

condensed_dist = squareform(dist_matrix)
Z = hierarchy.linkage(condensed_dist, 'complete')
X = hierarchy.linkage(condensed_dist, 'single')
Y = hierarchy.linkage(condensed_dist, 'average')

In [None]:
# Plot the dendrogram for Z (the 'complete' linkage built in the preceding cell).
dendro = hierarchy.dendrogram(Z)

In [None]:
# Plot the dendrogram for X (the 'single' linkage built alongside Z and Y).
dendro = hierarchy.dendrogram(X)

In [None]:
# Plot the dendrogram for Y (the 'average' linkage built alongside Z and X).
dendro = hierarchy.dendrogram(Y)

2. Cars clustering

In [None]:
# Read the UCI iris CSV into `dataset`, supplying column names explicitly
# through `names`, then show its shape and first five rows.
# NOTE(review): the following cells operate on a frame called `pdf` with car
# columns (manufact, model, horsepow, ...). No visible cell defines `pdf`, and
# `dataset` is not used again — confirm whether this cell should load the cars
# data into `pdf` instead.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-l','sepal-w','petal-l','petal-w','class']
dataset = pd.read_csv(url,names=names)
print(dataset.shape)
dataset.head(5)

In [None]:
# Clean the cars frame: coerce the numeric columns to numbers (unparseable
# entries become NaN), drop every row that has a missing value, and renumber
# the index from 0 so later positional lookups line up with the rows.
# NOTE(review): `pdf` must already exist when this cell runs; no visible cell
# defines it — confirm where it is loaded.
numeric_cols = ['sales', 'resale', 'type', 'price', 'engine_s', 'horsepow',
                'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap', 'mpg',
                'lnsales']

# The messages announce a shape, so report (rows, columns) rather than
# DataFrame.size, which is the total element count.
print('Shape of dataset before cleaning: ', pdf.shape)
pdf[numeric_cols] = pdf[numeric_cols].apply(pd.to_numeric, errors='coerce')
pdf = pdf.dropna()
pdf = pdf.reset_index(drop=True)
print ("Shape of dataset after cleaning: ", pdf.shape)
pdf.head(5)

In [None]:
# Select the eight columns of pdf that are used as clustering features.
featureset = pdf[['engine_s','horsepow','wheelbas','width',
                  'length','curb_wgt','fuel_cap','mpg']]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the feature columns; the scaler is kept in `min_max_scaler`
# and the scaled array in `feature_mtx`.
x = featureset.values
min_max_scaler = MinMaxScaler().fit(x)
feature_mtx = min_max_scaler.transform(x)

# Peek at the first five scaled rows.
feature_mtx[:5]

with scipy

In [None]:
# Square matrix D of pairwise Euclidean distances between the rows of
# feature_mtx (D[i, j] is the distance between row i and row j).
#
# The original pre-allocated with scipy.zeros — a long-deprecated alias of
# numpy.zeros that recent SciPy releases no longer provide — and filled the
# matrix with one Python-level call per pair. distance_matrix (imported at the
# top of the notebook, default p=2) produces the same matrix in a single call.
import scipy
leng = feature_mtx.shape[0]
D = distance_matrix(feature_mtx, feature_mtx)

single

In [None]:
import pylab
import scipy.cluster.hierarchy
from scipy.spatial.distance import squareform

# Single-linkage tree over the cars.
# D is a square distance matrix, but hierarchy.linkage treats 2-D input as raw
# observation vectors; squareform converts D to the condensed 1-D form that
# linkage accepts as precomputed distances.
X = hierarchy.linkage(squareform(D), 'single')

In [None]:
from scipy.cluster.hierarchy import fcluster

# Cut the tree X into at most k flat clusters and show the label of each row.
k = 5
clusters = fcluster(X, t=k, criterion='maxclust')
clusters

In [None]:
# Tall canvas so that every leaf label of the horizontal dendrogram fits.
fig = pylab.figure(figsize=(18,50))

def llf(id):
  """Build the leaf label '[manufact model type]' from row `id` of pdf."""
  maker = pdf['manufact'][id]
  model_name = pdf['model'][id]
  car_type = int(float(pdf['type'][id]))
  return '[%s %s %s]' % (maker, model_name, car_type)

# Dendrogram of tree X, drawn left-to-right with one labelled leaf per row.
dendro = hierarchy.dendrogram(X,
                              orientation='right',
                              leaf_label_func=llf,
                              leaf_rotation=0,
                              leaf_font_size=12)

average

In [None]:
import pylab
import scipy.cluster.hierarchy
from scipy.spatial.distance import squareform

# Average-linkage tree over the cars.
# D is a square distance matrix, but hierarchy.linkage treats 2-D input as raw
# observation vectors; squareform converts D to the condensed 1-D form that
# linkage accepts as precomputed distances.
Y = hierarchy.linkage(squareform(D), 'average')

In [None]:
from scipy.cluster.hierarchy import fcluster

# Cut the average-linkage tree into at most k flat clusters.
# This section builds and plots Y; the original cut X (the single-linkage
# tree) here, a copy-paste slip that repeated the previous section's result.
k = 5
clusters = fcluster(Y, k, criterion='maxclust')
clusters

In [None]:
# Tall canvas so that every leaf label of the horizontal dendrogram fits.
fig = pylab.figure(figsize=(18,50))

def llf(id):
  """Return the '[manufact model type]' label for row `id` of pdf."""
  fields = (pdf['manufact'][id], pdf['model'][id], int(float(pdf['type'][id])))
  return '[%s %s %s]' % fields

# Dendrogram of tree Y, drawn left-to-right with one labelled leaf per row.
dendro = hierarchy.dendrogram(
    Y,
    leaf_label_func=llf,
    orientation='right',
    leaf_rotation=0,
    leaf_font_size=12,
)

scikit-learn

In [None]:
# Square matrix of pairwise distances between all rows of feature_mtx.
# NOTE: this rebinds dist_matrix, which earlier held the distances for X1.
dist_matrix = distance_matrix(feature_mtx, feature_mtx)
print(dist_matrix)

single

In [None]:
# Single-linkage agglomerative clustering of the scaled features into 6 groups.
agglom = AgglomerativeClustering(linkage='single', n_clusters=6).fit(feature_mtx)

# Cluster index assigned to each row of feature_mtx.
agglom.labels_

In [None]:
# Store each row's cluster label in a new 'cluster_' column. feature_mtx was
# built from pdf's feature columns, so labels and rows are in the same order.
pdf['cluster_'] = agglom.labels_
pdf.head()

In [None]:
import matplotlib.cm as cm

# One rainbow colour per cluster label found by `agglom`.
n_clusters = max(agglom.labels_)+1
colors = cm.rainbow(np.linspace(0,1,n_clusters))
cluster_labels = list(range(0, n_clusters))

plt.figure(figsize=(16,14))

# Horsepower vs. mpg, one scatter series per cluster, marker area scaled by
# price, with every car annotated by its model name.
for color, label in zip(colors, cluster_labels):
  subset = pdf[pdf.cluster_ == label]
  for i in subset.index:
    plt.text(subset.horsepow[i], subset.mpg[i], str(subset['model'][i]), rotation=25)
  # Draw each cluster once, outside the per-car loop. Inside it the same
  # points were re-plotted for every car, stacking the alpha and adding one
  # legend entry per car. The colour is wrapped in a list so that a single
  # RGBA row is not mistaken for per-point values when a cluster has 4 cars.
  plt.scatter(subset.horsepow, subset.mpg, s= subset.price*10, c=[color],
              label='cluster'+str(label),alpha=0.5)

plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg')

In [None]:
# Number of rows in each (cluster_, type) combination.
pdf.groupby(['cluster_','type'])['cluster_'].count()

In [None]:
# Mean horsepower, engine size, mpg and price for each (cluster_, type) pair.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple was deprecated and raises an error from pandas 2.0 onwards.
agg_cars = pdf.groupby(['cluster_','type'])[['horsepow','engine_s','mpg','price']].mean()
agg_cars

In [None]:
# Plot the per-(cluster, type) averages: mean horsepower vs. mean mpg, marker
# area scaled by mean price, each point annotated with its type and price.
plt.figure(figsize=(16,10))
for color, label in zip(colors, cluster_labels):
  subset = agg_cars.loc[(label,),]
  for i in subset.index:
    # Columns are addressed by name; the original positional lookups
    # ([0], [2], [3]) on a string-labelled row are deprecated in pandas.
    plt.text(subset.loc[i, 'horsepow']+5, subset.loc[i, 'mpg'], 'type='+str(int(i)) +
             ', price='+str(int(subset.loc[i, 'price']))+'k')
  # Draw each cluster once, outside the annotation loop, so the legend gets a
  # single entry per cluster. The colour is wrapped in a list so one RGBA row
  # is not mistaken for per-point values.
  plt.scatter(subset.horsepow, subset.mpg, s=subset.price*20, c=[color],
              label='cluster'+str(label))

plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg')

average

In [None]:
# Average-linkage agglomerative clustering of the scaled features into 6 groups.
# NOTE: rebinds `agglom`, replacing the single-linkage model fitted earlier.
agglom = AgglomerativeClustering(linkage='average', n_clusters=6).fit(feature_mtx)

# Cluster index assigned to each row of feature_mtx.
agglom.labels_

In [None]:
# Overwrite the 'cluster_' column with the labels of the most recently fitted
# `agglom` model.
pdf['cluster_'] = agglom.labels_
pdf.head()

In [None]:
import matplotlib.cm as cm

# One rainbow colour per cluster label found by `agglom`.
n_clusters = max(agglom.labels_)+1
colors = cm.rainbow(np.linspace(0,1,n_clusters))
cluster_labels = list(range(0, n_clusters))

plt.figure(figsize=(16,14))

# Horsepower vs. mpg, one scatter series per cluster, marker area scaled by
# price, with every car annotated by its model name.
for color, label in zip(colors, cluster_labels):
  subset = pdf[pdf.cluster_ == label]
  for i in subset.index:
    plt.text(subset.horsepow[i], subset.mpg[i], str(subset['model'][i]), rotation=25)
  # Draw each cluster once, outside the per-car loop. Inside it the same
  # points were re-plotted for every car, stacking the alpha and adding one
  # legend entry per car. The colour is wrapped in a list so that a single
  # RGBA row is not mistaken for per-point values when a cluster has 4 cars.
  plt.scatter(subset.horsepow, subset.mpg, s= subset.price*10, c=[color],
              label='cluster'+str(label),alpha=0.5)

plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg')

In [None]:
# Number of rows in each (cluster_, type) combination.
pdf.groupby(['cluster_','type'])['cluster_'].count()

In [None]:
# Mean horsepower, engine size, mpg and price for each (cluster_, type) pair.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple was deprecated and raises an error from pandas 2.0 onwards.
agg_cars = pdf.groupby(['cluster_','type'])[['horsepow','engine_s','mpg','price']].mean()
agg_cars

In [None]:
# Plot the per-(cluster, type) averages: mean horsepower vs. mean mpg, marker
# area scaled by mean price, each point annotated with its type and price.
plt.figure(figsize=(16,10))
for color, label in zip(colors, cluster_labels):
  subset = agg_cars.loc[(label,),]
  for i in subset.index:
    # Columns are addressed by name; the original positional lookups
    # ([0], [2], [3]) on a string-labelled row are deprecated in pandas.
    plt.text(subset.loc[i, 'horsepow']+5, subset.loc[i, 'mpg'], 'type='+str(int(i)) +
             ', price='+str(int(subset.loc[i, 'price']))+'k')
  # Draw each cluster once, outside the annotation loop, so the legend gets a
  # single entry per cluster. The colour is wrapped in a list so one RGBA row
  # is not mistaken for per-point values.
  plt.scatter(subset.horsepow, subset.mpg, s=subset.price*20, c=[color],
              label='cluster'+str(label))

plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg')