## K-Means and Hierarchical Clustering of Time-Use Dataset

### Import Packages

In [1]:
from __future__ import division, print_function

# import packages for this example
import pandas as pd  # DataFrame operations 
from collections import OrderedDict  # to create DataFrame with ordered columns
# special plotting methods
from pandas.tools.plotting import scatter_matrix    
import numpy as np  # arrays and math functions
import matplotlib.pyplot as plt  # static plotting
import sys
sys.path.append('/anaconda/lib/python2.7/site-packages')
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics  # for silhouette coefficient
from scipy.cluster.hierarchy import dendrogram

In [2]:
#define dendrogram plot function

# the following code is adapted from Matt Kallada posting at
# https://github.com/scikit-learn/scikit-learn/pull/3464/files
# user-defined function to accommodate tree plotting
# from sklearn AgglomerativeClustering
def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix (one row per merge, with columns
    child_a, child_b, height, observation count) from the model's merge
    tree and hands it to scipy.cluster.hierarchy.dendrogram.

    Parameters
    ----------
    model : fitted estimator exposing ``children_`` (array of merged node
        pairs, one row per merge). If the model also exposes ``distances_``
        those values are used as merge heights; if it exposes ``n_leaves_``
        that value is used as the number of original observations.
    **kwargs : forwarded unchanged to ``dendrogram`` (e.g. ``labels``,
        ``truncate_mode``, ``no_plot``).

    Returns
    -------
    dict
        Whatever ``dendrogram`` returns (its plotting data structure).
    """
    # Children of hierarchical clustering
    children = model.children_
    n_merges = children.shape[0]
    # A full binary merge tree has one more leaf than it has merges
    n_leaves = getattr(model, 'n_leaves_', n_merges + 1)

    # Merge heights: use the real distances when the model recorded them,
    # otherwise fall back to the merge order (uniform spacing)
    distance = getattr(model, 'distances_', None)
    if distance is None:
        distance = np.arange(n_merges)

    # True number of original observations under each merged node.
    # Node ids below n_leaves are single observations; larger ids refer to
    # the cluster created by merge number (id - n_leaves).
    no_of_observations = np.zeros(n_merges)
    for merge_index, merged_pair in enumerate(children):
        members = 0
        for node_id in merged_pair:
            if node_id < n_leaves:
                members += 1
            else:
                members += no_of_observations[node_id - n_leaves]
        no_of_observations[merge_index] = members

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = \
        np.column_stack([children, distance, no_of_observations]).astype(float)
    # Plot the corresponding dendrogram
    return dendrogram(linkage_matrix, **kwargs)

### Import Data and Dataset Overview

In [3]:
# read the time-use CSV into a pandas DataFrame
# (the relative path is resolved against the notebook's working directory)
timeuse = pd.read_csv('time_use_1976_rev.csv')

In [4]:
# three-line banner: blank line, title, blank line
for banner_line in ('', '----- Summary of Input Data -----', ''):
    print(banner_line)

# confirm that the loaded object is a DataFrame
print('Object type: ', type(timeuse))

# report how many rows (observations) were loaded
print('Number of observations: ', timeuse.shape[0])

# keep the column index in `variable`; it is printed here and sliced
# further down the notebook to label clustering results
variable = timeuse.columns
print('Variable names: ', variable)

# lift the column-display limit so every column is shown below
pd.set_option('display.max_columns', None)

# descriptive statistics
print(timeuse.describe())

# first rows of the DataFrame
print(timeuse.head())


----- Summary of Input Data -----

Object type:  <class 'pandas.core.frame.DataFrame'>
Number of observations:  28
Variable names:  Index([u'group', u'gender', u'professional_work_status', u'marital_status',
       u'country', u'professional', u'transport', u'housework', u'childcare',
       u'shopping', u'personal', u'mealtime', u'sleep', u'tv', u'leisure'],
      dtype='object')
       professional   transport   housework   childcare    shopping  \
count     28.000000   28.000000   28.000000   28.000000   28.000000   
mean     448.857143   86.071429  276.964286   33.321429  108.678571   
std      226.976376   48.095529  198.606718   30.457078   32.514445   
min       10.000000    0.000000   50.000000    0.000000   52.000000   
25%      356.750000   47.500000   96.500000   10.000000   85.000000   
50%      535.000000   95.500000  256.000000   22.000000  112.000000   
75%      630.750000  127.000000  423.500000   56.000000  131.000000   
max      655.000000  148.000000  710.000000  11

In [5]:
# count the missing values in each column
timeuse.isnull().sum()

group                        0
gender                       0
professional_work_status    16
marital_status              12
country                      0
professional                 0
transport                    0
housework                    0
childcare                    0
shopping                     0
personal                     0
mealtime                     0
sleep                        0
tv                           0
leisure                      0
dtype: int64

## Distance Measures and Input Matrixes

In [6]:
# build the input matrix of time-use variables

# select only the time-use variables (column positions 5 through 14)
df1 = timeuse.iloc[:, 5:15]

# it is good practice to standardize variables prior to clustering
# work with standard scores for all cluster variables
# standard scores have zero mean and unit standard deviation
# scale() with its default axis standardizes each column, i.e. each
# time-use variable across the rows of the data
standardized_timeuse_matrix = preprocessing.scale(df1)

# transpose so that each row is one variable: the clusters formed from
# this matrix are clusters of variables
timeuse_kmeanscluster_data = standardized_timeuse_matrix.T

# display the standardized, transposed matrix
timeuse_kmeanscluster_data

array([[  7.22982085e-01,   1.17292306e-01,  -1.96897249e+00,
          7.45415040e-01,  -1.21073862e+00,   6.10817312e-01,
          1.48698443e-01,   9.11418906e-01,   2.74322990e-01,
         -1.92410658e+00,   9.24878678e-01,  -1.26009112e+00,
          8.66552996e-01,  -2.68554516e-01,   9.02445724e-01,
          4.98652538e-01,  -1.96897249e+00,   9.02445724e-01,
         -8.47324749e-01,   7.45415040e-01,  -7.11445137e-02,
          9.02445724e-01,   5.79411175e-01,  -1.90616021e+00,
          9.11418906e-01,  -6.66579228e-02,   7.99254132e-01,
         -7.11445137e-02],
       [  1.14185602e+00,   8.31815642e-02,  -1.82243245e+00,
          1.16302951e+00,  -1.20840127e+00,   6.12518791e-01,
          1.67875521e-01,   2.94916455e-01,  -3.40288217e-01,
         -1.67421803e+00,   2.31395988e-01,  -1.35661569e+00,
          4.00783900e-01,  -1.10253382e+00,   1.14185602e+00,
          4.00783900e-01,  -1.61069756e+00,   1.24772346e+00,
         -7.21411021e-01,   8.24253682e-01,

In [7]:
# build the input matrix of time-use variables
# NOTE(review): this cell repeats the preceding cell statement for
# statement; every name is recomputed from `timeuse`, so running it again
# is harmless but redundant

# select only the time-use variables (column positions 5 through 14)
df1 = timeuse.iloc[:, 5:15]

# it is good practice to standardize variables prior to clustering
# work with standard scores for all cluster variables
# standard scores have zero mean and unit standard deviation
# scale() with its default axis standardizes each column, i.e. each
# time-use variable across the rows of the data
standardized_timeuse_matrix = preprocessing.scale(df1)

# transpose so that each row is one variable: the clusters formed from
# this matrix are clusters of variables
timeuse_kmeanscluster_data = standardized_timeuse_matrix.T

# display the standardized, transposed matrix
timeuse_kmeanscluster_data

array([[  7.22982085e-01,   1.17292306e-01,  -1.96897249e+00,
          7.45415040e-01,  -1.21073862e+00,   6.10817312e-01,
          1.48698443e-01,   9.11418906e-01,   2.74322990e-01,
         -1.92410658e+00,   9.24878678e-01,  -1.26009112e+00,
          8.66552996e-01,  -2.68554516e-01,   9.02445724e-01,
          4.98652538e-01,  -1.96897249e+00,   9.02445724e-01,
         -8.47324749e-01,   7.45415040e-01,  -7.11445137e-02,
          9.02445724e-01,   5.79411175e-01,  -1.90616021e+00,
          9.11418906e-01,  -6.66579228e-02,   7.99254132e-01,
         -7.11445137e-02],
       [  1.14185602e+00,   8.31815642e-02,  -1.82243245e+00,
          1.16302951e+00,  -1.20840127e+00,   6.12518791e-01,
          1.67875521e-01,   2.94916455e-01,  -3.40288217e-01,
         -1.67421803e+00,   2.31395988e-01,  -1.35661569e+00,
          4.00783900e-01,  -1.10253382e+00,   1.14185602e+00,
          4.00783900e-01,  -1.61069756e+00,   1.24772346e+00,
         -7.21411021e-01,   8.24253682e-01,

In [8]:
# build the input matrix of demographic variables

# select only the demographic variables: the first five columns
# (group, gender, professional_work_status, marital_status, country)
df2 = timeuse.iloc[:, 0:5]

In [9]:
# preview the first two rows of the demographic columns
df2.head(2)

Unnamed: 0,group,gender,professional_work_status,marital_status,country
0,maus,men,active,,USA
1,waus,women,active,,USA


In [10]:
# one-hot encode each demographic column; the prefix becomes the leading
# part of every generated column name (for example "Gender_men")
# a row with a missing value gets zeros in all of that column's dummies
(dummy_group,
 dummy_gender,
 dummy_workstatus,
 dummy_maritalstatus,
 dummy_country) = (
    pd.get_dummies(df2[source_column], prefix=column_prefix)
    for source_column, column_prefix in [
        ("group", "Group"),
        ("gender", "Gender"),
        ("professional_work_status", "Professional_Work_Status"),
        ("marital_status", "Marital_Status"),
        ("country", "Country"),
    ]
)

In [11]:
# combine the dummy-coded columns side by side (axis=1) into one DataFrame
df3 = pd.concat([dummy_group, dummy_gender, dummy_workstatus, dummy_maritalstatus, dummy_country], axis=1)

# check the first two rows of the combined frame
df3.head(2)

Unnamed: 0,Group_maea,Group_maus,Group_mawe,Group_mayu,Group_mnsea,Group_mnsus,Group_mnswe,Group_mnsyu,Group_msea,Group_msus,Group_mswe,Group_msyu,Group_waea,Group_waus,Group_wawe,Group_wayu,Group_wnaea,Group_wnaus,Group_wnawe,Group_wnayu,Group_wnsea,Group_wnsus,Group_wnswe,Group_wnsyu,Group_wsea,Group_wsus,Group_wswe,Group_wsyu,Gender_men,Gender_women,Professional_Work_Status_active,Professional_Work_Status_not active,Marital_Status_not single,Marital_Status_single,Country_Eastern,Country_USA,Country_Western,Country_Yugoslavia
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
# transpose of matrix needed for clusters of variables:
# after the transpose each row is one dummy-coded demographic variable
demo_kmeanscluster_data = df3.T

# preview the first two rows (variables) of the transposed frame
demo_kmeanscluster_data.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
Group_maea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Group_maus,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cluster analysis of Time-Use Activities

#### K-Means Clustering

In [13]:
# start with a five-cluster k-means solution for the time-use variables
# (five is a first guess; other values are compared further below)

kmeans = KMeans(n_clusters = 5, n_init = 25, random_state = 1)
kmeans.fit(timeuse_kmeanscluster_data)
cluster = kmeans.predict(timeuse_kmeanscluster_data)  # cluster ids for variables

# create pandas DataFrame for summarizing the cluster analysis results
# the names are taken from df1, the frame the clustered matrix was built
# from, instead of slicing the shared name `variable`: that name is rebound
# to other column sets later in the notebook, which would silently
# mislabel the clusters if this cell were re-run afterwards
variable_kmeans_solution = pd.DataFrame(OrderedDict([('cluster', cluster),
    ('variable', df1.columns)]))

# print results of variable clustering one cluster at a time
for cluster_id in sorted(variable_kmeans_solution.cluster.unique()):
    print()
    print(variable_kmeans_solution.loc[variable_kmeans_solution['cluster'] == \
        cluster_id])

# The silhouette coefficient is a useful general-purpose index
# for evaluating the strength of a clustering solution. The original
# reference is
# Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the 
#     Interpretation and Validation of Cluster Analysis”. 
#     Computational and Applied Mathematics 20: 53–65. 
#     doi:10.1016/0377-0427(87)90125-7.
# larger positive values of the silhouette coefficient are preferred
# these indicate dense, well separated clusters

# evaluate the clustering solution using the silhouette coefficient
print('Silhouette coefficient for the five-cluster k-means solution: ', 
    metrics.silhouette_score(timeuse_kmeanscluster_data, cluster, 
        metric = 'euclidean'))   
        
# a low silhouette coefficient suggests that we may want to try
# kmeans with alternative values for the number of clusters 
# or perhaps this problem is not particularly well suited for cluster analysis    


   cluster  variable
4        0  shopping
5        0  personal
8        0        tv

   cluster      variable
0        1  professional
1        1     transport

   cluster  variable
6        2  mealtime
7        2     sleep

   cluster variable
9        3  leisure

   cluster   variable
2        4  housework
3        4  childcare
Silhouette coefficient for the five-cluster k-means solution:  0.377370568187


#### Search for Solution based on Silhouette Coefficient

In [14]:
print('')
print('----- Selected K-means Cluster Analysis for Segments -----')
print('')

# search over the number of clusters for the time-use variables
# (the rows of timeuse_kmeanscluster_data), with no preconceived notion
# of how many clusters there are
# each individual k-means solution is defined by the argument n_clusters

# consider selecting a solution based on the silhouette coefficient
for nclusters in range(2,10): # range(2,10) tries 2 through 9 clusters
    kmeans = KMeans(n_clusters = nclusters, n_init = 25, random_state = 1)
    kmeans.fit(timeuse_kmeanscluster_data)
    segment = kmeans.predict(timeuse_kmeanscluster_data)  # cluster ids for variables
    print('nclusters: ', nclusters, ' silhouette coefficient: ', 
        metrics.silhouette_score(timeuse_kmeanscluster_data, segment, 
            metric='euclidean'))

# results suggest that a six-cluster/segment solution is best

print('')


----- Selected K-means Cluster Analysis for Segments -----

nclusters:  2  silhouette coefficient:  0.365424558428
nclusters:  3  silhouette coefficient:  0.314388336183
nclusters:  4  silhouette coefficient:  0.333350753678
nclusters:  5  silhouette coefficient:  0.377370568187
nclusters:  6  silhouette coefficient:  0.380656762221
nclusters:  7  silhouette coefficient:  0.337416451292
nclusters:  8  silhouette coefficient:  0.244135708094
nclusters:  9  silhouette coefficient:  0.151817122951



#### K-Means Clustering of n=6

In [15]:
print('')
print('----- Solution for Six Segments -----')
print('')


# assuming that there are six clusters to identify

kmeans = KMeans(n_clusters = 6, n_init = 25, random_state = 1)
kmeans.fit(timeuse_kmeanscluster_data)
cluster = kmeans.predict(timeuse_kmeanscluster_data)  # cluster ids for variables

# create pandas DataFrame for summarizing the cluster analysis results
# the names are taken from df1, the frame the clustered matrix was built
# from, instead of slicing the shared name `variable`, which is rebound
# to other column sets elsewhere in the notebook
variable_kmeans_solution = pd.DataFrame(OrderedDict([('cluster', cluster),
    ('variable', df1.columns)]))

# print results of variable clustering one cluster at a time
for cluster_id in sorted(variable_kmeans_solution.cluster.unique()):
    print()
    print(variable_kmeans_solution.loc[variable_kmeans_solution['cluster'] == \
        cluster_id])

# evaluate the clustering solution using the silhouette coefficient
# (message text repaired: it previously contained stray spaces and a "$")
print('Silhouette coefficient for the six-cluster k-means solution: ', 
    metrics.silhouette_score(timeuse_kmeanscluster_data, cluster, 
        metric = 'euclidean'))   


----- Solution for Six Segments -----


   cluster  variable
4        0  shopping
5        0  personal

   cluster      variable
0        1  professional
1        1     transport

   cluster variable
8        2       tv

   cluster  variable
6        3  mealtime
7        3     sleep

   cluster   variable
2        4  housework
3        4  childcare

   cluster variable
9        5  leisure
Silhouette coefficient for the five-cluster k-means solution:  0.380656762221


#### Hierarchical Clustering

In [17]:
# no need to specify the number of clusters in order to perform 
# hierarchical cluster analysis... this is one of its key advantages
# reuse the standardized, transposed matrix (same object, not a copy)
timeuse_hiercluster_data = timeuse_kmeanscluster_data
# column names of the full data set, sliced below to label the variables
variable = timeuse.columns

In [18]:
# hierarchical cluster analysis lets us explore how many clusters of
# time-use variables there are without fixing that number in advance
# let's see what the picture looks like
# "the picture," in this case, is a tree diagram (dendrogram)

# compute the full tree with all observations
ward = AgglomerativeClustering(linkage='ward', 
    n_clusters = timeuse_hiercluster_data.shape[0],
    compute_full_tree = True)
ward_full_tree = ward.fit(timeuse_hiercluster_data)

# use variable names for the leaf labels
# leaf i of the tree is row i of the input matrix, i.e. column i of df1,
# so the names are listed in column order; indexing the names through
# labels_ (cluster ids) would attach names to the wrong leaves
# a concrete list is built (rather than a lazy map object) because the
# labels are consumed as a sequence when the dendrogram is drawn
cluster_labels = list(df1.columns)

In [19]:
# plot the full hierarchical tree with variable labels
# plt.show() renders the figure drawn by plot_dendrogram
plot_dendrogram(ward_full_tree, labels = cluster_labels)
plt.show()

In [20]:
# subjective review of the dendrogram suggests that about four clusters 
# would do well as a solution for clustering the variables
# so let's specify four and see how it does
ward_four = AgglomerativeClustering(linkage='ward', 
    n_clusters = 4,
    compute_full_tree = False)
# fit once; fit() returns the estimator itself
ward_four_tree = ward_four.fit(timeuse_hiercluster_data)
# get cluster ids for variables from the fitted model
# (the same values fit_predict would return, without fitting a second time)
ward_four_cluster = ward_four_tree.labels_

# create pandas DataFrame for summarizing the cluster analysis results
# names come from df1 (the frame the clustered matrix was built from)
# rather than from the shared, rebindable name `variable`
variable_ward_solution = \
    pd.DataFrame(OrderedDict([('cluster', ward_four_cluster),
    ('variable', df1.columns)]))

# print results of variable clustering one cluster at a time
for cluster_id in sorted(set(variable_ward_solution.cluster)):
    print()
    print(variable_ward_solution.loc[variable_ward_solution['cluster'] == \
        cluster_id])

# The silhouette coefficient is a useful general-purpose index
# for evaluating the strength of a clustering solution. The original
# reference is
# Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the 
#     Interpretation and Validation of Cluster Analysis”. 
#     Computational and Applied Mathematics 20: 53–65. 
#     doi:10.1016/0377-0427(87)90125-7.
# larger positive values of the silhouette coefficient are preferred
# these indicate dense, well separated clusters
   




   cluster  variable
6        0  mealtime
7        0     sleep
9        0   leisure

   cluster   variable
2        1  housework
3        1  childcare

   cluster  variable
4        2  shopping
5        2  personal
8        2        tv

   cluster      variable
0        3  professional
1        3     transport


In [21]:
# evaluate the four-cluster Ward solution using the silhouette coefficient
# (euclidean distances between the rows of the clustered matrix)
print('Silhouette coefficient for the hierarchical clustering solution: ', 
    metrics.silhouette_score(timeuse_hiercluster_data, ward_four_cluster, 
        metric = 'euclidean'))   
        
# a low silhouette coefficient suggests that we may want to try
# alternative values for the number of clusters 
# or perhaps this problem is not particularly well suited for cluster analysis                                 
                                                                                       
# clustering of the rows (segmentation) is left as an exercise

# for clustering the rows we would work in much the way we would in a market
# research study looking for market segments... here segments/clusters of
# demographic groups
# no need to specify the number of clusters in order to perform 
# hierarchical cluster analysis on the groups in the study
# we have no preconceived notions about the number of segments/clusters
# hierarchical cluster analysis may well point us to groups that are
# similar to one another in their time-use profiles
# it may also find outliers, those groups that do not fit cleanly into
# any segment... identification of outliers
# is one of the key benefits of hierarchical cluster analysis

Silhouette coefficient for the hierarchical clustering solution:  0.333350753678


### Clustering of Demographic Groups

#### K-Means Clustering

In [23]:
# start with a three-cluster k-means solution for the dummy-coded
# demographic variables (the rows of demo_kmeanscluster_data)
# other numbers of clusters are compared further below
kmeans = KMeans(n_clusters = 3, n_init = 25, random_state = 1)
kmeans.fit(demo_kmeanscluster_data)
cluster = kmeans.predict(demo_kmeanscluster_data)  # cluster ids for variables 

In [24]:
# rebind `variable` to the dummy-coded demographic column names;
# the following cells use it to label the clustering results
variable = df3.columns

In [25]:
# create pandas DataFrame for summarizing the cluster analysis results
# names are read from df3 (the frame the clustered matrix was transposed
# from) so the labels always match the rows that were clustered, whatever
# the shared name `variable` currently points to
variable_kmeans_solution = pd.DataFrame(OrderedDict([('cluster', cluster),
    ('variable', df3.columns)]))

# print results of variable clustering one cluster at a time
for cluster_id in sorted(variable_kmeans_solution.cluster.unique()):
    print()
    print(variable_kmeans_solution.loc[variable_kmeans_solution['cluster'] == \
        cluster_id])

# The silhouette coefficient is a useful general-purpose index
# for evaluating the strength of a clustering solution. The original
# reference is
# Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the 
#     Interpretation and Validation of Cluster Analysis”. 
#     Computational and Applied Mathematics 20: 53–65. 
#     doi:10.1016/0377-0427(87)90125-7.
# larger positive values of the silhouette coefficient are preferred
# these indicate dense, well separated clusters

# evaluate the clustering solution using the silhouette coefficient
# (the message now names the three-cluster model fitted in the cell above;
# it previously said "five-cluster")
print('Silhouette coefficient for the three-cluster k-means solution: ', 
    metrics.silhouette_score(demo_kmeanscluster_data, cluster, 
        metric = 'euclidean'))   
        
# a low silhouette coefficient suggests that we may want to try
# kmeans with alternative values for the number of clusters 
# or perhaps this problem is not particularly well suited for cluster analysis 


    cluster                             variable
0         0                           Group_maea
1         0                           Group_maus
2         0                           Group_mawe
3         0                           Group_mayu
4         0                          Group_mnsea
5         0                          Group_mnsus
6         0                          Group_mnswe
7         0                          Group_mnsyu
8         0                           Group_msea
9         0                           Group_msus
10        0                           Group_mswe
11        0                           Group_msyu
12        0                           Group_waea
13        0                           Group_waus
14        0                           Group_wawe
15        0                           Group_wayu
16        0                          Group_wnaea
17        0                          Group_wnaus
18        0                          Group_wnawe
19        0        

#### Search for Solution based on Silhouette Coefficient

In [26]:
print('')
print('----- Selected K-means Cluster Analysis for Segments -----')
print('')

# search over the number of clusters for the dummy-coded demographic
# variables (the rows of demo_kmeanscluster_data), with no preconceived
# notion of how many clusters there are
# each individual k-means solution is defined by the argument n_clusters

# the silhouette coefficient needs at least 2 clusters and fewer clusters
# than rows, so the search runs from 2 up to (number of rows - 1); the
# upper bound is read from the data instead of being hard-coded
n_demo_variables = demo_kmeanscluster_data.shape[0]

# consider selecting a solution based on the silhouette coefficient
for nclusters in range(2, n_demo_variables):
    kmeans = KMeans(n_clusters = nclusters, n_init = 25, random_state = 1)
    kmeans.fit(demo_kmeanscluster_data)
    segment = kmeans.predict(demo_kmeanscluster_data)  # cluster ids for variables
    print('nclusters: ', nclusters, ' silhouette coefficient: ', 
        metrics.silhouette_score(demo_kmeanscluster_data, segment, 
            metric='euclidean'))

# results suggest that a two-cluster/segment solution is best

print('')


----- Selected K-means Cluster Analysis for Segments -----

nclusters:  2  silhouette coefficient:  0.47248697792
nclusters:  3  silhouette coefficient:  0.382626706071
nclusters:  4  silhouette coefficient:  0.341026502425
nclusters:  5  silhouette coefficient:  0.322643439605
nclusters:  6  silhouette coefficient:  0.321445949465
nclusters:  7  silhouette coefficient:  0.314610806409
nclusters:  8  silhouette coefficient:  0.311990298092
nclusters:  9  silhouette coefficient:  0.310333520006
nclusters:  10  silhouette coefficient:  0.310702934371
nclusters:  11  silhouette coefficient:  0.251449497565
nclusters:  12  silhouette coefficient:  -1.11559201012e-16
nclusters:  13  silhouette coefficient:  -1.07427378752e-16
nclusters:  14  silhouette coefficient:  -2.06591112985e-16
nclusters:  15  silhouette coefficient:  -1.98327468465e-16
nclusters:  16  silhouette coefficient:  -1.90063823946e-16
nclusters:  17  silhouette coefficient:  -1.81800179427e-16
nclusters:  18  silhouette c

In [27]:
# fit the two-cluster k-means solution for the dummy-coded demographic
# variables, the number of clusters favoured by the silhouette search above
kmeans = KMeans(n_clusters = 2, n_init = 25, random_state = 1)
kmeans.fit(demo_kmeanscluster_data)
cluster = kmeans.predict(demo_kmeanscluster_data)  # cluster ids for variables 

# create pandas DataFrame for summarizing the cluster analysis results
# names are read from df3 (the frame the clustered matrix was transposed
# from) so the labels always match the rows that were clustered, whatever
# the shared name `variable` currently points to
variable_kmeans_solution = pd.DataFrame(OrderedDict([('cluster', cluster),
    ('variable', df3.columns)]))

# print results of variable clustering one cluster at a time
for cluster_id in sorted(variable_kmeans_solution.cluster.unique()):
    print()
    print(variable_kmeans_solution.loc[variable_kmeans_solution['cluster'] == \
        cluster_id])

# The silhouette coefficient is a useful general-purpose index
# for evaluating the strength of a clustering solution. The original
# reference is
# Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the 
#     Interpretation and Validation of Cluster Analysis”. 
#     Computational and Applied Mathematics 20: 53–65. 
#     doi:10.1016/0377-0427(87)90125-7.
# larger positive values of the silhouette coefficient are preferred
# these indicate dense, well separated clusters

# evaluate the clustering solution using the silhouette coefficient
# (the message now names the two-cluster model fitted above; it
# previously said "five-cluster")
print('Silhouette coefficient for the two-cluster k-means solution: ', 
    metrics.silhouette_score(demo_kmeanscluster_data, cluster, 
        metric = 'euclidean'))   
        
# a low silhouette coefficient suggests that we may want to try
# kmeans with alternative values for the number of clusters 
# or perhaps this problem is not particularly well suited for cluster analysis 


    cluster                             variable
0         0                           Group_maea
1         0                           Group_maus
2         0                           Group_mawe
3         0                           Group_mayu
4         0                          Group_mnsea
5         0                          Group_mnsus
6         0                          Group_mnswe
7         0                          Group_mnsyu
8         0                           Group_msea
9         0                           Group_msus
10        0                           Group_mswe
11        0                           Group_msyu
12        0                           Group_waea
13        0                           Group_waus
14        0                           Group_wawe
15        0                           Group_wayu
16        0                          Group_wnaea
17        0                          Group_wnaus
18        0                          Group_wnawe
19        0        

#### Hierarchical Clustering

In [28]:
# reuse the transposed dummy-variable frame for hierarchical clustering
# (same object under a second name, not a copy)
demo_hiercluster_data = demo_kmeanscluster_data

In [35]:
# hierarchical cluster analysis lets us explore how many clusters of
# demographic variables there are without fixing that number in advance
# let's see what the picture looks like
# "the picture," in this case, is a tree diagram (dendrogram)

# compute the full tree with all observations
ward = AgglomerativeClustering(linkage='ward', 
    n_clusters = demo_hiercluster_data.shape[0],
    compute_full_tree = True)
ward_full_tree = ward.fit(demo_hiercluster_data)

# use variable names for the leaf labels
# leaf i of the tree is row i of the input matrix, i.e. column i of df3,
# so the names are listed in column order; indexing the names through
# labels_ (cluster ids) would attach names to the wrong leaves
# a concrete list is built (rather than a lazy map object) because the
# labels are consumed as a sequence when the dendrogram is drawn
cluster_labels = list(df3.columns)

In [36]:
# plot the full hierarchical tree with variable labels
# plt.show() renders the figure drawn by plot_dendrogram
plot_dendrogram(ward_full_tree, labels = cluster_labels)
plt.show()

In [37]:
# based on a subjective review of the dendrogram, try a nine-cluster
# solution for clustering the variables and see how it does
ward_nine = AgglomerativeClustering(linkage='ward', 
    n_clusters = 9,
    compute_full_tree = False)
# fit once; fit() returns the estimator itself
ward_nine_tree = ward_nine.fit(demo_hiercluster_data)
# get cluster ids for variables from the fitted model
# (the same values fit_predict would return, without fitting a second time)
ward_nine_cluster = ward_nine_tree.labels_

# create pandas DataFrame for summarizing the cluster analysis results
# names are read from df3 (the frame the clustered matrix was transposed
# from) rather than from the shared, rebindable name `variable`
variable_ward_solution = \
    pd.DataFrame(OrderedDict([('cluster', ward_nine_cluster),
    ('variable', df3.columns)]))

# print results of variable clustering one cluster at a time
for cluster_id in sorted(set(variable_ward_solution.cluster)):
    print()
    print(variable_ward_solution.loc[variable_ward_solution['cluster'] == \
        cluster_id])

# The silhouette coefficient is a useful general-purpose index
# for evaluating the strength of a clustering solution. The original
# reference is
# Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the 
#     Interpretation and Validation of Cluster Analysis”. 
#     Computational and Applied Mathematics 20: 53–65. 
#     doi:10.1016/0377-0427(87)90125-7.
# larger positive values of the silhouette coefficient are preferred
# these indicate dense, well separated clusters
   




    cluster                         variable
30        0  Professional_Work_Status_active
34        0                  Country_Eastern

    cluster                             variable
0         1                           Group_maea
1         1                           Group_maus
2         1                           Group_mawe
3         1                           Group_mayu
4         1                          Group_mnsea
5         1                          Group_mnsus
6         1                          Group_mnswe
7         1                          Group_mnsyu
8         1                           Group_msea
9         1                           Group_msus
10        1                           Group_mswe
11        1                           Group_msyu
12        1                           Group_waea
13        1                           Group_waus
14        1                           Group_wawe
15        1                           Group_wayu
16        1                   

In [38]:
# evaluate the nine-cluster Ward solution using the silhouette coefficient
# (euclidean distances between the rows of the clustered matrix)
print('Silhouette coefficient for the hierarchical clustering solution: ', 
    metrics.silhouette_score(demo_hiercluster_data, ward_nine_cluster, 
        metric = 'euclidean'))   
        
# a low silhouette coefficient suggests that we may want to try
# alternative values for the number of clusters 
# or perhaps this problem is not particularly well suited for cluster analysis                                 
                                                                                       
# clustering of the rows (segmentation) is left as an exercise

# for clustering the rows we would work in much the way we would in a market
# research study looking for market segments... here segments/clusters of
# demographic groups
# no need to specify the number of clusters in order to perform 
# hierarchical cluster analysis on the groups in the study
# we have no preconceived notions about the number of segments/clusters
# hierarchical cluster analysis may well point us to groups that are
# similar to one another in their time-use profiles
# it may also find outliers, those groups that do not fit cleanly into
# any segment... identification of outliers
# is one of the key benefits of hierarchical cluster analysis

Silhouette coefficient for the hierarchical clustering solution:  0.310333520006
