In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import AgglomerativeClustering
import scipy 
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import cophenet
import sklearn.metrics as sm

In [2]:
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,household_id,SA2,HHSIZE,TOTALVEHS,INC,age_profile,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Flat or Apartment,DWELLTYPE_Other,DWELLTYPE_Separate House,DWELLTYPE_Terrace/Townhouse,OWNDWELL_Being Purchased,OWNDWELL_Being Rented,OWNDWELL_Fully Owned,OWNDWELL_Occupied Rent-Free,OWNDWELL_Something Else
0,1,206011105,2,2,2350,1,0.125,0.4,0.724725,0,0,1,0,0,0,1,0,0
1,2,206011105,2,1,800,0,0.125,0.2,0.21972,0,0,1,0,1,0,0,0,0
2,3,206011105,2,1,800,0,0.125,0.2,0.21972,0,0,1,0,1,0,0,0,0
3,4,206011105,2,2,4000,1,0.125,0.4,0.945946,0,1,0,0,1,0,0,0,0
4,5,206011105,2,2,4000,1,0.125,0.4,0.945946,0,1,0,0,1,0,0,0,0


In [3]:
sample_size = 10000
clusters = 10
my_n_init = 10
df_sample = df.sample(sample_size, random_state=42)

## Normalised Data 

In [4]:
df_norm = pd.DataFrame()
df_norm = df_sample[['HHSIZE_normalised', 'TOTALVEHS_normalised', 'INC_normalised','DWELLTYPE_Separate House', 'age_profile']].copy()
df_norm.loc[:, 'owndwell'] = df_sample['OWNDWELL_Being Purchased'] | df_sample['OWNDWELL_Fully Owned']
df_norm.head()

Unnamed: 0,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Separate House,age_profile,owndwell
112679,0.125,0.0,0.658659,0,0,0
864439,0.0,0.2,0.070571,1,0,1
1180461,0.25,0.2,0.381882,1,1,1
1307167,0.25,0.8,0.21972,1,1,1
1029345,0.375,0.2,0.965966,1,1,1


# Boolean Data 

In [5]:
high_inc = df_sample['INC'].quantile(0.9)
df_bool = pd.DataFrame()
df_bool["veh2"] = df_sample['TOTALVEHS'].apply(lambda x: 1 if x >= 2 else 0)
df_bool["highinc"] = df_sample['INC'].apply(lambda x: 1 if x >= high_inc else 0)
df_bool["house"] = df_sample['DWELLTYPE_Separate House']
df_bool["ownner"] = (df_sample['OWNDWELL_Being Purchased'] | df['OWNDWELL_Fully Owned']).astype(int)
df_bool["age"] = df_sample['age_profile']
df_bool["hhsize"] = df_sample['HHSIZE'].apply(lambda x: 1 if x >= 2 else 0)
df_bool.head()

Unnamed: 0,veh2,highinc,house,ownner,age,hhsize
112679,0,0,0,0,0,1
864439,0,0,1,1,0,0
1180461,0,0,1,1,1,1
1307167,1,0,1,1,1,1
1029345,0,1,1,1,1,1


# Kmeans

In [6]:
def optimise_k_means(data, max_k, title):
    means = []
    inertias = []

    for k in range(1, max_k):
        kmeans = KMeans(n_clusters=k, n_init=my_n_init)
        kmeans.fit(data)

        means.append(k)
        inertias.append(kmeans.inertia_)

    # generate the elbow plot
    fig = plt.subplots(figsize=(10,5))
    plt.plot(means, inertias, 'o-')
    plt.title(title)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.grid(True)
    plt.show()

In [7]:
#optimise_k_means(df_bool, test_n_clusters, "df_bool")

In [8]:
kmeans = KMeans(n_clusters=clusters, n_init=10).fit(df_bool)
score_kmeans_bool = calinski_harabasz_score(df_bool, kmeans.labels_)
silhouette_kmeans_bool = silhouette_score(df_bool, kmeans.labels_)

print(score_kmeans_bool)
print("The average silhouette score is :", silhouette_kmeans_bool)
 

df_kmeans_bool = df_bool.copy()
df_kmeans_bool['label'] = kmeans.labels_
group_kmeans_bool = df_kmeans_bool.groupby('label')
group_kmeans_bool.mean()

4837.911780975896
The average silhouette score is : 0.6508559520408738


Unnamed: 0_level_0,veh2,highinc,house,ownner,age,hhsize
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.567568,0.086873,1.0,0.0,0.0,0.972973
1,1.0,0.0,0.90118,1.0,1.0,1.0
2,0.078989,0.0,1.0,0.882306,0.0,0.110585
3,0.0,0.038534,0.758459,1.0,1.0,1.0
4,1.0,0.0,0.884498,1.0,0.0,1.0
5,0.023622,0.0,0.0,0.0,0.0,0.0
6,0.188352,0.065675,0.0,0.0,0.374226,1.0
7,0.925405,1.0,0.945946,0.951351,0.79027,1.0
8,0.018927,0.004732,0.0,1.0,0.0,0.107256
9,0.677863,0.071756,0.728244,0.0,1.0,1.0


In [9]:
df_sample_clust = df_sample.copy()
df_sample_clust['label'] = kmeans.labels_
group_sample_clust = df_sample_clust.groupby('label')
group_sample_clust.mean()

Unnamed: 0_level_0,household_id,SA2,HHSIZE,TOTALVEHS,INC,age_profile,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Flat or Apartment,DWELLTYPE_Other,DWELLTYPE_Separate House,DWELLTYPE_Terrace/Townhouse,OWNDWELL_Being Purchased,OWNDWELL_Being Rented,OWNDWELL_Fully Owned,OWNDWELL_Occupied Rent-Free,OWNDWELL_Something Else
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1127986.0,209913800.0,3.833977,1.608108,1814.166023,0.0,0.354247,0.321622,0.511412,0.0,0.0,1.0,0.0,0.0,0.938224,0.0,0.059846,0.001931
1,1055381.0,210069100.0,3.160202,2.386847,1890.277572,1.0,0.270025,0.477369,0.5531,0.019224,0.020573,0.90118,0.059022,0.455649,0.0,0.544351,0.0,0.0
2,1041397.0,209966700.0,1.240126,0.943918,848.556872,0.0,0.030016,0.188784,0.236306,0.0,0.0,1.0,0.0,0.276461,0.107425,0.605845,0.008689,0.00158
3,942564.9,209463000.0,2.416353,0.915414,1535.637218,1.0,0.177044,0.183083,0.436841,0.06485,0.068609,0.758459,0.108083,0.25282,0.0,0.74718,0.0,0.0
4,1186313.0,210107800.0,4.422492,2.393617,1873.673252,0.0,0.427812,0.478723,0.551925,0.016717,0.007599,0.884498,0.091185,0.641337,0.0,0.358663,0.0,0.0
5,626902.7,208221100.0,1.0,0.645669,825.688976,0.0,0.0,0.129134,0.230663,0.704724,0.15748,0.0,0.137795,0.0,0.937008,0.0,0.031496,0.031496
6,628163.9,208028100.0,2.634449,0.983891,1687.291202,0.374226,0.204306,0.196778,0.479918,0.535316,0.146221,0.0,0.318463,0.0,0.976456,0.0,0.021066,0.002478
7,939408.9,209798400.0,4.32,2.731892,4225.163243,0.79027,0.415,0.546378,0.94549,0.010811,0.002162,0.945946,0.041081,0.507027,0.035676,0.444324,0.012973,0.0
8,779775.9,208888500.0,1.137224,0.845426,980.678233,0.0,0.017153,0.169085,0.277588,0.380126,0.271293,0.0,0.34858,0.343849,0.0,0.656151,0.0,0.0
9,980528.9,209557200.0,3.158779,1.830534,2022.151145,1.0,0.269847,0.366107,0.564712,0.103817,0.058015,0.728244,0.109924,0.0,0.91145,0.0,0.054962,0.033588


In [10]:
#optimise_k_means(df_norm, test_n_clusters, "df_norm")

In [11]:
kmeans = KMeans(n_clusters=clusters, n_init=my_n_init).fit(df_norm)
score_kmeans_norm = calinski_harabasz_score(df_norm, kmeans.labels_)
silhouette_kmeans_norm = silhouette_score(df_norm, kmeans.labels_)

print(score_kmeans_norm) 
print("The average silhouette score is :", silhouette_kmeans_norm)

df_kmeans_norm = df_norm.copy()
df_kmeans_norm['label'] = kmeans.labels_
group_kmeans_norm = df_kmeans_norm.groupby('label')
group_kmeans_norm.mean()

8410.667943706467
The average silhouette score is : 0.507323481878191


Unnamed: 0_level_0,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Separate House,age_profile,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.275112,0.286057,0.443453,1.0,0.0,0.0
1,0.197164,0.348307,0.323819,1.0,1.0,1.0
2,0.551581,0.50512,0.72805,1.0,0.0,1.0
3,0.093657,0.176111,0.348005,0.0,0.0,0.0
4,0.046085,0.218271,0.238766,1.0,0.0,1.0
5,0.204146,0.334003,0.560917,0.0,1.0,1.0
6,0.066445,0.196914,0.304556,0.0,0.0,1.0
7,0.288075,0.358621,0.576116,1.0,1.0,0.0
8,0.231771,0.257083,0.550529,0.0,1.0,0.0
9,0.318414,0.497783,0.78501,1.0,1.0,1.0


In [12]:
group_kmeans_norm.size()/8000*100

label
0     8.3375
1    22.1500
2     8.3000
3    12.6625
4    15.3250
5     7.4625
6     8.9125
7     6.5250
8     6.0000
9    29.3250
dtype: float64

# GMM

In [13]:
def optimise_gmm(data, max_k):
    bics = []

    for k in range(1, max_k):
        gmm = GaussianMixture(n_components=k)
        gmm.fit(data)
        bics.append(gmm.bic(data))

    # generate the elbow plot
    fig = plt.subplots(figsize=(10,5))
    plt.plot(range(1, max_k), bics, 'o-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('BIC')
    plt.grid(True)
    plt.show()

In [14]:
#optimise_gmm(df_bool, test_n_clusters)

In [15]:
gmm = GaussianMixture(n_components=clusters).fit(df_bool)
score_gmm_bool= calinski_harabasz_score(df_bool, gmm.predict(df_bool))
silhouette_gmm_bool = silhouette_score(df_bool, gmm.predict(df_bool))

print(score_gmm_bool) 
print("The average silhouette score is :", silhouette_gmm_bool)

df_gmm_bool = df_bool.copy()
df_gmm_bool['label'] = gmm.predict(df_bool)
group_gmm_bool = df_gmm_bool.groupby('label')
group_gmm_bool.mean()

4380.932258267166
The average silhouette score is : 0.6199016097083949


Unnamed: 0_level_0,veh2,highinc,house,ownner,age,hhsize
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.0,0.0,0.884498,1.0,0.0,1.0
1,0.019017,0.0,0.0,1.0,0.0,0.103011
2,1.0,0.0,0.909316,0.917673,1.0,1.0
3,0.0,0.031068,0.390291,0.0,0.439806,1.0
4,0.0,0.139613,0.761144,0.949537,1.0,1.0
5,0.078989,0.0,1.0,0.882306,0.0,0.110585
6,0.023622,0.0,0.0,0.0,0.0,0.0
7,1.0,0.075758,0.0,0.0,0.539394,1.0
8,0.991889,1.0,0.938586,0.947856,0.771727,1.0
9,1.0,0.095238,1.0,0.0,0.0,0.952381


In [16]:
group_gmm_bool.size()/8000*100

label
0     8.2250
1     7.8875
2    40.3875
3    12.8750
4    14.8625
5    15.8250
6     6.3500
7     4.1250
8    10.7875
9     3.6750
dtype: float64

In [17]:
#optimise_gmm(df_norm, test_n_clusters)

In [18]:
gmm = GaussianMixture(n_components=clusters).fit(df_norm)
score_gmm_norm= calinski_harabasz_score(df_norm, gmm.predict(df_norm))
silhouette_gmm_norm = silhouette_score(df_norm, gmm.predict(df_norm))

print(score_gmm_norm)

df_gmm_norm = df_norm.copy()
df_gmm_norm['label'] = gmm.predict(df_norm)
group_gmm_norm = df_gmm_norm.groupby('label')
group_gmm_norm.mean()

7967.509505068448


Unnamed: 0_level_0,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Separate House,age_profile,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0,0.192828,0.218452,1.0,0.0,1.0
1,0.204038,0.368578,0.325662,1.0,1.0,1.0
2,0.093657,0.176111,0.348005,0.0,0.0,0.0
3,0.204146,0.334003,0.560917,0.0,1.0,1.0
4,0.066445,0.196914,0.304556,0.0,0.0,1.0
5,0.275112,0.286057,0.443453,1.0,0.0,0.0
6,0.288075,0.358621,0.576116,1.0,1.0,0.0
7,0.462527,0.453829,0.615912,1.0,0.0,1.0
8,0.315213,0.484549,0.791966,1.0,1.0,1.0
9,0.231771,0.257083,0.550529,0.0,1.0,0.0


# Hierarchical 

In [19]:
hclust = AgglomerativeClustering(n_clusters=clusters, linkage='ward', metric='euclidean')

In [20]:
#Z=linkage(df_bool, 'ward')
#plt.figure(figsize=(10, 5))
#dendrogram(Z, no_labels=True)
#plt.title('Hierarchical Clustering Dendrogram')
#plt.xlabel('Data points')
#plt.ylabel('Distance')
#plt.show()

In [21]:
hclust.fit(df_bool)
score_hclust_bool = calinski_harabasz_score(df_bool, hclust.labels_)
silhouette_hclust_bool = silhouette_score(df_bool, hclust.labels_)

print(score_hclust_bool)

df_hclust_bool = df_bool.copy()
df_hclust_bool['label'] = hclust.labels_
group_hclust_bool = df_hclust_bool.groupby('label')
group_hclust_bool.mean()

4545.837257592555


Unnamed: 0_level_0,veh2,highinc,house,ownner,age,hhsize
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.496644,0.0,1.0,0.134228,0.425695,1.0
1,0.815978,1.0,0.855476,0.829443,0.759425,1.0
2,0.332244,0.0,0.0,0.0,0.478214,1.0
3,0.567692,0.0,0.0,1.0,0.783077,1.0
4,0.0,0.0,0.231008,0.0,0.0,0.0
5,1.0,0.0,1.0,1.0,1.0,1.0
6,0.0,0.0,1.0,1.0,0.0,0.0
7,0.0,0.0,1.0,1.0,1.0,1.0
8,1.0,0.0,0.966667,0.963889,0.0,0.808333
9,0.0,0.0,0.0,1.0,0.0,0.0


In [22]:
#Z=linkage(df_norm, 'ward')
#plt.figure(figsize=(10, 5))
#dendrogram(Z, no_labels=True)
#plt.title('Hierarchical Clustering Dendrogram')
#plt.xlabel('Data points')
#plt.ylabel('Distance')
#plt.show()

In [23]:
hclust.fit(df_norm)
score_hclust_norm= calinski_harabasz_score(df_norm, hclust.labels_)
silhouette_hclust_norm= silhouette_score(df_norm, hclust.labels_)

print(score_hclust_norm)

df_hclust_norm = df_norm.copy()
df_hclust_norm['label'] = hclust.labels_
group_hclust_norm = df_hclust_norm.groupby('label')
group_hclust_norm.mean()

7973.411040855236


Unnamed: 0_level_0,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Separate House,age_profile,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.425105,0.443861,0.649639,1.0,0.0,1.0
1,0.307401,0.484051,0.803159,1.0,1.0,1.0
2,0.093657,0.176111,0.348005,0.0,0.0,0.0
3,0.219848,0.376446,0.342432,1.0,1.0,1.0
4,0.204146,0.334003,0.560917,0.0,1.0,1.0
5,0.275112,0.286057,0.443453,1.0,0.0,0.0
6,0.066445,0.196914,0.304556,0.0,0.0,1.0
7,0.231771,0.257083,0.550529,0.0,1.0,0.0
8,0.288075,0.358621,0.576116,1.0,1.0,0.0
9,0.01881,0.192102,0.167606,1.0,0.0,1.0


In [24]:
df_final = df_sample.copy()
df_final['label'] = hclust.labels_
df_final

Unnamed: 0,household_id,SA2,HHSIZE,TOTALVEHS,INC,age_profile,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Flat or Apartment,DWELLTYPE_Other,DWELLTYPE_Separate House,DWELLTYPE_Terrace/Townhouse,OWNDWELL_Being Purchased,OWNDWELL_Being Rented,OWNDWELL_Fully Owned,OWNDWELL_Occupied Rent-Free,OWNDWELL_Something Else,label
112679,112680,206041122,2,0,2100,0,0.125,0.0,0.658659,1,0,0,0,0,1,0,0,0,2
864439,864581,203021044,1,1,350,0,0.000,0.2,0.070571,0,0,1,0,0,0,1,0,0,9
1180461,1401687,210031237,3,1,1250,1,0.250,0.2,0.381882,0,0,1,0,1,0,0,0,0,3
1307167,1592533,209011204,3,4,800,1,0.250,0.8,0.219720,0,0,1,0,1,0,0,0,0,3
1029345,1136035,213031352,4,1,4275,1,0.375,0.2,0.965966,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144549,144550,206051129,2,2,3125,0,0.125,0.4,0.850350,0,0,0,1,0,1,0,0,0,2
1577874,2014255,212041312,3,2,1700,1,0.250,0.4,0.496997,0,1,0,0,0,0,1,0,0,4
912482,1018916,213011328,3,1,2350,1,0.250,0.2,0.724725,0,0,1,0,0,1,0,0,0,8
1615541,2051960,212041460,2,0,2075,1,0.125,0.0,0.645646,0,0,1,0,0,0,1,0,0,1


In [25]:
print(f"clusters: {clusters}")
print(f"score_kmeans_bool: {score_kmeans_bool}")
print(f"score_kmeans_norm: {score_kmeans_norm}")
print(f"score_gmm_bool:    {score_gmm_bool}")
print(f"score_gmm_norm:    {score_gmm_norm}")
print(f"score_hclust_bool: {score_hclust_bool}")
print(f"score_hclust_norm: {score_hclust_norm}")
print("\n")
print(f"clusters: {clusters}")
print(f"score_kmeans_bool: {silhouette_kmeans_bool}")
print(f"score_kmeans_norm: {silhouette_kmeans_norm}")
print(f"score_gmm_bool:    {silhouette_gmm_bool}")
print(f"score_gmm_norm:    {silhouette_gmm_norm}")
print(f"score_hclust_bool: {silhouette_hclust_bool}")
print(f"score_hclust_norm: {silhouette_kmeans_norm}")

clusters: 10
score_kmeans_bool: 4837.911780975896
score_kmeans_norm: 8410.667943706467
score_gmm_bool:    4380.932258267166
score_gmm_norm:    7967.509505068448
score_hclust_bool: 4545.837257592555
score_hclust_norm: 7973.411040855236


clusters: 10
score_kmeans_bool: 0.6508559520408738
score_kmeans_norm: 0.507323481878191
score_gmm_bool:    0.6199016097083949
score_gmm_norm:    0.48607029747707337
score_hclust_bool: 0.6639211815654639
score_hclust_norm: 0.507323481878191


In [26]:
group_kmeans_bool.mean()

Unnamed: 0_level_0,veh2,highinc,house,ownner,age,hhsize
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.567568,0.086873,1.0,0.0,0.0,0.972973
1,1.0,0.0,0.90118,1.0,1.0,1.0
2,0.078989,0.0,1.0,0.882306,0.0,0.110585
3,0.0,0.038534,0.758459,1.0,1.0,1.0
4,1.0,0.0,0.884498,1.0,0.0,1.0
5,0.023622,0.0,0.0,0.0,0.0,0.0
6,0.188352,0.065675,0.0,0.0,0.374226,1.0
7,0.925405,1.0,0.945946,0.951351,0.79027,1.0
8,0.018927,0.004732,0.0,1.0,0.0,0.107256
9,0.677863,0.071756,0.728244,0.0,1.0,1.0


In [27]:
group_kmeans_bool.size()/8000*100

label
0     6.4750
1    37.0625
2    15.8250
3    13.3000
4     8.2250
5     6.3500
6    10.0875
7    11.5625
8     7.9250
9     8.1875
dtype: float64

In [28]:
group_kmeans_norm.mean()

Unnamed: 0_level_0,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Separate House,age_profile,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.275112,0.286057,0.443453,1.0,0.0,0.0
1,0.197164,0.348307,0.323819,1.0,1.0,1.0
2,0.551581,0.50512,0.72805,1.0,0.0,1.0
3,0.093657,0.176111,0.348005,0.0,0.0,0.0
4,0.046085,0.218271,0.238766,1.0,0.0,1.0
5,0.204146,0.334003,0.560917,0.0,1.0,1.0
6,0.066445,0.196914,0.304556,0.0,0.0,1.0
7,0.288075,0.358621,0.576116,1.0,1.0,0.0
8,0.231771,0.257083,0.550529,0.0,1.0,0.0
9,0.318414,0.497783,0.78501,1.0,1.0,1.0


In [29]:
group_kmeans_norm.size()/8000*100

label
0     8.3375
1    22.1500
2     8.3000
3    12.6625
4    15.3250
5     7.4625
6     8.9125
7     6.5250
8     6.0000
9    29.3250
dtype: float64

In [30]:
group_gmm_bool.mean()

Unnamed: 0_level_0,veh2,highinc,house,ownner,age,hhsize
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.0,0.0,0.884498,1.0,0.0,1.0
1,0.019017,0.0,0.0,1.0,0.0,0.103011
2,1.0,0.0,0.909316,0.917673,1.0,1.0
3,0.0,0.031068,0.390291,0.0,0.439806,1.0
4,0.0,0.139613,0.761144,0.949537,1.0,1.0
5,0.078989,0.0,1.0,0.882306,0.0,0.110585
6,0.023622,0.0,0.0,0.0,0.0,0.0
7,1.0,0.075758,0.0,0.0,0.539394,1.0
8,0.991889,1.0,0.938586,0.947856,0.771727,1.0
9,1.0,0.095238,1.0,0.0,0.0,0.952381


In [31]:
group_gmm_norm.mean()

Unnamed: 0_level_0,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Separate House,age_profile,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0,0.192828,0.218452,1.0,0.0,1.0
1,0.204038,0.368578,0.325662,1.0,1.0,1.0
2,0.093657,0.176111,0.348005,0.0,0.0,0.0
3,0.204146,0.334003,0.560917,0.0,1.0,1.0
4,0.066445,0.196914,0.304556,0.0,0.0,1.0
5,0.275112,0.286057,0.443453,1.0,0.0,0.0
6,0.288075,0.358621,0.576116,1.0,1.0,0.0
7,0.462527,0.453829,0.615912,1.0,0.0,1.0
8,0.315213,0.484549,0.791966,1.0,1.0,1.0
9,0.231771,0.257083,0.550529,0.0,1.0,0.0


In [32]:
group_hclust_bool.mean()

Unnamed: 0_level_0,veh2,highinc,house,ownner,age,hhsize
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.496644,0.0,1.0,0.134228,0.425695,1.0
1,0.815978,1.0,0.855476,0.829443,0.759425,1.0
2,0.332244,0.0,0.0,0.0,0.478214,1.0
3,0.567692,0.0,0.0,1.0,0.783077,1.0
4,0.0,0.0,0.231008,0.0,0.0,0.0
5,1.0,0.0,1.0,1.0,1.0,1.0
6,0.0,0.0,1.0,1.0,0.0,0.0
7,0.0,0.0,1.0,1.0,1.0,1.0
8,1.0,0.0,0.966667,0.963889,0.0,0.808333
9,0.0,0.0,0.0,1.0,0.0,0.0


In [33]:
group_hclust_bool.size()/8000*100

label
0    13.0375
1    13.9250
2    11.4750
3     8.1250
4     8.0625
5    33.4000
6    10.9625
7    10.0875
8     9.0000
9     6.9250
dtype: float64

In [34]:
group_hclust_norm.mean()

Unnamed: 0_level_0,HHSIZE_normalised,TOTALVEHS_normalised,INC_normalised,DWELLTYPE_Separate House,age_profile,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.425105,0.443861,0.649639,1.0,0.0,1.0
1,0.307401,0.484051,0.803159,1.0,1.0,1.0
2,0.093657,0.176111,0.348005,0.0,0.0,0.0
3,0.219848,0.376446,0.342432,1.0,1.0,1.0
4,0.204146,0.334003,0.560917,0.0,1.0,1.0
5,0.275112,0.286057,0.443453,1.0,0.0,0.0
6,0.066445,0.196914,0.304556,0.0,0.0,1.0
7,0.231771,0.257083,0.550529,0.0,1.0,0.0
8,0.288075,0.358621,0.576116,1.0,1.0,0.0
9,0.01881,0.192102,0.167606,1.0,0.0,1.0


In [35]:
df_final.to_csv('clustered.csv', index=False)