In [221]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

In [222]:
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,LGA,hhsize,totalvehs,hhinc,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Flat or Apartment,dwelltype_Other,dwelltype_Separate House,dwelltype_Terrace/Townhouse,owndwell_Being Purchased,owndwell_Being Rented,owndwell_Fully Owned,owndwell_Occupied Rent-Free,owndwell_Something Else
0,Banyule,1.0,1.0,900.0,0.0,0.25,0.300801,0,1,0,0,0,0,1,0,0
1,Banyule,1.0,1.0,3625.0,0.0,0.25,0.8999,0,0,1,0,1,0,0,0,0
2,Banyule,2.0,1.0,1475.0,0.2,0.25,0.490991,0,0,1,0,0,0,1,0,0
3,Banyule,1.0,1.0,0.0,0.0,0.25,0.0,0,0,1,0,1,0,0,0,0
4,Banyule,2.0,1.0,1225.0,0.2,0.25,0.416416,0,0,1,0,0,0,1,0,0


In [223]:
df.describe().round(3)

Unnamed: 0,hhsize,totalvehs,hhinc,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Flat or Apartment,dwelltype_Other,dwelltype_Separate House,dwelltype_Terrace/Townhouse,owndwell_Being Purchased,owndwell_Being Rented,owndwell_Fully Owned,owndwell_Occupied Rent-Free,owndwell_Something Else
count,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0,1854696.0
mean,2.535,1.293,1808.029,0.307,0.323,0.5,0.139,0.052,0.706,0.103,0.321,0.262,0.404,0.008,0.005
std,1.29,0.872,1316.422,0.258,0.218,0.287,0.346,0.223,0.456,0.304,0.467,0.44,0.491,0.091,0.068
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,725.0,0.2,0.25,0.256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,1.0,1475.0,0.2,0.25,0.491,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,2.0,2500.0,0.6,0.5,0.735,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
max,6.0,4.0,6000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [224]:
sample_size = 8000
clusters = 7
my_n_init = 15
test_n_clusters = 15
df_sample = df.sample(sample_size, random_state=42)

## Normalised Data 

In [225]:
df_norm = pd.DataFrame()
df_norm = df_sample[['hhsize_normalised', 'totalvehs_normalised', 'hhinc_normalised','dwelltype_Separate House']].copy()
df_norm.loc[:, 'owndwell'] = df_sample['owndwell_Fully Owned'] | df_sample['owndwell_Being Purchased']
df_norm.head()

Unnamed: 0,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Separate House,owndwell
483687,0.4,0.5,0.838338,1,0
376706,0.0,0.25,0.224725,0,1
531206,0.2,0.25,0.416416,1,1
1759885,0.2,0.0,0.856857,0,0
1585313,0.4,0.25,0.337337,1,1


In [226]:
high_inc = df_sample['hhinc'].quantile(0.9)
df_bool = pd.DataFrame()
df_bool["veh2"] = df_sample['totalvehs'].apply(lambda x: 1 if x >= 2 else 0)
df_bool["highinc"] = df_sample['hhinc'].apply(lambda x: 1 if x >= high_inc else 0)
df_bool["house"] = df_sample['dwelltype_Separate House']
df_bool["ownner"] = (df_sample['owndwell_Fully Owned'] | df['owndwell_Being Purchased']).astype(int)
df_bool.head()

Unnamed: 0,veh2,highinc,house,ownner
483687,1,0,1,0
376706,0,0,0,1
531206,0,0,1,1
1759885,0,0,0,0
1585313,0,0,1,1


# Kmeans

In [227]:
def optimise_k_means(data, max_k, title):
    means = []
    inertias = []

    for k in range(1, max_k):
        kmeans = KMeans(n_clusters=k, n_init=my_n_init)
        kmeans.fit(data)

        means.append(k)
        inertias.append(kmeans.inertia_)

    # generate the elbow plot
    fig = plt.subplots(figsize=(10,5))
    plt.plot(means, inertias, 'o-')
    plt.title(title)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.grid(True)
    plt.show()

In [228]:
#ptimise_k_means(df_bool, test_n_clusters, "df_bool")

In [229]:
kmeans = KMeans(n_clusters=clusters, n_init=10).fit(df_bool)
score_kmeans_bool = calinski_harabasz_score(df_bool, kmeans.labels_)
silhouette_kmeans_bool = silhouette_score(df_bool, kmeans.labels_)

print(score_kmeans_bool)
print("The average silhouette score is :", silhouette_kmeans_bool)
 

df_kmeans_bool = df_bool.copy()
df_kmeans_bool['label'] = kmeans.labels_
group_kmeans_bool = df_kmeans_bool.groupby('label')
group_kmeans_bool.mean()

8326.520952281142
The average silhouette score is : 0.8223277215955563


Unnamed: 0_level_0,veh2,highinc,house,ownner
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,0.0,1.0,1.0
1,0.0,0.097245,1.0,0.0
2,1.0,0.094441,0.888829,1.0
3,0.0,0.100423,0.0,0.0
4,0.047619,1.0,0.74026,1.0
5,1.0,0.09661,0.60678,0.0
6,0.0,0.0,0.0,1.0


In [230]:
group_kmeans_bool.size()/8000*100

label
0    34.1375
1     7.7125
2    23.1625
3    11.8250
4     5.7750
5     7.3750
6    10.0125
dtype: float64

In [231]:
df_sample_clust = df_sample.copy()
df_sample_clust['label'] = kmeans.labels_
group_sample_clust = df_sample_clust.groupby('label')
group_sample_clust.mean()

  group_sample_clust.mean()


Unnamed: 0_level_0,hhsize,totalvehs,hhinc,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Flat or Apartment,dwelltype_Other,dwelltype_Separate House,dwelltype_Terrace/Townhouse,owndwell_Being Purchased,owndwell_Being Rented,owndwell_Fully Owned,owndwell_Occupied Rent-Free,owndwell_Something Else
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,2.182351,0.812889,1497.289637,0.23647,0.203222,0.449687,0.0,0.0,1.0,0.0,0.424021,0.0,0.575979,0.0,0.0
1,2.166937,0.776337,1779.04376,0.233387,0.194084,0.492236,0.0,0.0,1.0,0.0,0.0,0.944895,0.0,0.042139,0.012966
2,3.720453,2.374528,1791.572045,0.544091,0.593632,0.499444,0.026983,0.023206,0.888829,0.060982,0.446303,0.0,0.553697,0.0,0.0
3,1.667019,0.799154,1790.97463,0.133404,0.199789,0.496011,0.607822,0.167019,0.0,0.225159,0.0,0.970402,0.0,0.019027,0.010571
4,2.134199,0.876623,4621.829004,0.22684,0.219156,0.94858,0.090909,0.062771,0.74026,0.106061,0.452381,0.0,0.547619,0.0,0.0
5,3.549153,2.355932,1819.411864,0.509831,0.588983,0.502837,0.172881,0.059322,0.60678,0.161017,0.0,0.964407,0.0,0.023729,0.011864
6,1.734082,0.786517,1518.882647,0.146816,0.196629,0.456467,0.397004,0.205993,0.0,0.397004,0.473159,0.0,0.526841,0.0,0.0


In [232]:
#optimise_k_means(df_norm, test_n_clusters, "df_norm")

In [233]:
kmeans = KMeans(n_clusters=clusters, n_init=my_n_init).fit(df_norm)
score_kmeans_norm = calinski_harabasz_score(df_norm, kmeans.labels_)
silhouette_kmeans_norm = silhouette_score(df_norm, kmeans.labels_)

print(score_kmeans_norm) 
print("The average silhouette score is :", silhouette_kmeans_norm)

df_kmeans_norm = df_norm.copy()
df_kmeans_norm['label'] = kmeans.labels_
group_kmeans_norm = df_kmeans_norm.groupby('label')
group_kmeans_norm.mean()

5739.155145156629
The average silhouette score is : 0.4169418067254566


Unnamed: 0_level_0,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Separate House,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.174161,0.216241,0.764209,1.0,1.0
1,0.18438,0.272071,0.495058,0.0,0.0
2,0.604632,0.536579,0.259107,1.0,1.0
3,0.200177,0.271961,0.506431,0.0,1.0
4,0.621331,0.523495,0.754589,1.0,1.0
5,0.362872,0.344359,0.498904,1.0,0.0
6,0.171645,0.210255,0.260824,1.0,1.0


In [234]:
group_kmeans_norm.size()/8000*100

label
0    17.1250
1    14.7250
2    11.8750
3    14.0875
4    11.8375
5    12.1875
6    18.1625
dtype: float64

# GMM

In [235]:
def optimise_gmm(data, max_k):
    bics = []

    for k in range(1, max_k):
        gmm = GaussianMixture(n_components=k)
        gmm.fit(data)
        bics.append(gmm.bic(data))

    # generate the elbow plot
    fig = plt.subplots(figsize=(10,5))
    plt.plot(range(1, max_k), bics, 'o-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('BIC')
    plt.grid(True)
    plt.show()

In [236]:
#optimise_gmm(df_bool, test_n_clusters)

In [237]:
gmm = GaussianMixture(n_components=clusters).fit(df_bool)
score_gmm_bool= calinski_harabasz_score(df_bool, gmm.predict(df_bool))
silhouette_gmm_bool = silhouette_score(df_bool, gmm.predict(df_bool))

print(score_gmm_bool) 
print("The average silhouette score is :", silhouette_gmm_bool)

df_gmm_bool = df_bool.copy()
df_gmm_bool['label'] = gmm.predict(df_bool)
group_gmm_bool = df_gmm_bool.groupby('label')
group_gmm_bool.mean()

6398.465043509873
The average silhouette score is : 0.7829117462103345


Unnamed: 0_level_0,veh2,highinc,house,ownner
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,1.0,0.237154,0.387352
1,0.0,0.111292,1.0,1.0
2,0.0,0.0,0.0,1.0
3,1.0,0.105067,0.8784,1.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0
6,1.0,0.09661,0.60678,0.0


In [238]:
group_gmm_bool.size()/8000*100

label
0     3.1625
1    38.4125
2    10.0125
3    23.4375
4    10.6375
5     6.9625
6     7.3750
dtype: float64

In [239]:
#optimise_gmm(df_norm, test_n_clusters)

In [240]:
gmm = GaussianMixture(n_components=clusters).fit(df_norm)
score_gmm_norm= calinski_harabasz_score(df_norm, gmm.predict(df_norm))
silhouette_gmm_norm = silhouette_score(df_norm, gmm.predict(df_norm))

print(score_gmm_norm)

df_gmm_norm = df_norm.copy()
df_gmm_norm['label'] = gmm.predict(df_norm)
group_gmm_norm = df_gmm_norm.groupby('label')
group_gmm_norm.mean()

5545.42169744498


Unnamed: 0_level_0,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Separate House,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.585313,0.480508,0.748289,1.0,1.0
1,0.18438,0.272071,0.495058,0.0,0.0
2,0.200177,0.271961,0.506431,0.0,1.0
3,0.549316,0.476468,0.26446,1.0,1.0
4,0.145471,0.209543,0.787975,1.0,1.0
5,0.362872,0.344359,0.498904,1.0,0.0
6,0.12692,0.199475,0.272172,1.0,1.0


In [241]:
group_gmm_norm.size()/8000*100

label
0    13.7875
1    14.7250
2    14.0875
3    15.5375
4    14.2125
5    12.1875
6    15.4625
dtype: float64

# Hierarchical 

In [242]:
from sklearn.cluster import AgglomerativeClustering
import scipy 
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import cophenet
import sklearn.metrics as sm

In [243]:
hclust = AgglomerativeClustering(n_clusters=clusters, linkage='ward', metric='euclidean')

In [244]:
#Z=linkage(df_bool, 'ward')
#plt.figure(figsize=(10, 5))
#dendrogram(Z, no_labels=True)
#plt.title('Hierarchical Clustering Dendrogram')
#plt.xlabel('Data points')
#plt.ylabel('Distance')
#plt.show()

In [245]:
hclust.fit(df_bool)
score_hclust_bool = calinski_harabasz_score(df_bool, hclust.labels_)
silhouette_hclust_bool = silhouette_score(df_bool, hclust.labels_)

print(score_hclust_bool)

df_hclust_bool = df_bool.copy()
df_hclust_bool['label'] = hclust.labels_
group_hclust_bool = df_hclust_bool.groupby('label')
group_hclust_bool.mean()

7934.880755713281


Unnamed: 0_level_0,veh2,highinc,house,ownner
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.367045,0.0,1.0,0.0
1,0.299176,1.0,0.720848,0.750294
2,1.0,0.0,1.0,1.0
3,0.0,0.0,1.0,1.0
4,1.0,0.0,0.0,0.495192
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0


In [246]:
group_hclust_bool.size()/8000*100

label
0    11.0000
1    10.6125
2    18.4000
3    34.1375
4     5.2000
5    10.6375
6    10.0125
dtype: float64

In [247]:
#Z=linkage(df_norm, 'ward')
#plt.figure(figsize=(10, 5))
#dendrogram(Z, no_labels=True)
#plt.title('Hierarchical Clustering Dendrogram')
#plt.xlabel('Data points')
#plt.ylabel('Distance')
#plt.show()

In [248]:
hclust.fit(df_norm)
score_hclust_norm= calinski_harabasz_score(df_norm, hclust.labels_)
silhouette_hclust_norm= silhouette_score(df_norm, hclust.labels_)

print(score_hclust_norm)

df_hclust_norm = df_norm.copy()
df_hclust_norm['label'] = hclust.labels_
group_hclust_norm = df_hclust_norm.groupby('label')
group_hclust_norm.mean()

5306.116216144471


Unnamed: 0_level_0,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Separate House,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.362872,0.344359,0.498904,1.0,0.0
1,0.18438,0.272071,0.495058,0.0,0.0
2,0.200177,0.271961,0.506431,0.0,1.0
3,0.51977,0.451653,0.702163,1.0,1.0
4,0.614881,0.512153,0.25258,1.0,1.0
5,0.130887,0.200967,0.317147,1.0,1.0
6,0.13601,0.196472,0.827223,1.0,1.0


In [249]:
df_final = df_sample.copy()
df_final['label'] = hclust.labels_
df_final

Unnamed: 0,LGA,hhsize,totalvehs,hhinc,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Flat or Apartment,dwelltype_Other,dwelltype_Separate House,dwelltype_Terrace/Townhouse,owndwell_Being Purchased,owndwell_Being Rented,owndwell_Fully Owned,owndwell_Occupied Rent-Free,owndwell_Something Else,label
483687,GlenEira,3.0,2.0,3125.0,0.4,0.50,0.838338,0,0,1,0,0,1,0,0,0,0
376706,Darebin,1.0,1.0,700.0,0.0,0.25,0.224725,0,1,0,0,0,0,1,0,0,2
531206,GlenEira,2.0,1.0,1225.0,0.2,0.25,0.416416,0,0,1,0,0,0,1,0,0,5
1759885,Yarra,2.0,0.0,3250.0,0.2,0.00,0.856857,0,0,0,1,0,1,0,0,0,1
1585313,Whitehorse,3.0,1.0,1000.0,0.4,0.25,0.337337,0,0,1,0,0,0,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390403,Darebin,2.0,0.0,2250.0,0.2,0.00,0.673674,0,0,1,0,1,0,0,0,0,6
473370,Frankston,2.0,1.0,725.0,0.2,0.25,0.255756,0,0,1,0,1,0,0,0,0,5
196461,Brimbank,3.0,1.0,4125.0,0.4,0.25,0.928929,0,0,1,0,0,1,0,0,0,0
1399256,MorningtonPeninsula,3.0,2.0,250.0,0.4,0.50,0.056557,0,0,1,0,0,1,0,0,0,0


In [250]:
group_hclust_norm.size()/8000*100

label
0    12.1875
1    14.7250
2    14.0875
3    17.3875
4    12.6000
5    18.7375
6    10.2750
dtype: float64

In [251]:
print(f"clusters: {clusters}")
print(f"score_kmeans_bool: {score_kmeans_bool}")
print(f"score_kmeans_norm: {score_kmeans_norm}")
print(f"score_gmm_bool:    {score_gmm_bool}")
print(f"score_gmm_norm:    {score_gmm_norm}")
print(f"score_hclust_bool: {score_hclust_bool}")
print(f"score_hclust_norm: {score_hclust_norm}")
print("\n")
print(f"clusters: {clusters}")
print(f"score_kmeans_bool: {silhouette_kmeans_bool}")
print(f"score_kmeans_norm: {silhouette_kmeans_norm}")
print(f"score_gmm_bool:    {silhouette_gmm_bool}")
print(f"score_gmm_norm:    {silhouette_gmm_norm}")
print(f"score_hclust_bool: {silhouette_hclust_bool}")
print(f"score_hclust_norm: {silhouette_kmeans_norm}")

clusters: 7
score_kmeans_bool: 8326.520952281142
score_kmeans_norm: 5739.155145156629
score_gmm_bool:    6398.465043509873
score_gmm_norm:    5545.42169744498
score_hclust_bool: 7934.880755713281
score_hclust_norm: 5306.116216144471


clusters: 7
score_kmeans_bool: 0.8223277215955563
score_kmeans_norm: 0.4169418067254566
score_gmm_bool:    0.7829117462103345
score_gmm_norm:    0.3985818872485214
score_hclust_bool: 0.8276894100583843
score_hclust_norm: 0.4169418067254566


In [252]:
group_kmeans_bool.mean()

Unnamed: 0_level_0,veh2,highinc,house,ownner
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,0.0,1.0,1.0
1,0.0,0.097245,1.0,0.0
2,1.0,0.094441,0.888829,1.0
3,0.0,0.100423,0.0,0.0
4,0.047619,1.0,0.74026,1.0
5,1.0,0.09661,0.60678,0.0
6,0.0,0.0,0.0,1.0


In [253]:
group_kmeans_bool.size()/8000*100

label
0    34.1375
1     7.7125
2    23.1625
3    11.8250
4     5.7750
5     7.3750
6    10.0125
dtype: float64

In [254]:
group_kmeans_norm.mean()

Unnamed: 0_level_0,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Separate House,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.174161,0.216241,0.764209,1.0,1.0
1,0.18438,0.272071,0.495058,0.0,0.0
2,0.604632,0.536579,0.259107,1.0,1.0
3,0.200177,0.271961,0.506431,0.0,1.0
4,0.621331,0.523495,0.754589,1.0,1.0
5,0.362872,0.344359,0.498904,1.0,0.0
6,0.171645,0.210255,0.260824,1.0,1.0


In [261]:
group_kmeans_norm.size()/8000*100

label
0    17.1250
1    14.7250
2    11.8750
3    14.0875
4    11.8375
5    12.1875
6    18.1625
dtype: float64

In [255]:
group_gmm_bool.mean()

Unnamed: 0_level_0,veh2,highinc,house,ownner
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,1.0,0.237154,0.387352
1,0.0,0.111292,1.0,1.0
2,0.0,0.0,0.0,1.0
3,1.0,0.105067,0.8784,1.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0
6,1.0,0.09661,0.60678,0.0


In [256]:
group_gmm_norm.mean()

Unnamed: 0_level_0,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Separate House,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.585313,0.480508,0.748289,1.0,1.0
1,0.18438,0.272071,0.495058,0.0,0.0
2,0.200177,0.271961,0.506431,0.0,1.0
3,0.549316,0.476468,0.26446,1.0,1.0
4,0.145471,0.209543,0.787975,1.0,1.0
5,0.362872,0.344359,0.498904,1.0,0.0
6,0.12692,0.199475,0.272172,1.0,1.0


In [257]:
group_hclust_bool.mean()

Unnamed: 0_level_0,veh2,highinc,house,ownner
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.367045,0.0,1.0,0.0
1,0.299176,1.0,0.720848,0.750294
2,1.0,0.0,1.0,1.0
3,0.0,0.0,1.0,1.0
4,1.0,0.0,0.0,0.495192
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0


In [258]:
group_hclust_bool.size()/8000*100

label
0    11.0000
1    10.6125
2    18.4000
3    34.1375
4     5.2000
5    10.6375
6    10.0125
dtype: float64

In [259]:
group_hclust_norm.mean()

Unnamed: 0_level_0,hhsize_normalised,totalvehs_normalised,hhinc_normalised,dwelltype_Separate House,owndwell
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.362872,0.344359,0.498904,1.0,0.0
1,0.18438,0.272071,0.495058,0.0,0.0
2,0.200177,0.271961,0.506431,0.0,1.0
3,0.51977,0.451653,0.702163,1.0,1.0
4,0.614881,0.512153,0.25258,1.0,1.0
5,0.130887,0.200967,0.317147,1.0,1.0
6,0.13601,0.196472,0.827223,1.0,1.0


In [260]:
df_final.to_csv('clustered.csv', index=False)