# K-Means Clustering Test on Three-Component Vectors for Each CBSA
This is for the sake of practice, testing, and visualisation. AUB-ARIMA CL output currently not considered.

In [59]:
import pandas as pd
import numpy as np

import chart_studio.plotly as py
import plotly.graph_objs as go

df = pd.read_csv('test_01.csv')
kmeans_df = df.drop(columns = ['metro_name'])

# Number of clusters
k = 3

In [61]:
from sklearn.cluster import KMeans

# Features extracted from our test .csv file
kmeans = KMeans(n_clusters = k).fit(kmeans_df[['ratio_of_public_transport', 'median_income', 'enrolled_in_school']])
kmeans_df.loc[:,'cluster'] = kmeans.labels_
kmeans_df = kmeans_df
kmeans_df


Unnamed: 0,ratio_of_public_transport,median_income,enrolled_in_school,cluster
0,1.274381,-0.524099,1.031626,0
1,1.098373,1.148940,1.561631,0
2,0.216680,0.219432,0.696153,2
3,0.872824,0.475929,1.687845,0
4,-2.025492,0.288121,-0.251108,1
...,...,...,...,...
950,0.440277,0.583197,-0.715944,2
951,0.481194,0.353888,0.954770,0
952,-0.010236,0.286517,-0.682515,2
953,0.047177,-0.504815,-0.135704,2


In [62]:
# Coordinates of centroids, initialised randomly
centroids = kmeans.cluster_centers_
centroids

array([[ 1.22800145,  0.80030249,  1.4716126 ],
       [-0.96411045, -0.86639472, -0.72459989],
       [ 0.04473414,  0.1824264 , -0.22372412]])

In [63]:
# Final centroids after iterative part of K-Means algorithm
centroids_x = [centroids[c][0] for c in range(k)]
centroids_y = [centroids[c][1] for c in range(k)]
centroids_z = [centroids[c][2] for c in range(k)]

# Plot the actual datapoints/CBSAs
trace1 = go.Scatter3d(x = kmeans_df['ratio_of_public_transport'],
                      y = kmeans_df['median_income'],
                      z = kmeans_df['enrolled_in_school'],
                      name = 'Metropolitan Area',
                      mode='markers',
                      marker=dict(
                        size=4,
                        opacity=0.35,
                        color=kmeans.labels_
                    ))

# Plot the centroids
trace2 = go.Scatter3d(x = centroids_x,
                      y = centroids_y,
                      z = centroids_z,
                      name = 'Centroid',
                      mode='markers',
                      marker=dict(
                        size=6,
                        opacity=0.5,
                        color='#000'
                    ))
trace3 = go.Mesh3d(
    x = kmeans_df['ratio_of_public_transport'], 
    y = kmeans_df['median_income'],
    z = kmeans_df['enrolled_in_school'],
    alphahull = 1,
    color='#1522b0',
    opacity = 0.05,
    
)

data = [trace1, trace2, trace3]
fig = go.Figure(data=data)

# Stylise the visual
fig.update_layout(
    title="Public Transportation to Work Ratio, Median Household Income, Education Attainment in Metro Areas in 2010",
    scene = dict(
        xaxis = dict(title = 'Pub. Transport. to Work'),
        yaxis = dict(title = 'Median Household Income'),
        zaxis = dict(title = 'Education Attainment'),
    ),
    font = dict(
        family="Courier New, monospace",
        size=10,
        color="#7f7f7f"
    )
)

py.sign_in('camille0004', 'qsZvebqUjVL7lFGtLX0R')

py.iplot(fig)

In [46]:

res_df = pd.DataFrame(zip(df.metro_name, kmeans_df.cluster), columns = ['metro_name', 'cluster']).sort_values(by = 'cluster')
res_df

Unnamed: 0,metro_name,cluster
0,"College Station-Bryan, TX Metro Area",0
459,"Kennewick-Pasco-Richland, WA Metro Area",0
453,"Kansas City, MO-KS Metro Area",0
450,"Kalamazoo-Portage, MI Metro Area",0
449,"Kahului-Wailuku, HI Micro Area",0
...,...,...
559,"Mayfield, KY Micro Area",2
558,"Mayag?ez, PR Metro Area",2
556,"Maryville, MO Micro Area",2
577,"Middlesborough, KY Micro Area",2
