# K-Means Clustering Test on Three-Component Vectors for Each CBSA
This is for the sake of practice, testing, and visualisation. AUB-ARIMA CL output currently not considered.

In [27]:
import pandas as pd
import numpy as np

import chart_studio.plotly as py
import plotly.graph_objs as go

df = pd.read_csv('test_01.csv')
kmeans_df = df.drop(columns = ['metro_name'])

# Number of clusters
k = 3

In [28]:
from sklearn.cluster import KMeans

# Features extracted from our test .csv file
kmeans = KMeans(n_clusters = k).fit(kmeans_df[['ratio_of_public_transport', 'median_income', 'enrolled_in_school']])
kmeans_df.loc[:,'cluster'] = kmeans.labels_
kmeans_df = kmeans_df
kmeans_df


Unnamed: 0.1,Unnamed: 0,ratio_of_public_transport,median_income,pctg_below_poverty_line,enrolled_in_school,cluster
0,0,-0.355924,-0.622420,0.000000,0.035869,1
1,1,-0.362291,1.245645,0.000000,0.346162,1
2,2,0.248173,0.120348,0.013290,-0.077303,1
3,3,-0.332337,0.407350,0.139864,0.455581,1
4,4,-0.520086,0.195562,0.000000,-0.228167,1
...,...,...,...,...,...,...
950,950,0.002138,0.532464,-1.252450,-0.257513,1
951,951,-0.186601,0.268693,-0.002531,0.005607,1
952,952,-0.016467,0.193791,0.345547,-0.255924,1
953,953,0.635026,-0.604710,0.472121,-0.217837,1


In [23]:
# Coordinates of centroids, initialised randomly
centroids = kmeans.cluster_centers_
centroids

array([[-0.14509145, -0.03403589, -0.10840268],
       [ 4.06426632,  0.13851505,  0.17297364],
       [-0.16818945,  1.7533035 ,  6.17420641]])

In [18]:
# Final centroids after iterative part of K-Means algorithm
centroids_x = [centroids[c][0] for c in range(k)]
centroids_y = [centroids[c][1] for c in range(k)]
centroids_z = [centroids[c][2] for c in range(k)]

# Plot the actual datapoints/CBSAs
trace1 = go.Scatter3d(x = kmeans_df['ratio_of_public_transport'],
                      y = kmeans_df['median_income'],
                      z = kmeans_df['enrolled_in_school'],
                      name = 'Metropolitan Area',
                      mode='markers',
                      marker=dict(
                        size=4,
                        opacity=0.35,
                        color=kmeans.labels_
                    ))

# Plot the centroids
trace2 = go.Scatter3d(x = centroids_x,
                      y = centroids_y,
                      z = centroids_z,
                      name = 'Centroid',
                      mode='markers',
                      marker=dict(
                        size=6,
                        opacity=0.5,
                        color='#000'
                    ))
trace3 = go.Mesh3d(
    x = kmeans_df['ratio_of_public_transport'], 
    y = kmeans_df['median_income'],
    z = kmeans_df['enrolled_in_school'],
    alphahull = 1,
    color='#1522b0',
    opacity = 0.05,
    
)

data = [trace1, trace2, trace3]
fig = go.Figure(data=data)

# Stylise the visual
fig.update_layout(
    title="Public Transportation to Work Ratio, Median Household Income, Education Attainment in Metro Areas in 2010",
    scene = dict(
        xaxis = dict(title = 'Pub. Transport. to Work'),
        yaxis = dict(title = 'Median Household Income'),
        zaxis = dict(title = 'Education Attainment'),
    ),
    font = dict(
        family="Courier New, monospace",
        size=10,
        color="#7f7f7f"
    )
)

py.sign_in('camille0004', 'qsZvebqUjVL7lFGtLX0R')

py.iplot(fig)

In [10]:

res_df = pd.DataFrame(zip(df.metro_name, kmeans_df.cluster), columns = ['metro_name', 'cluster']).sort_values(by = 'cluster')
res_df

Unnamed: 0,metro_name,cluster
0,"College Station-Bryan, TX Metro Area",0
592,"Monroe, MI Metro Area",0
593,"Monroe, WI Micro Area",0
594,"Montgomery, AL Metro Area",0
595,"Montrose, CO Micro Area",0
...,...,...
789,"San Luis Obispo-Paso Robles, CA Metro Area",2
790,"Santa Barbara-Santa Maria-Goleta, CA Metro Area",2
791,"Santa Cruz-Watsonville, CA Metro Area",2
321,"Fort Collins-Loveland, CO Metro Area",2
