In [None]:
import numpy as np, pandas as pd

from sas7bdat import SAS7BDAT
from joe import *

import bokeh
# Download the bokeh sample datasets; the us_counties import on the next line reads from them.
# NOTE(review): this call runs every time the cell is executed — presumably cheap once the
# files are already on disk, but confirm.
bokeh.sampledata.download()
from bokeh.sampledata.us_counties import data as counties

from sklearn.decomposition import PCA
from sklearn import cluster

In [None]:
# One row per entry of the bokeh `counties` dictionary (dictionary keys become the index),
# with the state abbreviation normalised to upper case.
counties_df = pd.DataFrame.from_dict(counties, orient='index')
counties_df['state'] = counties_df['state'].map(lambda abbrev: abbrev.upper())

In [None]:
# Second, independent county table built from the same bokeh `counties` dictionary;
# it is used for the multi-year (2010-2014) experiment.
countiesplus_df = pd.DataFrame.from_dict(counties, orient='index')
countiesplus_df['state'] = countiesplus_df['state'].map(lambda abbrev: abbrev.upper())

In [None]:
# Read the SAS dataset into a DataFrame; the reader is released when the block exits.
with SAS7BDAT('ace.sas7bdat') as sas_reader:
    df = sas_reader.to_data_frame()

In [None]:
# To understand the different columns and their meanings, examine the following output:
# it prints the row labelled 28 as a Series (column name -> value).
print(df.loc[28])

In [None]:
# Distinct first characters that occur among the column names of df.
{column_name[0] for column_name in df.columns}

In [None]:
# To replicate the experiments of Kevin, collect only poverty, uninsured and unemployment rates for 2014,
# together with the three identifying columns (renamed to state / name / zip).
#
# The columns are renamed by name rather than by position, so the mapping stays correct even if
# the selection list is reordered. `rename` returns a new DataFrame, so `data` is independent of
# `df` and columns can be added to it later without a SettingWithCopyWarning.
data = df[['f12424', 'f00010', 'f00002', 'ppov2014', 'punins2014', 'punemp2014']].rename(
    columns={'f12424': 'state', 'f00010': 'name', 'f00002': 'zip'}
)

In [None]:
# Spectral clustering into 9 groups, with labels assigned by k-means.
# random_state is pinned so that re-running the notebook yields the same cluster labels;
# without it the randomly initialised k-means step can relabel or regroup counties between runs.
# NOTE(review): n_neighbors is only used when affinity='nearest_neighbors'; with the default
# affinity it is ignored — confirm which affinity was intended.
# Alternative label assignment (kept for reference):
#spectral = cluster.SpectralClustering(n_clusters=9, eigen_solver='arpack', assign_labels="discretize")
spectral = cluster.SpectralClustering(n_clusters=9, eigen_solver='arpack',
                                      assign_labels="kmeans", n_neighbors=10,
                                      random_state=0)

In [None]:
# Fit the clustering on the three 2014 indicators (passed as a plain array).
features_2014 = ['ppov2014', 'punins2014', 'punemp2014']
spectral.fit(data[features_2014].values)

In [None]:
# Attach the fitted cluster label of each row as an integer column.
# `assign` returns a new frame instead of writing into `data` in place: `data` was taken as a
# column subset of `df`, and an in-place column write on such a subset raises
# SettingWithCopyWarning. Re-running this cell simply overwrites the column.
data = data.assign(cluster=spectral.labels_.astype(int))

In [None]:
# To examine the structure of counties_df, uncomment the following
# counties_df.loc[1,57]

In [None]:
# Build the 5-character join key: every index entry of counties_df is formatted as a
# zero-padded 2-digit number followed by a zero-padded 3-digit number
# (presumably state and county codes — TODO confirm against the bokeh sample data).
#
# The column is assigned in one step. The previous per-row chained assignment
# (counties_df['zip'][index] = ...) raises SettingWithCopyWarning and, with pandas
# Copy-on-Write enabled, does not write through at all, which would leave every key empty
# and the merge below without matches.
counties_df['zip'] = ["%02i%03i" % index for index in counties_df.index]

# Left-join the 2014 indicators and cluster labels onto the county table, then drop every
# row that has a missing value in any column (this includes counties without a match).
#counties_plus = pd.merge(counties_df, data, how="left", on="zip").dropna()
counties_df = pd.merge(
    counties_df,
    data[['zip', 'ppov2014', 'punins2014', 'punemp2014', 'cluster']],
    how="left",
    on="zip",
).dropna()

In [None]:
# Project the three 2014 indicators onto their first two principal components.
pca_columns = ['ppov2014', 'punins2014', 'punemp2014']
pca_input = counties_df[pca_columns].values

pca = PCA(n_components=2)
pca.fit(pca_input)
projected_data = pca.transform(pca_input)

In [None]:
# Store each principal-component coordinate in its own column.
for component, column in enumerate(('projected_x', 'projected_y')):
    counties_df[column] = projected_data[:, component]

In [None]:
# Persist the county table (indicators, cluster labels and PCA projection) to disk.
# NOTE(review): the file name is spelled "recesion"; it is left as-is because other code
# may load the pickle by this exact name — confirm before renaming.
counties_df.to_pickle("ruralrecesion.pkl")

In [None]:
# Repeat the experiment with the same three indicators, now for every year from 2010 through 2014.
# Rows with a missing value in any selected column are dropped, and the three identifying
# columns are renamed to state / name / zip.

dataplus = (
    df[['f12424', 'f00010', 'f00002',
        'ppov2010', 'ppov2011', 'ppov2012', 'ppov2013', 'ppov2014',
        'punins2010', 'punins2011', 'punins2012', 'punins2013', 'punins2014',
        'punemp2010', 'punemp2011', 'punemp2012', 'punemp2013', 'punemp2014']]
    .dropna()
    .rename(columns={'f12424': 'state', 'f00010': 'name', 'f00002': 'zip'})
)

In [None]:
# Spectral clustering into 9 groups on all fifteen yearly indicators (2010-2014).
# random_state is pinned so that re-running the notebook yields the same cluster labels;
# without it the randomly initialised k-means step can relabel or regroup counties between runs.
# NOTE(review): n_neighbors is only used when affinity='nearest_neighbors'; with the default
# affinity it is ignored — confirm which affinity was intended.
plus_features = ['ppov2010', 'ppov2011', 'ppov2012', 'ppov2013', 'ppov2014',
                 'punins2010', 'punins2011', 'punins2012', 'punins2013', 'punins2014',
                 'punemp2010', 'punemp2011', 'punemp2012', 'punemp2013', 'punemp2014']
spectral = cluster.SpectralClustering(n_clusters=9, eigen_solver='arpack',
                                      assign_labels="kmeans", n_neighbors=10,
                                      random_state=0)
spectral.fit(dataplus[plus_features].values)

In [None]:
# Attach the fitted cluster label of each row as an integer column.
# `assign` returns a new frame instead of writing into `dataplus` in place: `dataplus` is
# derived from a column subset of `df`, and an in-place column write on such a frame can
# raise SettingWithCopyWarning. Re-running this cell simply overwrites the column.
dataplus = dataplus.assign(cluster=spectral.labels_.astype(int))

In [None]:
# Build the 5-character join key: every index entry of countiesplus_df is formatted as a
# zero-padded 2-digit number followed by a zero-padded 3-digit number
# (presumably state and county codes — TODO confirm against the bokeh sample data).
#
# The column is assigned in one step. The previous per-row chained assignment
# (countiesplus_df['zip'][index] = ...) raises SettingWithCopyWarning and, with pandas
# Copy-on-Write enabled, does not write through at all, which would leave every key empty
# and the merge below without matches.
countiesplus_df['zip'] = ["%02i%03i" % index for index in countiesplus_df.index]

# Left-join the 2010-2014 indicators and cluster labels onto the county table, then drop
# every row that has a missing value in any column (this includes counties without a match).
countiesplus_df = pd.merge(
    countiesplus_df,
    dataplus[['zip',
              'ppov2010', 'ppov2011', 'ppov2012', 'ppov2013', 'ppov2014',
              'punins2010', 'punins2011', 'punins2012', 'punins2013', 'punins2014',
              'punemp2010', 'punemp2011', 'punemp2012', 'punemp2013', 'punemp2014',
              'cluster']],
    how="left",
    on="zip",
).dropna()

In [None]:
# Project the fifteen yearly indicators onto their first three principal components.
pca_columns = ['ppov2010', 'ppov2011', 'ppov2012', 'ppov2013', 'ppov2014',
               'punins2010', 'punins2011', 'punins2012', 'punins2013', 'punins2014',
               'punemp2010', 'punemp2011', 'punemp2012', 'punemp2013', 'punemp2014']
pca_input = countiesplus_df[pca_columns].values

pca = PCA(n_components=3)
pca.fit(pca_input)
projected_data = pca.transform(pca_input)

In [None]:
# Store each principal-component coordinate in its own column.
for component, column in enumerate(('projected_x', 'projected_y', 'projected_z')):
    countiesplus_df[column] = projected_data[:, component]

In [None]:
# Persist the multi-year county table (indicators, cluster labels and PCA projection) to disk.
# NOTE(review): the file name is spelled "recesion"; it is left as-is because other code
# may load the pickle by this exact name — confirm before renaming.
countiesplus_df.to_pickle("ruralrecesionplus.pkl")

In [None]:
#  The next cells could be used to perform a ranking of counties, by restricting the dimension of the clustering to one

In [None]:
# Run fast_sort_frame on the three 2014 indicators.
# NOTE(review): fast_sort_frame is not defined in this notebook (presumably it comes from the
# star-import of `joe`); the meaning of V and result should be confirmed there.
ranking_columns = ['ppov2014', 'punins2014', 'punemp2014']
V, result = fast_sort_frame(data[ranking_columns])

# Conform `result` to the integer labels 0 .. len(result)-1, in that order.
result2 = result.reindex(range(len(result)))

#trace = Scatter3d(x=result.ppov2014,
#                  y=result.punins2014,
#                  z=result.punemp2014,
#                  text=(data.name+", "+data.state).loc[result.index],
#                  marker=dict(size=3, color=255*(result2.index)/len(result2),
#                              line=dict(color='rgba(217, 217, 217, 0.14)',width=0.5)))
#
#layout = Layout(dict(showlegend=False))

#fig = Figure(data=[trace], layout=layout)
#iplot(fig, show_link=False)