## Exploring Correlations of Top Features for CA Clustered Counties

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from shapely import wkt

from geopandas import GeoDataFrame
import geopandas as gpd
import pygeos
import gpdvega 
import altair as alt

In [None]:
# Cluster assignments (presumably the output of the unsupervised-learning
# step, judging by the file path).
cluster_df = pd.read_csv('/work/output/ca_unsupervised_min_max.csv')

# County boundary polygons from the TIGER 2016 shapefile. COUNTYFP is cast to
# a numeric dtype; it is the key used when the boundaries are merged with
# cluster_df (presumably the CSV stores that key as a number -- confirm).
counties = gpd.read_file(
    '/work/ca-county-boundaries/CA_Counties/CA_Counties_TIGER2016.shp'
)
counties['COUNTYFP'] = pd.to_numeric(counties['COUNTYFP'])

# Full California county dataset, narrowed to the 2018 rows as it is read.
ca_df = (
    pd.read_csv('/work/cleaned-csvs/ca_counties_full_dataset.csv')
    .loc[lambda frame: frame['year'] == 2018]
)


In [None]:
# Sanity check: how many rows the cluster table holds.
print(cluster_df.shape[0])

# Disabled step, kept for reference: dropping the shapefile attribute columns
# from cluster_df.
# NOTE(review): if the CSV still carries these columns, merging it with
# `counties` will suffix the overlapping names with _x/_y -- confirm.
# cluster_df.drop(columns= ['COUNTYNS', 'GEOID', 'NAME', 'NAMELSAD', 'LSAD',
#        'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP', 'FUNCSTAT', 'ALAND',
#        'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry', ],inplace = True)

# Final expression of the cell, so the notebook displays the column index.
cluster_df.columns



In [None]:
# Attach the cluster columns to the county boundaries. `counties` is the left
# table of a left join, so every boundary row is kept even when no cluster
# row shares its COUNTYFP.
# NOTE: this rebinds `cluster_df`, so re-running the cell merges `counties`
# with the already-merged result.
cluster_df = pd.merge(
    left=counties,
    right=cluster_df,
    on='COUNTYFP',
    how='left',
)


In [None]:
# Sanity check: row count of the 2018 California data, then its columns
# (the final expression is what the notebook displays).
print(ca_df.shape[0])
ca_df.columns

In [None]:
# Join the cluster table to the California features on the shared
# 'county_name' column. The join is inner, so only counties present in both
# tables are kept; the printed row count shows how many matched.
df = cluster_df.merge(ca_df, how='inner', on='county_name')
print(len(df))

# Names of the columns whose label contains 'cluster'. The list is read
# straight off the filtered frame's column index; the previous version
# re-selected the same columns into a second DataFrame only to list them.
df.filter(regex='cluster').columns.tolist()


In [None]:
# Row count per kmeans_pca_cluster label in the combined frame.
df.loc[:, 'kmeans_pca_cluster'].value_counts()

In [None]:
# Restrict the analysis to the top features found in supervised learning.
# Top 10 (population counted only once).
top_fields = [
    'democrat_pct',
    'per_capita_retirement_and_other',
    'per_capita_farm_proprieter_jobs',
    'total_population',
    'perc_white',
    'perc_other_race',
    'perc_hispanic',
    'perc_owner',
    'perc_renter',
    'area_water',
]

# Alternative smaller set -- top 5 (population counted only once):
# top_fields = ['area_water', 'total_population',
#               'perc_white', 'perc_owner', 'perc_renter']

# One Kendall rank-correlation matrix of the top features per
# kmeans_pca_cluster label (0, 1, 2), each computed on that cluster's rows.
df_cluster_0, df_cluster_1, df_cluster_2 = (
    df.loc[df['kmeans_pca_cluster'] == cluster_id, top_fields].corr(method='kendall')
    for cluster_id in (0, 1, 2)
)

### K-Means Clustering with 3 Clusters after PCA

In [None]:
# Display the columns of cluster_df, the table passed to the Altair chart
# in the next cell.
cluster_df.columns

In [None]:
# Map of the county geometries in cluster_df: each shape is coloured by its
# kmeans_pca_cluster label (treated as nominal) and shows NAME on hover.
kmeans_pca_chart = (
    alt.Chart(cluster_df)
    .mark_geoshape()
    .encode(
        tooltip=alt.Tooltip('NAME'),
        color=alt.Color('kmeans_pca_cluster:N'),
    )
    .properties(title='K-Means With 3 Clusters on PCA Results - 2018 Data')
)

# Final expression of the cell, so the notebook renders the chart.
kmeans_pca_chart

### Correlations for Cluster 0

In [None]:
# Annotated heatmap of the cluster-0 correlation matrix, drawn on the axes of
# a new 12x5-inch figure.
sns.heatmap(
    data=df_cluster_0,
    annot=True,
    ax=plt.figure(figsize=(12, 5)).gca(),
)

### Correlations for Cluster 1

In [None]:
# Annotated heatmap of the cluster-1 correlation matrix, drawn on the axes of
# a new 12x5-inch figure.
sns.heatmap(
    data=df_cluster_1,
    annot=True,
    ax=plt.figure(figsize=(12, 5)).gca(),
)

### Correlations for Cluster 2

In [None]:
# Annotated heatmap of the cluster-2 correlation matrix, drawn on the axes of
# a new 12x5-inch figure.
sns.heatmap(
    data=df_cluster_2,
    annot=True,
    ax=plt.figure(figsize=(12, 5)).gca(),
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f6c76417-5fde-42f3-8920-755838dec3fa' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>