In [1]:
import math
import os

import git
import numpy as np
import pandas as pd
import sklearn
from sklearn.cluster import KMeans
from sklearn.cluster import _kmeans

In [2]:
# Open the git repository that contains the current directory (parent
# directories are searched) and use its working tree as the project root.
repo = git.Repo("./", search_parent_directories=True)
homedir = repo.working_dir
# Folder with the county-level input files. The trailing slash is kept
# because the loading cell builds file paths by plain string concatenation.
datadir = f"{homedir}/models/processing/USA/County_Based/"

In [3]:
#helper functions
def logFunc(x):
    """Return the base-10 logarithm of ``x``, clamped from below at 0.01.

    Any value smaller than 0.01 (zero and negatives included) is replaced by
    0.01 before the logarithm is taken, so ``math.log10`` never receives a
    non-positive argument and the result is never below -2.
    """
    floor = 0.01
    clamped = floor if x < floor else x
    return math.log10(clamped)

#def fillTerritories():
    

In [4]:
# Load each input table from datadir into its own DataFrame.
def _load_table(filename):
    """Read one file located in ``datadir`` with the pandas CSV reader."""
    return pd.read_csv(datadir + filename)


Age_Race = _load_table('Age_Race_Filled.csv')
# NOTE(review): unlike the other inputs this name has no ".csv" suffix --
# confirm that the file on disk is really called "Total_Pop".
Population = _load_table('Total_Pop')
Density = _load_table('Density.csv')
JHU = _load_table('aggregate_jhu_filled.csv')
Berkeley = _load_table('Aggregate_Berkeley.csv')
Policies = _load_table('Policy_Transit.csv')
Geography = _load_table('County_Centers.csv')

# Combined feature table; the cells below add one scaled column at a time.
Data = pd.DataFrame()

# Per-topic frames, created empty here and reassigned by the later cells.
DataBasic, DataDemographics, DataHealth, DataGeography = (
    pd.DataFrame(),
    pd.DataFrame(),
    pd.DataFrame(),
    pd.DataFrame(),
)

In [5]:
# Give the population table fixed column names so the following cells can
# refer to Population['Population'] whatever headers the input file carried.
Population.columns = ['fips', 'Population']

# Start the feature table with the 'fips' column of the age/race table; this
# also gives Data its row index.
Data['fips'] = Age_Race['fips']

In [6]:
# Nature of the county, including policies.
# Every raw column is divided by a constant so the features end up on roughly
# comparable scales; the trailing notes are the author's observed ranges
# after scaling.
# NOTE(review): columns from different tables are matched by row index, not
# by FIPS code -- presumably all inputs share the same row order; confirm.

Data['Pop'] = Population['Population'].div(2000000.0)  # population max is 5

density_source = '2010 Density per square mile of land area - Population'
Data['Density'] = Density[density_source].div(10000.0)  # density max is 7

Data['UrbanRural'] = JHU['Rural-urban_Continuum Code_2013'].div(5.0)  # urban rural max is 2

Data['EconType'] = JHU['Economic_typology_2015'].div(5.0)  # economic typology max is 1

# Policies
Data['Policies'] = Policies['Score'].div(3)  # policies max is just above 3

# Typical immigration in/out. Proxy for being a sink/source in flows
Data['Movement'] = JHU['R_NET_MIG_2018'].div(20.0)  # range around -3.5 to 3.5

# Frame holding only the columns created in this cell.
basic_columns = ['Pop', 'Density', 'UrbanRural', 'EconType', 'Policies', 'Movement']
DataBasic = Data[basic_columns]

In [7]:
# Demographics of the county.
# Counts are turned into shares of the county population and then multiplied
# by a constant so the features end up on roughly comparable scales; the
# trailing notes are the author's observed ranges after scaling.
# NOTE(review): columns from different tables are matched by row index, not
# by FIPS code -- presumably all inputs share the same row order; confirm.

county_population = Population['Population']

# Age distribution: share of residents aged 65 or older.
seniors = (
    Age_Race['65 to 74 years']
    + Age_Race['75 to 84 years']
    + Age_Race['85 years and over']
)
Data['65+'] = (seniors / county_population).mul(20)  # 65+ max is 3, generally around 1.5

# Race/gender
Data['Male'] = (Age_Race['Male'] / county_population).mul(3)  # male max is 2, generally 1

# NOTE(review): this adds the Puerto Rican Hispanic/Latino count to the
# Black/African American count, so the column is broader than its name
# suggests -- confirm that this is intended.
black_or_puerto_rican = (
    Age_Race['Exclusively Black or African American']
    + Age_Race['Hispanic or Latino (of any race)!!Puerto Rican']
)
Data['AfricanAmer'] = (black_or_puerto_rican / county_population).mul(5)  # African American max is 5, generally 0-3

# Politics/education/income/economy
Data['CollegePlus'] = JHU['Percent of adults completing some college or associate\'s degree 2014-18'].mul(3)  # education max is 3, generally around 1-1.5

Data['Income'] = JHU['Median_Household_Income_2018'].div(25000)  # income max is 4, generally around 2

Data['Unemployed'] = JHU['Unemployment_rate_2018'].div(2)  # unemployed max around 3

# need puerto rico voting patterns
Data['Dems'] = Berkeley['FracDem'].mul(3)  # Dems max is 3, generally around 1.5

# Frame holding every column created in this cell. '65+' is listed as well:
# it was computed above but previously missing from this selection, unlike
# the other per-topic frames, which select all of their cell's columns.
demographic_columns = ['65+', 'Male', 'AfricanAmer', 'CollegePlus', 'Income', 'Unemployed', 'Dems']
DataDemographics = Data[demographic_columns]

KeyError: "['Males'] not in index"

In [None]:
# Health care of the county.
# Each raw column is rescaled by a constant so the features end up on roughly
# comparable scales; the trailing notes are the author's observed ranges
# after scaling.
# NOTE(review): columns from different tables are matched by row index, not
# by FIPS code -- presumably all inputs share the same row order; confirm.

Data['HospBeds'] = JHU['Total hospital beds per 1000 people (2019)']  # around 2-3

# ICU beds per 1000 residents, then doubled.
icu_beds_per_person = Berkeley['#ICU_beds'] / Population['Population']
Data['ICUBeds'] = icu_beds_per_person.mul(1000).mul(2)  # around 5-6. Outliers ok, very important statistic

# note: not considering comorbidities
Data['HeartDiseaseMort'] = Berkeley['HeartDiseaseMortality'].div(60)  # max is 10, typically around 3-4

Data['StrokeMort'] = Berkeley['StrokeMortality'].div(15)  # max is 7-8, generally around 5

Data['Diabetes'] = Berkeley['DiabetesPercentage'].mul(3)  # max 3

Data['Smokers'] = Berkeley['SmokersPercentage'].mul(3)  # max 3

# Frame holding only the columns created in this cell.
health_columns = ['HospBeds', 'ICUBeds', 'HeartDiseaseMort', 'StrokeMort', 'Diabetes', 'Smokers']
DataHealth = Data[health_columns]

In [None]:
# Geography: copy the county centre coordinates into the feature table
# unscaled (the author notes both source columns are population weighted).
for feature, source_column in (('PLon', 'pclon10'), ('PLat', 'pclat10')):
    Data[feature] = Geography[source_column]

# Frame holding only the two coordinate columns.
DataGeography = Data[['PLon', 'PLat']]

In [None]:
# Remove the county identifier so only feature columns remain.
# errors='ignore' keeps the cell re-runnable: on a second execution 'fips' is
# already gone and a plain drop would raise KeyError.
Data = Data.drop(columns=['fips'], errors='ignore')

# Feature matrix for clustering: a 2-D array with one row per table row and
# one column per feature. The earlier per-row `[1:]` slice was removed: with
# 'fips' already dropped it silently discarded the scaled 'Pop' feature, and
# it turned X into a list of arrays instead of a single array.
X = Data.to_numpy()

In [None]:
# Fit k-means once per candidate cluster count and record each model's
# inertia (within-cluster sum of squared distances) as the error for the
# elbow plot.
# Fixes: the estimator is the public KMeans class (`_kMeans` was undefined),
# it is fitted on the feature matrix X built above (`Age_Race_Arr` was
# undefined), and n_clusters follows the loop variable instead of being fixed
# at 2.
# NOTE(review): range(4, 5) only tries k = 4; widen it (e.g. range(3, 11))
# to run the sweep over k = 3 to 10 that the elbow method needs.
errors = {}
for k in range(4, 5):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    errors[k] = kmeans.inertia_

In [None]:
# plot k vs error on a graph, decide optimal k via elbow method

In [None]:
# plot the clusters across the country to visualize