In [22]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import sklearn
import os
import datetime
import time
import git
import sys
import math
from geopy.distance import geodesic

In [23]:
# Find the enclosing git repository by searching upward from the current
# directory, then build the path of the county-level input folder from its root.
repo = git.Repo("./", search_parent_directories=True)
homedir = repo.working_dir
datadir = f"{homedir}/models/processing/USA/County_Based/"

In [24]:
#helper functions
def logFunc(x):
    """Return log10 of ``x``, treating any input below 0.01 as 0.01.

    The floor keeps zero and negative inputs away from ``math.log10``, so the
    smallest value this function can return is log10(0.01).
    """
    floored = 0.01 if x < 0.01 else x
    return math.log10(floored)

# Convert a (longitude, latitude) pair to x, y, z Cartesian coordinates
def convertPts(pair, radius=3958.8):
    """Map a (longitude, latitude) pair in degrees onto a sphere.

    Parameters
    ----------
    pair : indexable
        ``pair[0]`` is the longitude and ``pair[1]`` the latitude, in degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit. The default
        3958.8 is the constant the original code used (presumably the Earth's
        mean radius in miles - confirm).

    Returns
    -------
    tuple
        ``(x, y, z)`` with
        x = radius * cos(lat) * cos(lon),
        y = radius * cos(lat) * sin(lon),
        z = radius * sin(lat).
    """
    lon = pair[0]
    lat = pair[1]
    lonRad = math.radians(lon)
    latRad = math.radians(lat)
    cosLat = math.cos(latRad)
    x = radius * cosLat * math.cos(lonRad)
    # y needs sin(lon); using cos(lon) here would just duplicate x
    y = radius * cosLat * math.sin(lonRad)
    # z must use the latitude in radians, not the raw degree value
    z = radius * math.sin(latRad)
    return (x, y, z)

def getX(x):
    """Return the element at index 0 of the indexable ``x``."""
    first = x[0]
    return first

def getY(x):
    """Return the element at index 1 of the indexable ``x``."""
    second = x[1]
    return second

def getZ(x):
    """Return the element at index 2 of the indexable ``x``."""
    third = x[2]
    return third

In [25]:
# Neighbor data: the first CSV column becomes the row index
neighborcounties = pd.read_csv(
    f"{homedir}/models/processing/USA/County_Based/neighborcounties.csv",
    index_col=0,
)


def _read_county_table(filename):
    """Read one input table from ``datadir`` with pandas' default CSV parsing."""
    return pd.read_csv(datadir + filename)


# Input tables, one DataFrame per file
Age_Race = _read_county_table('Age_Race_Filled.csv')
Population = _read_county_table('Total_Pop')  # this file name carries no extension
Density = _read_county_table('Density.csv')
JHU = _read_county_table('aggregate_jhu_filled.csv')
Berkeley = _read_county_table('Aggregate_Berkeley.csv')
Policies = _read_county_table('Policy_Transit.csv')
Geography = _read_county_table('County_Centers.csv')
Beds = _read_county_table('County_Beds.csv')

# Empty frames that the following cells populate
Data, Raw_Data = pd.DataFrame(), pd.DataFrame()
DataBasic, DataDemographics = pd.DataFrame(), pd.DataFrame()
DataHealth, DataGeography = pd.DataFrame(), pd.DataFrame()

In [26]:
# Seed the feature table with the county FIPS codes
Data['FIPS'] = Geography['fips']

# fix population: give the two Population columns predictable names
Population.columns = ['fips', 'Population']

# drop US territories, train separate models for them
# The state prefix is FIPS // 1000; rows whose prefix is above 56 are removed.
# A boolean mask is used instead of dropping rows inside an iterrows() loop:
# that loop copied the frame on every drop, relied on a hand-maintained counter
# matching the index labels, and used deprecated positional Series indexing.
# Rows with a missing FIPS fail the comparison and are removed as well.
keep_mask = (Data['FIPS'] // 1000) <= 56
Data = Data.loc[keep_mask].copy()  # .copy() avoids SettingWithCopyWarning on later column writes
FipsSet = [float(code) for code in Data['FIPS']]

# edit county centers: keep only rows whose second column holds a retained code
# (presumably that column is the FIPS code - the original also read it by position)
Geography = Geography.loc[Geography.iloc[:, 1].isin(FipsSet)].copy()

# Raw_Data shares the filtered FIPS column (and therefore Data's row index)
Raw_Data['FIPS'] = Data['FIPS']

In [27]:
# Nature of the county, includes policies

# Each feature is written to both frames; assignment aligns on the row index.
for frame in (Data, Raw_Data):
    frame['Pop'] = Population['Population']

    frame['Density'] = Density['2010 Density per square mile of land area - Population']
    frame['Area'] = Density['Area in square miles - Total area']

    frame['UrbanRural'] = JHU['Rural-urban_Continuum Code_2013']
    frame['EconType'] = JHU['Economic_typology_2015']

    # Policies
    frame['Policies'] = Policies['Score']

    # Typical immigration in/out. Proxy for being a sink/source in flows
    frame['Movement'] = JHU['R_NET_MIG_2018']

    frame['Transit'] = JHU['transit_scores - population weighted averages aggregated from town/city level to county']

In [28]:
# Demographics of the county

for frame in (Data, Raw_Data):
    # Age distribution: sum of the three oldest brackets, then divided by population
    # NOTE(review): the maximum printed for '65+' further down is about 0.0017 -
    # confirm the units of the Age_Race columns before relying on this ratio.
    frame['65+'] = (
        Age_Race['65 to 74 years']
        + Age_Race['75 to 84 years']
        + Age_Race['85 years and over']
    )
    frame['65+'] = frame['65+'] / Population['Population']

    # Race/gender
    frame['Male'] = Berkeley['FracMale2017']

    # NOTE(review): the Puerto Rican column is added into 'AfricanAmer' - confirm intended.
    frame['AfricanAmer'] = (
        Age_Race['Exclusively Black or African American']
        + Age_Race['Hispanic or Latino (of any race)!!Puerto Rican']
    )
    frame['AfricanAmer'] = frame['AfricanAmer'] / Population['Population']

    # Politics/education/income/economy
    frame['CollegePlus'] = JHU['Percent of adults completing some college or associate\'s degree 2014-18']
    frame['Income'] = JHU['Median_Household_Income_2018']
    frame['Unemployed'] = JHU['Unemployment_rate_2018']
    frame['Dems'] = Berkeley['FracDem']

In [29]:
#Health care of the county

# Hospitals per 10,000 residents
for frame in (Data, Raw_Data):
    frame['Hospitals'] = Berkeley['#Hospitals'] * 10000.0 / Population['Population']

# very tough to fill in
#Data['HospBeds'] = Beds['licensed_beds'] / Population['Population'] # around 2-3

# ICU beds per resident: computed on Data first, then copied into Raw_Data
Data['ICUBeds'] = Berkeley['#ICU_beds']
Data['ICUBeds'] = Data['ICUBeds'] / Population['Population']
Raw_Data['ICUBeds'] = Data['ICUBeds']

#note: not considering comorbidities
# target column -> source column in the Berkeley table
berkeley_health_columns = {
    'HeartDiseaseMort': 'HeartDiseaseMortality',
    'StrokeMort': 'StrokeMortality',
    'Diabetes': 'DiabetesPercentage',
    'Smokers': 'SmokersPercentage',
}
for target, source in berkeley_health_columns.items():
    for frame in (Data, Raw_Data):
        frame[target] = Berkeley[source]

In [30]:
#Geography

# Copy the two coordinate columns of the county-centers table into both frames
for frame in (Data, Raw_Data):
    frame['pLon'] = Geography['pclon10']
    frame['pLat'] = Geography['pclat10']

#Data['pLonLat'] = list(zip(Geography.pclon10, Geography.pclat10)) # population weighted
#Data['pLonLat'] = Data['pLonLat'].values

#Data['XYZ'] = Data['pLonLat'].apply(convertPts)

#Data['xVal'] = Data['XYZ'].apply(getX)
#Data['xVal'] = Data['xVal'].div(100)

#Data['yVal'] = Data['XYZ'].apply(getY)
#Data['yVal'] = Data['yVal'].div(100)

#Data['zVal'] = Data['XYZ'].apply(getZ)
#Data['zVal'] = Data['zVal'].div(200)

#Data = Data.drop(columns=['pLonLat', 'XYZ'])

In [32]:
# functions from neighbor_fill_ins

def fillcol(fips, value, neighborcounties, min_neighbors=2):
    """Fill missing entries of ``value`` from neighbouring counties.

    For every county whose value is NaN, the replacement is the weighted
    average of the non-NaN values of its neighbours, each neighbour weighted
    by ``1 / Pop_10`` so that closer neighbours count more. Counties that
    already have a value are passed through unchanged.

    Parameters
    ----------
    fips : pandas.Series
        County codes, one per row.
    value : pandas.Series
        Values for the same rows; may contain NaN.
    neighborcounties : pandas.DataFrame
        Adjacency table with columns ``orgfips``, ``adjfips`` and ``Pop_10``.
        ``Pop_10`` is used as the distance between the two counties
        (presumably between population-weighted centers - confirm).
    min_neighbors : int, optional
        Minimum number of neighbours with a usable value required to fill a
        gap; with fewer, the entry stays NaN.

    Returns
    -------
    list
        One value per input row, in input order.
    """
    # Pair each code with its value; the code becomes the lookup index
    df = pd.DataFrame(data=[fips, value]).T
    df.columns = ['FIPS', 'Values']
    df['Values'] = df['Values'].astype(float)
    df = df.set_index('FIPS')
    values = df['Values']

    newcol = []
    for ind in df.index:
        current = values.loc[ind]
        if not np.isnan(current):
            # Already populated: keep as is
            newcol.append(current)
            continue

        # All adjacency rows for this county, fetched once instead of
        # re-querying the whole table for every neighbour
        adjacent = neighborcounties[neighborcounties['orgfips'] == ind]

        # Distance per neighbour; the first listed row wins if a pair repeats
        dist_by_neighbor = {}
        for adj, dist in zip(adjacent['adjfips'], adjacent['Pop_10']):
            dist_by_neighbor.setdefault(adj, dist)

        usable = 0
        weightedval = 0
        totalinvdist = 0
        # Iterate through the neighbours that have a non-NaN entry
        for n in adjacent['adjfips']:
            if n not in df.index:
                continue
            neighbor_value = values.loc[n]
            if np.isnan(neighbor_value):
                continue
            # 1/dist as the weight: a smaller distance counts more
            weight = 1 / dist_by_neighbor[n]
            usable += 1
            totalinvdist += weight
            weightedval += weight * neighbor_value

        # Only fill when enough neighbours contributed
        if usable >= min_neighbors:
            newcol.append(weightedval / totalinvdist)
        else:
            newcol.append(np.nan)
    return newcol

def fillfixed(colname, data, code, neighborcounties):
    """Repeatedly fill the NaNs of one column from neighbouring counties.

    Each pass replaces ``data[colname]`` with the output of ``fillcol``. When a
    pass requiring two usable neighbours makes no progress, one pass is retried
    requiring only a single neighbour; if that does not help either, the loop
    stops and any remaining NaNs are left in place.

    Parameters
    ----------
    colname : str
        Column of ``data`` to fill.
    data : pandas.DataFrame
        Frame holding both ``colname`` and ``code``; it is modified in place.
    code : str
        Name of the column holding the county codes.
    neighborcounties : pandas.DataFrame
        Adjacency table passed through to ``fillcol``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after filling.
    """
    numnans = int(data[colname].isna().sum())
    while numnans > 0:
        tempnum = numnans
        # Regular pass with fillcol's default neighbour requirement
        data[colname] = fillcol(data[code], data[colname], neighborcounties)
        numnans = int(data[colname].isna().sum())
        if tempnum == numnans:
            # No progress: relax to a single neighbour. The retry previously
            # repeated the default call, so it could never change the outcome.
            data[colname] = fillcol(
                data[code], data[colname], neighborcounties, min_neighbors=1
            )
            numnans = int(data[colname].isna().sum())
            if tempnum == numnans:
                # Still stuck: give up rather than loop forever
                break
    return data

In [33]:
# Filling in columns of dataframe by nearest neighbor analysis

# Every column after the first (FIPS) is filled in turn
cols = Data.columns.tolist()[1:]
for col in cols:
    Data = fillfixed(col, Data, 'FIPS', neighborcounties)
# Total count of missing cells left in Data
print(Data.isna().sum().sum())

0


In [34]:
# Same neighbour-based filling for Raw_Data, skipping its first (FIPS) column
cols = Raw_Data.columns.tolist()[1:]
for col in cols:
    Raw_Data = fillfixed(col, Raw_Data, 'FIPS', neighborcounties)
# Total count of missing cells left in Raw_Data
print(Raw_Data.isna().sum().sum())

0


In [35]:
# Print (column, null count) for every column of Data, then of Raw_Data
for frame in (Data, Raw_Data):
    for column in frame.columns:
        null_count = frame[column].isnull().sum()
        print((column, null_count))

('FIPS', 0)
('Pop', 0)
('Density', 0)
('Area', 0)
('UrbanRural', 0)
('EconType', 0)
('Policies', 0)
('Movement', 0)
('Transit', 0)
('65+', 0)
('Male', 0)
('AfricanAmer', 0)
('CollegePlus', 0)
('Income', 0)
('Unemployed', 0)
('Dems', 0)
('Hospitals', 0)
('ICUBeds', 0)
('HeartDiseaseMort', 0)
('StrokeMort', 0)
('Diabetes', 0)
('Smokers', 0)
('pLon', 0)
('pLat', 0)
('FIPS', 0)
('Pop', 0)
('Density', 0)
('Area', 0)
('UrbanRural', 0)
('EconType', 0)
('Policies', 0)
('Movement', 0)
('Transit', 0)
('65+', 0)
('Male', 0)
('AfricanAmer', 0)
('CollegePlus', 0)
('Income', 0)
('Unemployed', 0)
('Dems', 0)
('Hospitals', 0)
('ICUBeds', 0)
('HeartDiseaseMort', 0)
('StrokeMort', 0)
('Diabetes', 0)
('Smokers', 0)
('pLon', 0)
('pLat', 0)


In [36]:
# Largest absolute value per column, before rescaling
for column in Data.columns:
    magnitudes = Data[column].abs()
    print(column, max(magnitudes))

FIPS 56045
Pop 10105722.0
Density 69468.4
Area 147804.63
UrbanRural 9.0
EconType 5.0
Policies 9.9
Movement 69.4
Transit 9194202251.0
65+ 0.0016696477681781288
Male 0.7342512908777971
AfricanAmer 0.0012546336670852722
CollegePlus 57.3
Income 140382.0
Unemployed 18.1
Dems 0.9569518834185408
Hospitals 116.27906976744185
ICUBeds 0.4945054945054945
HeartDiseaseMort 603.0
StrokeMort 99.9
Diabetes 0.33
Smokers 0.4149130895400001
pLon 168.292885
pLat 70.522779


In [37]:
# Hand-picked per-column scale factors, applied in place to Data only.
# Running this cell a second time applies the factors again.
divisors = {
    'Pop': 6000.0,
    'Density': 50.0,
    'UrbanRural': 1.5,
    'Transit': 125000000,
    'CollegePlus': 10.0,
    'Income': 20000.0,
    'Unemployed': 3.0,
    'Hospitals': 30.0,
    'HeartDiseaseMort': 60.0,
    'StrokeMort': 10.0,
    'Area': 100.0,
}
multipliers = {
    'EconType': 1.4,
    'Policies': 10,
    'Movement': 5.0,
    '65+': 2400.0,
    'Male': 6.0,
    'AfricanAmer': 4500.0,
    'Dems': 7.5,
    'ICUBeds': 100.0,
    'Diabetes': 9.0,
    'Smokers': 9.0,
}

for name, factor in divisors.items():
    Data[name] = Data[name].div(factor)

for name, factor in multipliers.items():
    Data[name] = Data[name].mul(factor)

In [38]:
# Arithmetic mean per column after rescaling
for column in Data.columns:
    series = Data[column]
    print(column, sum(series) / len(series))

FIPS 30390.411708558702
Pop 16.962220915473527
Density 5.1864626153356745
Area 12.079994432071283
UrbanRural 3.3342820262224278
EconType 2.5156078269824045
Policies 5.9711889805397
Movement 5.628992343224402
Transit 5.004797471231954
65+ 0.03706986570960897
Male 3.0052710060422614
AfricanAmer 0.0377620483467866
CollegePlus 3.0777439772952477
Income 2.636383388412829
Unemployed 1.3624625463844486
Dems 2.5009657144010995
Hospitals 0.04397222438260943
ICUBeds 0.17429075411844672
HeartDiseaseMort 3.0995274272831175
StrokeMort 4.019712153026367
Diabetes 0.9331811556724352
Smokers 1.5677166968897707
pLon -92.28580511581298
pLat 38.450388801145365


In [39]:
# Largest absolute value per column, after rescaling
for column in Data.columns:
    magnitudes = Data[column].abs()
    print(column, max(magnitudes))

FIPS 56045
Pop 1684.287
Density 1389.368
Area 1478.0463
UrbanRural 6.0
EconType 7.0
Policies 99.0
Movement 347.0
Transit 73.553618008
65+ 4.007154643627509
Male 4.405507745266783
AfricanAmer 5.645851501883725
CollegePlus 5.7299999999999995
Income 7.0191
Unemployed 6.033333333333334
Dems 7.177139125639056
Hospitals 3.875968992248062
ICUBeds 49.45054945054945
HeartDiseaseMort 10.05
StrokeMort 9.99
Diabetes 2.97
Smokers 3.734217805860001
pLon 168.292885
pLat 70.522779


In [40]:
# getting specific dataframes for what we want to cluster
# Each sub-frame keeps FIPS plus one themed group of columns.

basic_columns = ['FIPS', 'Pop', 'Density', 'UrbanRural', 'EconType', 'Policies', 'Movement', 'Transit']
DataBasic = Data[basic_columns]

demographic_columns = ['FIPS', '65+', 'Male', 'AfricanAmer', 'CollegePlus', 'Income', 'Unemployed', 'Dems']
DataDemographics = Data[demographic_columns]

health_columns = ['FIPS', 'Hospitals', 'ICUBeds', 'HeartDiseaseMort', 'StrokeMort', 'Diabetes', 'Smokers']
DataHealth = Data[health_columns]

# approximate distances between counties
# Urban/rural code, population, density and area are repeated here because we
# do not cluster on the entire Data dataframe; this way nearby urban counties
# end up closer to each other than a rural county adjacent to an urban county.
# NOTE(review): this selection happens before pLon/pLat are divided by 50 in
# the next cell, so DataGeography keeps the undivided coordinates - confirm intended.
geography_columns = ['FIPS', 'Pop', 'Density', 'Area', 'UrbanRural', 'pLon', 'pLat']
DataGeography = Data[geography_columns]

In [41]:
# Shrink both coordinate columns of Data by the same factor of 50
for coord in ('pLon', 'pLat'):
    Data[coord] = Data[coord].div(50)

In [42]:
# write our dataframes to CSVs in this folder
exports = (
    (Data, 'data.csv'),
    (DataBasic, 'data_basic.csv'),
    (DataDemographics, 'data_demographics.csv'),
    (DataHealth, 'data_health.csv'),
    (DataGeography, 'data_geography.csv'),
    (Raw_Data, 'raw_data.csv'),
)
for frame, filename in exports:
    frame.to_csv(filename)