In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import sklearn
import os
import datetime
import time
import git
import sys
import math
from geopy.distance import geodesic

In [2]:
repo = git.Repo("./", search_parent_directories=True)
homedir = repo.working_dir
datadir = f"{homedir}" + "/models/processing/USA/County_Based/"

In [3]:
#helper functions
def logFunc(x):
    if x < 0.01:
        x = 0.01
    return math.log10(x)

# Convert longitude latitude pair to x, y, z Cartesian coordinates\n
def convertPts(pair):
    lon = pair[0]
    lat = pair[1]
    R = 3958.8
    lonRad = lon * math.pi / 180
    latRad = lat * math.pi / 180
    x = R * math.cos(latRad) * math.cos(lonRad)
    y = R * math.cos(latRad) * math.cos(lonRad)
    z = R * math.sin(lat)
    return (x, y, z)

def getX(x):
    return x[0]

def getY(x):
    return x[1]

def getZ(x):
    return x[2]

In [4]:
#Neighbor Data
neighborcounties = pd.read_csv(f"{homedir}/models/processing/USA/County_Based/neighborcounties.csv", index_col = 0)

# read in the files, load as dataframe
Age_Race = pd.read_csv(datadir + 'Age_Race_Filled.csv')
Population = pd.read_csv(datadir + 'Total_Pop')
Density = pd.read_csv(datadir + 'Density.csv')
JHU = pd.read_csv(datadir + 'aggregate_jhu_filled.csv')
Berkeley = pd.read_csv(datadir + 'Aggregate_Berkeley.csv')
Policies = pd.read_csv(datadir + 'Policy_Transit.csv')
Geography = pd.read_csv(datadir + 'County_Centers.csv')
Beds = pd.read_csv(datadir + 'County_Beds.csv')

Data = pd.DataFrame()

DataBasic = pd.DataFrame()
DataDemographics = pd.DataFrame()
DataHealth = pd.DataFrame()
DataGeography = pd.DataFrame()

In [5]:
Data['FIPS'] = Geography['fips']

# fix population
Population.columns = ['fips', 'Population']

# drop US territories, train separate models for them
FipsSet = []
counter = 0
for row in Data.iterrows():
    row = row[1][0]
    if math.floor(row / 1000) > 56:
        Data = Data.drop([counter], axis = 0)
    else:
        FipsSet.append(float(row))
    counter += 1
    
# edit county centers
counter1 = 0
for row in Geography.iterrows():
    if row[1][1] not in FipsSet:
        Geography = Geography.drop([counter1], axis = 0)
    counter1 += 1

In [6]:
# Nature of the county, includes policies

Data['Pop'] = Population['Population']
Data['Pop'] = Data['Pop'].div(2000000.0) # population max is 5

Data['Density'] = Density['2010 Density per square mile of land area - Population']
Data['Density'] = Data['Density'].div(5000.0) # density max is around 14

Data['Area'] = Density['Area in square miles - Total area']
Data['Area'] = Data['Area'].div(40000.0) #max around 4

Data['UrbanRural'] = JHU['Rural-urban_Continuum Code_2013']
Data['UrbanRural'] = Data['UrbanRural'].div(1.5) # urban rural max is 7

Data['EconType'] = JHU['Economic_typology_2015']
Data['EconType'] = Data['EconType'].div(1.5) # economic typology max is 3

# Policies
Data['Policies'] = Policies['Score']
Data['Policies'] = Data['Policies'].div(2) # policies max is just above 4.5

# Typical immigration in/out. Proxy for being a sink/source in flows
Data['Movement'] = JHU['R_NET_MIG_2018']
Data['Movement'] = Data['Movement'].div(20.0) #range around -3.5 to 3.5

Data['Transit'] = JHU['transit_scores - population weighted averages aggregated from town/city level to county']
Data['Transit'] = Data['Transit'].div(20000000000)

In [7]:
# Demographics of the county

# Age distribution
Data['65+'] = Age_Race['65 to 74 years'] + Age_Race['75 to 84 years'] + Age_Race['85 years and over']
Data['65+'] = Data['65+'] / Population['Population']
Data['65+'] = Data['65+'].mul(2400) # 65+ max is 4.5

# Race/gender
Data['Male'] = Berkeley['FracMale2017']
Data['Male'] = Data['Male'].mul(6) # male max is 2, generally 1

Data['AfricanAmer'] = Age_Race['Exclusively Black or African American'] + Age_Race['Hispanic or Latino (of any race)!!Puerto Rican']
Data['AfricanAmer'] = Data['AfricanAmer'] / Population['Population']
Data['AfricanAmer'] = Data['AfricanAmer'].mul(4500) # African American max is 6
     
# Politics/education/income/economy
Data['CollegePlus'] = JHU['Percent of adults completing some college or associate\'s degree 2014-18']
Data['CollegePlus'] = Data['CollegePlus'].div(10) # education max is 3, generally around 1-1.5

Data['Income'] = JHU['Median_Household_Income_2018']
Data['Income'] = Data['Income'].div(20000) # income max is 4, generally around 2  

Data['Unemployed'] = JHU['Unemployment_rate_2018']
Data['Unemployed'] = Data['Unemployed'].div(3) # unemployed max around 3
                         
    # need puerto rico voting patterns
Data['Dems'] = Berkeley['FracDem']
Data['Dems'] = Data['Dems'].mul(7.5) # Dems max is 5, generally around 1.5

In [8]:
#Health care of the county

Data['Hospitals'] = Berkeley['#Hospitals'] * 10000.0 / Population['Population']
Data['Hospitals'] = Data['Hospitals'].div(30.0)

# very tough to fill in
#Data['HospBeds'] = Beds['licensed_beds'] / Population['Population'] # around 2-3

Data['ICUBeds'] = Berkeley['#ICU_beds']
Data['ICUBeds'] = Data['ICUBeds'] / Population['Population']
Data['ICUBeds'] = Data['ICUBeds'].mul(100) # around 11. Outliers ok, very important statistic

#note: not considering comorbidities
Data['HeartDiseaseMort'] = Berkeley['HeartDiseaseMortality'] 
Data['HeartDiseaseMort'] = Data['HeartDiseaseMort'].div(60) # max is 10, typically around 3-4

Data['StrokeMort'] = Berkeley['StrokeMortality']
Data['StrokeMort'] = Data['StrokeMort'].div(10) # max is 7-8, generally around 5

Data['Diabetes'] = Berkeley['DiabetesPercentage'] 
Data['Diabetes'] = Data['Diabetes'].mul(9) # max 3

Data['Smokers'] = Berkeley['SmokersPercentage'] 
Data['Smokers'] = Data['Smokers'].mul(9) # max 3

In [9]:
#Geography

Data['pLon'] = Geography['pclon10']

Data['pLat'] = Geography['pclat10']

#Data['pLonLat'] = list(zip(Geography.pclon10, Geography.pclat10)) # population weighted
#Data['pLonLat'] = Data['pLonLat'].values

#Data['XYZ'] = Data['pLonLat'].apply(convertPts)

#Data['xVal'] = Data['XYZ'].apply(getX)
#Data['xVal'] = Data['xVal'].div(100)

#Data['yVal'] = Data['XYZ'].apply(getY)
#Data['yVal'] = Data['yVal'].div(100)

#Data['zVal'] = Data['XYZ'].apply(getZ)
#Data['zVal'] = Data['zVal'].div(200)

#Data = Data.drop(columns=['pLonLat', 'XYZ'])

In [10]:
for column in Data.columns:
    print((column, max(abs(Data[column])), Data[column].isnull().sum()))

('FIPS', 56045, 0)
('Pop', 5.052861, 0)
('Density', 13.893679999999998, 0)
('Area', 3.6951157500000003, 0)
('UrbanRural', 6.0, 53)
('EconType', 3.3333333333333335, 53)
('Policies', 4.95, 9)
('Movement', 3.47, 53)
('Transit', 0.45971011255, 53)
('65+', 4.007154643627509, 0)
('Male', 4.405507745266783, 29)
('AfricanAmer', 5.645851501883725, 0)
('CollegePlus', 5.7299999999999995, 53)
('Income', 7.0191, 53)
('Unemployed', 6.033333333333334, 53)
('Dems', 7.177139125639056, 29)
('Hospitals', 3.875968992248062, 29)
('ICUBeds', 49.45054945054945, 29)
('HeartDiseaseMort', 10.05, 29)
('StrokeMort', 9.99, 30)
('Diabetes', 2.97, 29)
('Smokers', 3.734217805860001, 29)
('pLon', 168.292885, 0)
('pLat', 70.522779, 0)


In [11]:
# functions from neighbor_fill_ins

def fillcol(fips, value,neighborcounties, min_neighbors=2):
    #Takes in a column of fips codes, and any type of datafield with some NaNs,
    #Computes distance-weighted average of the value across all neighbors of NaN counties
    tic1 = time.time()
    #Loading in the fips and value into proper dataframes
    #This is the df with only nan values
    df = pd.DataFrame(data = [fips,value]).T
    df.columns = ['FIPS', 'Values']
    df.Values = df.Values.astype(float)
    df = df.set_index('FIPS')
    
    #creating new column to set to the current dataframe values
    newcol = []
    for ind in df.index:
        #for any entries with NaNs
        if np.isnan(df['Values'][ind]):
            #list of neighbors for NaN county
            neighbors = list(neighborcounties[neighborcounties['orgfips'] == ind]['adjfips'])
            nonzero = 0
            weightedval = 0
            totalinvdist = 0
            totaldist = 0
            vals = 0
            #iterates though neighbors of NaN county with non-NaN entires
            for n in neighbors:
                if n in df.index:
                    if ~np.isnan(df['Values'][n]):
                        #Getting weighted values, using 1/dist as a scalar to show closer distance counts more
                        nonzero += 1
                        dist = list(neighborcounties.query('orgfips == ' + str(ind) + ' and adjfips == ' + str(n))['Pop_10'])[0]
                        totalinvdist += (1/dist)**1
                        weightedval += ((1/dist)**1)*df['Values'][n]
            #If there are at least 2 neighbors (this can be adjusted)
            if nonzero >= min_neighbors:
                newcol.append(weightedval/(totalinvdist))
            else:
                newcol.append(np.nan)
        else:
            newcol.append(df['Values'][ind])
    toc1 = time.time()
    #print(toc1 - tic1)
    return newcol

def fillfixed(colname, data, code, neighborcounties):
    #Method to fill up the google mobility data
    #Uses colname to designate which column to fill
    numnans = len(data[np.isnan(data[colname])])
    while numnans > 0:
        #print(numnans)
        tempnum = numnans
        #Creating the filled column from method
        newcol = fillcol(data[code], data[colname], neighborcounties)
        data[colname] = newcol
        numnans = len(data[np.isnan(data[colname])])
        #Checking if the number of nans changes
        if tempnum == numnans:
            #if number doesnt change, try again with only 1 neighbor, otherwise quit
            newcol = fillcol(data[code], data[colname], neighborcounties)
            data[colname] = newcol
            numnans = len(data[np.isnan(data[colname])])
            if tempnum == numnans:
                numnans = 0     
    return data

In [12]:
# Filling in columns of dataframe by nearest neighbor analysis

cols = list(Data.columns)[1:]
for col in cols:
    Data = fillfixed(col, Data, 'FIPS', neighborcounties)
print(sum(Data.isna().sum()))

0


In [13]:
for column in Data.columns:
    print((column, Data[column].isnull().sum()))

('FIPS', 0)
('Pop', 0)
('Density', 0)
('Area', 0)
('UrbanRural', 0)
('EconType', 0)
('Policies', 0)
('Movement', 0)
('Transit', 0)
('65+', 0)
('Male', 0)
('AfricanAmer', 0)
('CollegePlus', 0)
('Income', 0)
('Unemployed', 0)
('Dems', 0)
('Hospitals', 0)
('ICUBeds', 0)
('HeartDiseaseMort', 0)
('StrokeMort', 0)
('Diabetes', 0)
('Smokers', 0)
('pLon', 0)
('pLat', 0)


In [14]:
for column in Data.columns:
    if column != 'FIPS':
        Data[column] = Data[column] * 10
Data.head

<bound method NDFrame.head of        FIPS       Pop  Density      Area  UrbanRural   EconType   Policies  \
0      1001  0.275180   0.1836  0.151097   13.333333   0.000000   0.000000   
1      1003  1.016800   0.2292  0.506827   20.000000  33.333333   1.500000   
2      1005  0.131005   0.0620  0.226130   40.000000  20.000000   0.000000   
3      1007  0.112900   0.0736  0.156542    6.666667   0.000000   0.500000   
4      1009  0.288335   0.1778  0.162657    6.666667   0.000000   0.000000   
...     ...       ...      ...       ...         ...        ...        ...   
3138  56037  0.041265   0.0084  2.622772   20.727555  26.666667   1.839696   
3139  56039  0.035585   0.0106  1.054083   40.505400   8.707877   0.000000   
3140  56041  0.097401   0.0202  0.521893   16.343028  12.038777  10.005394   
3141  56043  0.209780   0.0076  0.560675   50.443943  15.520354   2.329237   
3142  56045  0.304752   0.0060  0.600005   25.572057   5.572057   0.000000   

       Movement   Transit       6

In [15]:
for column in Data:
    print(column, max(Data[column]))

FIPS 56045
Pop 50.52861
Density 138.93679999999998
Area 36.9511575
UrbanRural 60.0
EconType 33.333333333333336
Policies 49.5
Movement 34.7
Transit 4.5971011255
65+ 40.07154643627509
Male 44.055077452667824
AfricanAmer 56.458515018837254
CollegePlus 57.3
Income 70.191
Unemployed 60.33333333333334
Dems 71.77139125639056
Hospitals 38.75968992248062
ICUBeds 494.50549450549454
HeartDiseaseMort 100.5
StrokeMort 99.9
Diabetes 29.700000000000003
Smokers 37.34217805860001
pLon -674.64919
pLat 705.22779


In [18]:
# getting specific dataframes for what we want to cluster

DataBasic = Data[['FIPS', 'Pop', 'Density', 'Area', 'UrbanRural', 'EconType', 'Policies', 'Movement', 'Transit']]

DataDemographics = Data[['FIPS', 'Male', 'AfricanAmer', 'CollegePlus', 'Income', 'Unemployed', 'Dems']]

DataHealth = Data[['FIPS', 'Hospitals', 'ICUBeds', 'HeartDiseaseMort', 'StrokeMort', 'Diabetes', 'Smokers']]

# approximate distances between counties
# we double count Urban Rural, population, density, size since don't cluster on the entire Data dataframe
# so that nearby urban counties are closer than a rural county adjacent from an urban county
DataGeography = Data[['FIPS', 'Pop', 'Density', 'Area', 'UrbanRural', 'pLon', 'pLat']]

In [19]:
# write our dataframes to CSVs in this folder
Data.to_csv('data.csv')
DataBasic.to_csv('data_basic.csv')
DataDemographics.to_csv('data_demographics.csv')
DataHealth.to_csv('data_health.csv')
DataGeography.to_csv('data_geography.csv')