In [1]:
import numpy as np
import pandas as pd
import math
import os
from geopy.distance import geodesic 

In [2]:
Berkeley = pd.read_csv('../../../../data/us/aggregate_berkeley.csv')                 
JHU = pd.read_csv('../../../../data/us/aggregate_jhu.csv')       

In [3]:
#helper function for standardizing data format
def RatioToFrac(x):
    return x/(x+1)

def PercentToFrac(x):
    return x/100

In [4]:
#Drop the first column which is meaningless
Berkeley = Berkeley.drop(columns = ['Unnamed: 0'])

In [5]:
#Improve readability
Berkeley = Berkeley.rename(columns = {'Population(Persons)2017': 'Population2017'})
Berkeley = Berkeley.rename(columns = {'PopTotalMale2017': 'PopMale2017'})
Berkeley = Berkeley.rename(columns = {'PopTotalFemale2017': 'PopFemale2017'})
Berkeley = Berkeley.rename(columns = {'MedianAge,Male2010': 'MedianAgeMale2010'})
Berkeley = Berkeley.rename(columns = {'MedianAge,Female2010': 'MedianAgeFemale2010'})
Berkeley = Berkeley.rename(columns = {'MedicareEnrollment,AgedTot2017':'MedicareEnrollmentTot2017'})
Berkeley = Berkeley.rename(columns = {'Smokers_Percentage':'SmokersPercentage'})
Berkeley = Berkeley.rename(columns = {'#FTEHospitalTotal2017':'#FTEHospital2017'})
Berkeley = Berkeley.rename(columns = {'TotalM.D.\'s,TotNon-FedandFed2017': '#MDs2017'})
Berkeley = Berkeley.rename(columns = {'#HospParticipatinginNetwork2017': '#HospitalsInNetwork2017'})
Berkeley = Berkeley.rename(columns = {'dem_to_rep_ratio':'FracDem'})

In [6]:
#Changing ratios and percentages to frac. Doing this whenever its relative to a population total.
#Editing to get dem_to_rep_ratio into a fraction of Democrats
Berkeley['FracDem'] = Berkeley['FracDem'].apply(RatioToFrac)

#Changing percentages to frac. 
Berkeley['DiabetesPercentage'] = Berkeley['DiabetesPercentage'].apply(PercentToFrac)
Berkeley['SmokersPercentage'] = Berkeley['SmokersPercentage'].apply(PercentToFrac)

In [7]:
Berkeley.to_csv('Aggregate_Berkeley.csv')

In [8]:
#Dropping Puerto Rico data, too sparse to be useful; missing all but 10-20 columns
JHU = JHU.drop(list(range(3194, 3273)))

In [9]:
#Editing percentages to fractions. No ratios relative to a population total.
#Doing this whenever its a percentage out of a population total. Thus some percentages are still present, but they're relative to different quantities.

#Changing labels
JHU = JHU.rename(columns = {'Percent of adults with less than a high school diploma 2014-18': 'Frac of adults with less than a high school diploma 2014-18'})
JHU = JHU.rename(columns = {'Percent of adults with a high school diploma only 2014-18': 'Frac of adults with a high school diploma only 2014-18'})
JHU = JHU.rename(columns = {'Percent of adults completing some college or associate\'s degree 2014-18': 'Frac of adults completing some college or associate\'s degree 2014-18'}) 
JHU = JHU.rename(columns = {'Percent of adults with a bachelor\'s degree or higher 2014-18': 'Frac of adults with a bachelor\'s degree or higher 2014-18'})

#Changing the data
JHU['Frac of adults with less than a high school diploma 2014-18'] = JHU['Frac of adults with less than a high school diploma 2014-18'].apply(PercentToFrac)
JHU['Frac of adults with a high school diploma only 2014-18'] = JHU['Frac of adults with a high school diploma only 2014-18'].apply(PercentToFrac)
JHU['Frac of adults completing some college or associate\'s degree 2014-18'] = JHU['Frac of adults completing some college or associate\'s degree 2014-18'].apply(PercentToFrac)
JHU['Frac of adults with a bachelor\'s degree or higher 2014-18'] = JHU['Frac of adults with a bachelor\'s degree or higher 2014-18'].apply(PercentToFrac)

In [13]:
JHU.head

<bound method NDFrame.head of        FIPS State          Area_Name  Rural-urban_Continuum Code_2013  \
0         0    US      United States                              NaN   
1      1000    AL            Alabama                              NaN   
2      1001    AL     Autauga County                              2.0   
3      1003    AL     Baldwin County                              3.0   
4      1005    AL     Barbour County                              6.0   
...     ...   ...                ...                              ...   
3189  56037    WY  Sweetwater County                              5.0   
3190  56039    WY       Teton County                              7.0   
3191  56041    WY       Uinta County                              7.0   
3192  56043    WY    Washakie County                              7.0   
3193  56045    WY      Weston County                              7.0   

      Urban_Influence_Code_2013  Economic_typology_2015  POP_ESTIMATE_2018  \
0              

In [10]:
#helper functions to fill in NaN data entries per county

#get the lontitude and latitude of a given fips
def get_lonlat(fips, county_centers): 
    df = county_centers.loc[county_centers['fips'] == fips]
    #use 2010 population centers, generally most relevant to the statistics in the table
    lon = county_centers.loc[county_centers['fips'] == fips]['pclon10'].values[0]
    lat = county_centers.loc[county_centers['fips'] == fips]['pclat10'].values[0]
    if math.isnan(lon) == True or math.isnan(lat) == True:
        #if 2010 population centers are NaN type, we just use 2000 population centers; shouldn't have changed too much
        lon = county_centers.loc[county_centers['fips'] == fips]['pclon00'].values[0]
        lat = county_centers.loc[county_centers['fips'] == fips]['pclat00'].values[0]
    return (lat, lon)

#given a county given by its fips code, find the order of the other counties from lowest to greatest distance away
#returns the fips codes of these counties in this order
def nearest_counties(fips):
    sorted_dict = {}
    
    #get the list of counties sorted by increasing distance
    county_centers = pd.read_csv('../../../../data/us/geolocation/county_centers.csv')
    root_pair = get_lonlat(fips, county_centers)
    for code in county_centers['fips']:
        if code != fips:
            current_pair = get_lonlat(code, county_centers)
            #from geeksforgeeks: https://www.geeksforgeeks.org/python-calculate-distance-between-two-places-using-geopy/
            distance = geodesic(root_pair, current_pair).km 
            sorted_dict[code] = distance
        
    #from https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value
    sorted_dict = {key: value for key, value in sorted(sorted_dict.items(), key=lambda item: item[1])}  
    
    keys = []
    for code in sorted_dict:
        keys.append(code)
    return keys
    
#averages the data for the fips codes of the k nearest counties
#NOTE: weakness of this is if many adjacent counties have a given entry as NaN, then we take the data from very far away
def avg_nearest_k(k, fips, entry, nearest_counties):
    avg = 0.0
    counter = 0
    while counter < k:
        code = nearest_counties[counter]
        if math.isnan(JHU.loc[JHU['FIPS'] == code][entry].values[0]) == False:
            avg += JHU.loc[JHU['FIPS'] == code][entry].values[0]
            counter += 1
    avg /= k
    return avg

In [11]:
#helper functions to fill in data for state and nation data

#find the number of counties in a given state represented by its fips code
def num_counties(fips):
    count = 0
    for code in JHU['FIPS']:
        if int(code) % 1000 > 0 and int(code) / 1000 == fips / 1000:
            count += 1
    return count

#find the sum of the values for a given data type, ex 'frac smokers', for a state represented by its fips code, over all its counties
def sum_counties(fips, entry):
    ans = 0.0
    for code in JHU['FIPS']:
        if int(code) % 1000 > 0 and int(code) / 1000 == fips / 1000: # could be made more efficient
            ans += JHU.loc[JHU['FIPS'] == code][entry].values[0]
    return ans
    
def avg_counties(fips, entry):
    return sum_counties(fips, entry) / num_counties(fips)

#find the sum of the values for the US over all the states for a given data type, ex 'frac smokers'
def sum_states(entry):
    ans = 0.0   
    for code in JHU['FIPS']:
        if code > 0 and code % 1000 == 0: # states FIPS codes are 0 mod 1000, including DC and PR
            ans += JHU.loc[JHU['FIPS'] == code][entry].values[0]
    return ans

#find the average of the values for the US over all the states
def avg_states(entry):
    return sum_states(entry) / 52.0

In [12]:
#Fill in the local data, which we do first by a average of the nearby values
#Note: this is an approximation
#Assume for small k the counties are fairly similar so we don't need to weight by area or population
#Hoping to make this more sophisticated later. May be a pain.

for fips in JHU['FIPS']:
    df = JHU.loc[JHU['FIPS'] == fips]
    #if there are nan values in the row of this fips, we use nearest neighbor analysis
    if fips % 1000 != 0 and df.isnull().sum(axis = 1).values[0] > 0: #only consider counties with rows that need to be filled in
        nearest_counties = nearest_counties(fips)
    
        for entry in JHU.columns: 
            if entry != 'State' and entry != 'Area_Name' and math.isnan(JHU.loc[JHU['FIPS'] == fips][entry]) == True: 
                JHU.loc[JHU['FIPS'] == fips][entry] = avg_nearest_k(3, fips, entry, nearest_counties)

NameError: name 'dist_dict' is not defined

In [None]:
#Fill in the state data from the county data
#Noted that the empty data is the same for each state
#Not sure if we need this; if needed, will do later

In [None]:
#Fill in the US data from the state data, which we do last
#Not sure if we need this; if needed, will do later

In [None]:
JHU.to_csv('Aggregate_JHU_Filled.csv')