In [1]:
import xsimlab as xs
import xarray as xr
from episimlab.models import ExampleSIR, EpiModel
from episimlab.models.example_sir import SetupPhi
from episimlab.models.partition_v1 import *
from episimlab.compt_model import ComptModel
from episimlab.foi import BaseFOI
from episimlab.utils import get_var_dims, group_dict_by_var, visualize_compt_graph, coerce_to_da, fix_coord_dtypes, IntPerDay
from episimlab.setup.sto import SetupStochasticFromToggle
from episimlab.setup.seed import SeedGenerator
import networkx as nx
import pandas as pd
import numpy as np
import math
import datetime
import geopandas as gpd

# ZCTA-level high risk proportion for the state of Texas

## Goal: parameterize Austin granular model with ZCTA level data on risk percentage.

### Risk data

We have CDC 500 Cities places data at the Census Block Group level that could be used to make risk proportion calculations. We also have pre-calculated risk percentages courtesy of Remy Pasco, which is what we'll use moving forward.

The raw data that we won't use:

In [6]:
# Raw CDC 500 Cities release. Per the note above it is loaded for reference
# only; the rest of the analysis relies on the pre-computed risk table.
# FIPS identifiers are read as strings (presumably to keep leading zeros).
places = pd.read_csv(
    filepath_or_buffer='../CDC_PLACES_DATA/500_Cities__Local_Data_for_Better_Health__2019_release.csv',
    dtype=dict(TractFIPS=str, CityFIPS=str),
)

Remy's risk data, which we will use:

In [13]:
# Pre-computed high-risk share per ZCTA, one column per age group.
# ZCTA5 is read as a string so it can be matched against string ZCTA ids.
zcta_age_risk = pd.read_csv(
    filepath_or_buffer='../Remy_ZCTA_high_risk/COVID High risk population per age group per zip code.csv',
    dtype=dict(ZCTA5=str),
)

In [14]:
# Preview the first rows of the risk table.
zcta_age_risk.head()

Unnamed: 0,ZCTA5,0_0.5,0.5_4,5_9,10_14,15_19,20_24,25_29,30_34,35_39,40_44,45_49,50_54,55_59,60_64,65_69,70_74,75+
0,1086,0.0,0.063143,0.092746,0.104584,0.117104,0.135435,0.147714,0.14718,0.131456,0.116898,0.153668,0.153505,0.153505,0.153505,0.198322,0.198322,0.198322
1,4469,0.0,0.090581,0.133047,0.149925,0.163973,0.184093,0.19495,0.195366,0.179244,0.168342,0.227767,0.227767,0.227767,0.227767,0.294302,0.294302,0.294302
2,6269,0.0,0.082688,0.121454,0.136861,0.148272,0.157956,0.183474,0.196116,0.175064,0.158072,0.217177,0.216874,0.216874,0.216874,0.283603,0.283603,0.283603
3,8240,0.0,0.058633,0.086122,0.097047,0.108229,0.127235,0.143871,0.145469,0.125942,0.110078,0.14398,0.143747,0.143747,0.143747,0.18627,0.18627,0.18627
4,11549,0.0,0.055626,0.081705,0.092094,0.100741,0.112221,0.130361,0.155854,0.135504,0.108977,0.142456,0.141966,0.141889,0.141889,0.186404,0.186404,0.186404


Note that the high-risk percentage is 0% for every ZCTA in the 0-0.5 year age bin.

In [19]:
# Largest high-risk share in the 0-0.5 year column across all ZCTAs.
# Series.max() is vectorized and skips missing values, whereas the builtin
# max() loops in Python and gives order-dependent results if a NaN is present.
zcta_age_risk['0_0.5'].max()

0.0

### Population data

These data were collected from the US Census Bureau 5-year American Community Survey from 2019 using the `tidycensus` package in R. See script `texas_age_by_zcta.r` for full details.

In [28]:
# Population estimates per Texas ZCTA and ACS age label (long format).
# GEOID is read as a string so it can be compared with the risk table's ZCTA5.
# NOTE(review): this absolute path only resolves on the original author's
# machine; point it at a shared/relative data directory before re-running.
zcta_pop = pd.read_csv(
    filepath_or_buffer='/Users/kpierce/COVID19/SchoolCatchmentDemo/2019_TX_ZCTA_age_populations.csv',
    dtype=dict(GEOID=str),
)

In [29]:
# Preview the first rows of the population table.
zcta_pop.head()

Unnamed: 0.1,Unnamed: 0,GEOID,age,estimate
0,1,75001,10 to 14 years,401
1,2,75001,15 to 17 years,277
2,3,75001,18 and 19 years,147
3,4,75001,20 years,180
4,5,75001,21 years,163


In [30]:
# List the distinct age labels used by the population data; these are the
# labels that have to be translated into the risk table's age groups.
zcta_pop['age'].unique()

array(['10 to 14 years', '15 to 17 years', '18 and 19 years', '20 years',
       '21 years', '22 to 24 years', '25 to 29 years', '30 to 34 years',
       '35 to 39 years', '40 to 44 years', '45 to 49 years',
       '5 to 9 years', '50 to 54 years', '55 to 59 years',
       '60 and 61 years', '62 to 64 years', '65 and 66 years',
       '67 to 69 years', '70 to 74 years', '75 to 79 years',
       '80 to 84 years', '85 years and over', 'Under 5 years'],
      dtype=object)

### Merge risk and population data

The risk data and the population data use different age-binning strategies; we need to map one binning strategy onto the other before the two data sets can be merged.

In [31]:
# Lookup from each population age label to the matching age-group column of
# the risk table. Several population labels collapse into one risk bin, so the
# table is written "risk bin -> population labels" and inverted on the fly.
# NOTE(review): 'Under 5 years' also contains the 0-0.5 year olds, so the
# '0.5_4' risk rate gets applied to that whole group -- confirm this is the
# intended approximation.
pop_risk_map = {
    population_label: risk_bin
    for risk_bin, population_labels in {
        '10_14': ('10 to 14 years',),
        '15_19': ('15 to 17 years', '18 and 19 years'),
        '20_24': ('20 years', '21 years', '22 to 24 years'),
        '25_29': ('25 to 29 years',),
        '30_34': ('30 to 34 years',),
        '35_39': ('35 to 39 years',),
        '40_44': ('40 to 44 years',),
        '45_49': ('45 to 49 years',),
        '5_9': ('5 to 9 years',),
        '50_54': ('50 to 54 years',),
        '55_59': ('55 to 59 years',),
        '60_64': ('60 and 61 years', '62 to 64 years'),
        '65_69': ('65 and 66 years', '67 to 69 years'),
        '70_74': ('70 to 74 years',),
        '75+': ('75 to 79 years', '80 to 84 years', '85 years and over'),
        # risk is zero for 0-0.5 years, so that category has no entry here
        '0.5_4': ('Under 5 years',),
    }.items()
    for population_label in population_labels
}

In [32]:
# Tag every population row with its risk-table age group. Indexing the dict
# directly means an unmapped age label raises KeyError instead of passing silently.
zcta_pop['age_bin'] = list(map(pop_risk_map.__getitem__, zcta_pop['age']))

In [33]:
# Preview the population table with the newly added age_bin column.
zcta_pop.head()

Unnamed: 0.1,Unnamed: 0,GEOID,age,estimate,age_bin
0,1,75001,10 to 14 years,401,10_14
1,2,75001,15 to 17 years,277,15_19
2,3,75001,18 and 19 years,147,15_19
3,4,75001,20 years,180,20_24
4,5,75001,21 years,163,20_24


In [34]:
# Total population per ZCTA and risk-table age group.
# Only `estimate` is aggregated. The previous `.sum('estimate')` passed the
# string as the `numeric_only` flag (which happened to be truthy), summed the
# stray 'Unnamed: 0' row-number column as well, and then had to drop it.
# as_index=False keeps GEOID and age_bin as ordinary columns.
zcta_pop_total = (
    zcta_pop
    .groupby(['GEOID', 'age_bin'], as_index=False)[['estimate']]
    .sum()
)

In [35]:
# Preview the aggregated population per ZCTA and age group.
zcta_pop_total.head()

Unnamed: 0,GEOID,age_bin,estimate
0,75001,0.5_4,794
1,75001,10_14,401
2,75001,15_19,424
3,75001,20_24,1276
4,75001,25_29,2859


There should be 1,939 ZCTAs in Texas, so the population data are missing four of them:

In [50]:
# Number of distinct ZCTAs present in the population data
# (dropna=False so a missing id would still be counted, as with len(unique())).
zcta_pop_total['GEOID'].nunique(dropna=False)

1935

In [36]:
# Keep only the risk rows whose ZCTA also appears in the Texas population data.
zcta_age_risk_texas = zcta_age_risk.loc[
    zcta_age_risk['ZCTA5'].isin(zcta_pop_total['GEOID'].unique())
]

In [52]:
# Number of distinct Texas ZCTAs that have risk data
# (dropna=False so a missing id would still be counted, as with len(unique())).
zcta_age_risk_texas['ZCTA5'].nunique(dropna=False)

1896

In [37]:
# (rows, columns) of the full risk table, for comparison with the Texas subset.
zcta_age_risk.shape

(32409, 18)

In [38]:
# (rows, columns) of the Texas-only risk table.
zcta_age_risk_texas.shape

(1896, 18)

In [40]:
# Reshape wide -> long: one row per (ZCTA5, age group). pandas names the new
# columns 'variable' (age-group label) and 'value' (high-risk share) by default.
zcta_age_risk_texas_long = zcta_age_risk_texas.melt(id_vars='ZCTA5')

In [41]:
# Preview the long-format risk table.
zcta_age_risk_texas_long.head()

Unnamed: 0,ZCTA5,variable,value
0,75962,0_0.5,0.0
1,76402,0_0.5,0.0
2,76798,0_0.5,0.0
3,76908,0_0.5,0.0
4,78712,0_0.5,0.0


In [43]:
# Attach population counts to each (ZCTA, age group) risk row.
# The join is an inner join (pandas' default), so risk rows without a matching
# population bin -- e.g. the '0_0.5' group -- and ZCTAs without risk data drop out.
zcta_age_pct_risk = zcta_age_risk_texas_long.merge(
    zcta_pop_total,
    how='inner',
    left_on=['ZCTA5', 'variable'],
    right_on=['GEOID', 'age_bin'],
)

In [55]:
# Give the merged columns descriptive names and drop the join keys that
# duplicate GEOID / age_bin.
# NOTE: this rebinds zcta_age_pct_risk, so running the cell a second time
# raises KeyError because 'ZCTA5' and 'variable' are already gone.
zcta_age_pct_risk = (
    zcta_age_pct_risk
    .rename(columns={'value': 'pct_high_risk', 'estimate': 'total_population'})
    .drop(columns=['ZCTA5', 'variable'])
)

In [56]:
# Preview the merged risk/population table.
zcta_age_pct_risk.head()

Unnamed: 0,pct_high_risk,GEOID,age_bin,total_population
0,0.108621,75962,0.5_4,0
1,0.090956,76402,0.5_4,0
2,0.10336,76798,0.5_4,0
3,0.079681,76908,0.5_4,0
4,0.08607,78712,0.5_4,0


In [59]:
# Expected number of high-risk residents = high-risk share x population.
zcta_age_pct_risk['high_risk_population'] = (
    zcta_age_pct_risk['pct_high_risk'].mul(zcta_age_pct_risk['total_population'])
)

We next do a final widening of age bins. Past models have used five age bins; the current binning strategy for these data does not allow us to exactly match those bin widths, but we can get close.

In [70]:
# Collapse the risk-table age groups into five wide bins:
# '0-4', '5-19', '20-49', '50-64', '65+' -- slightly different from usual age groups
# Written as (wide bin, narrow bins) pairs and flattened; '5-19' appears twice
# only to keep the same key order as the narrow-bin listing.
wide_age_bin_mapping = {
    narrow_bin: wide_bin
    for wide_bin, narrow_bins in (
        ('5-19', ('10_14', '15_19')),
        ('20-49', ('20_24', '25_29', '30_34', '35_39', '40_44', '45_49')),
        ('5-19', ('5_9',)),
        ('50-64', ('50_54', '55_59', '60_64')),
        ('65+', ('65_69', '70_74', '75+')),
        # risk is zero for 0-0.5 years, so that category has no entry here
        ('0-4', ('0.5_4',)),
    )
    for narrow_bin in narrow_bins
}

In [71]:
# Tag every row with its wide age bin. Indexing the dict directly means an
# unmapped age_bin raises KeyError instead of passing silently.
zcta_age_pct_risk['wide_age_bin'] = list(
    map(wide_age_bin_mapping.__getitem__, zcta_age_pct_risk['age_bin'])
)

In [72]:
# Total and high-risk population per ZCTA and wide age bin.
# Only the two population columns are summed. The previous `.sum([...])`
# passed the list as the `numeric_only` flag (truthy by accident), which also
# summed `pct_high_risk` -- a meaningless sum of shares -- before dropping it.
# The share is recomputed from these totals afterwards.
zcta_wide_age_pct_risk = (
    zcta_age_pct_risk
    .groupby(['GEOID', 'wide_age_bin'], as_index=False)[
        ['total_population', 'high_risk_population']
    ]
    .sum()
)


In [73]:
# Preview the population totals per ZCTA and wide age bin.
zcta_wide_age_pct_risk.head()

Unnamed: 0,GEOID,wide_age_bin,total_population,high_risk_population
0,75001,0-4,794,87.140716
1,75001,20-49,9273,2546.86701
2,75001,5-19,1551,282.514614
3,75001,50-64,2259,848.093682
4,75001,65+,1115,574.459288


Simple arithmetic to get the final dataset, then save:

In [74]:
# High-risk share within each wide bin = high-risk population / total population.
# NOTE(review): bins with zero total population yield NaN here (0 / 0);
# decide whether downstream consumers expect NaN or a filled value.
zcta_wide_age_pct_risk['pct_high_risk'] = (
    zcta_wide_age_pct_risk['high_risk_population']
    .div(zcta_wide_age_pct_risk['total_population'])
)

In [77]:
# Low-risk population is whatever remains after removing the high-risk residents.
zcta_wide_age_pct_risk['low_risk_population'] = (
    zcta_wide_age_pct_risk['total_population']
    .sub(zcta_wide_age_pct_risk['high_risk_population'])
)

In [78]:
# Preview the final table, including the derived share and low-risk columns.
zcta_wide_age_pct_risk.head()

Unnamed: 0,GEOID,wide_age_bin,total_population,high_risk_population,pct_high_risk,low_risk_population
0,75001,0-4,794,87.140716,0.109749,706.859284
1,75001,20-49,9273,2546.86701,0.274654,6726.13299
2,75001,5-19,1551,282.514614,0.18215,1268.485386
3,75001,50-64,2259,848.093682,0.375429,1410.906318
4,75001,65+,1115,574.459288,0.51521,540.540712


In [79]:
# Sanity check: list the wide age bins that actually occur in the final table.
zcta_wide_age_pct_risk['wide_age_bin'].unique()

array(['0-4', '20-49', '5-19', '50-64', '65+'], dtype=object)

In [80]:
# Write the final per-ZCTA, per-age-bin table to disk.
# NOTE(review): the absolute path only exists on the original author's machine.
# The row index is also written (pandas default), so the file gains an
# unnamed first column when it is read back.
zcta_wide_age_pct_risk.to_csv(
    path_or_buf='/Users/kpierce/epimodels/sandbox-scripts/tx_zcta_high_risk.csv'
)