In [1]:
from pathlib import Path
import pandas as pd
import csv
import json

[CDC Places Data Portal](https://chronicdata.cdc.gov/browse?category=500+Cities+%26+Places)

In [2]:
data_dir = Path(r'../DATA')
cdc_dir = data_dir / 'places'

In [3]:
%%time
dfs = []
for fobj in cdc_dir.glob('PLACES*.*'):
    print(f'loading: {fobj.name}')
    file_year = fobj.name.split('___')[-1].split('_')[0]
    year = str(int(file_year) - 2)
    print(f'datayear: {year}')
    df = pd.read_csv(fobj)
    n_rows, n_cols = df.shape
    print(f'loaded dataframe: {n_rows} rows, {n_cols} cols')
    df['year'] = year
    dfs.append(df)
cdc_df = pd.concat(dfs)

loading: PLACES__County_Data__GIS_Friendly_Format___2020_release.csv
datayear: 2018
loaded dataframe: 3142 rows, 118 cols
loading: PLACES__County_Data__GIS_Friendly_Format___2021_release.csv
datayear: 2019
loaded dataframe: 3142 rows, 126 cols
Wall time: 450 ms


In [4]:
SUFFIX = 'AdjPrev' ## age-adjusted prevelance

In [5]:
base_cols = ['StateAbbr', 'year', 'CountyName', 'CountyFIPS', 'Geolocation']

In [6]:
ref_dir = Path(r'../ref')

In [7]:
label_file = ref_dir / 'cdc_labels.csv'
labels = {}
with open(label_file, 'r') as csv_in:
    reader = csv.DictReader(csv_in)
    for row in reader:
        label_type = row['Type']
        if label_type not in labels.keys():
            labels[label_type] = []
        labels[label_type].append(row['Label'])

In [8]:
keep_cols = base_cols
for label in labels['Outcome']:
    if label not in ['DEPRESSION']:
        keep_cols.append(f'{label}_{SUFFIX}')

In [9]:
limited_df = cdc_df[keep_cols].copy()

In [10]:
limited_df.head()

Unnamed: 0,StateAbbr,year,CountyName,CountyFIPS,Geolocation,ARTHRITIS_AdjPrev,BPHIGH_AdjPrev,CANCER_AdjPrev,CASTHMA_AdjPrev,CHD_AdjPrev,COPD_AdjPrev,DIABETES_AdjPrev,HIGHCHOL_AdjPrev,KIDNEY_AdjPrev,OBESITY_AdjPrev,STROKE_AdjPrev,TEETHLOST_AdjPrev
0,AL,2018,Autauga,1001,POINT (-86.64301145 32.5350198),29.3,36.6,6.5,10.2,6.9,7.7,11.3,31.8,2.8,35.5,3.4,16.3
1,AL,2018,Baldwin,1003,POINT (-87.72275422 30.72811673),27.3,33.1,6.7,9.6,6.0,7.1,9.5,32.3,2.6,29.4,3.0,13.4
2,AL,2018,Barbour,1005,POINT (-85.39330307 31.86925212),33.2,45.8,6.1,11.7,9.0,10.5,16.9,34.4,3.9,40.8,5.1,27.1
3,AL,2018,Bibb,1007,POINT (-87.12653219 32.99854412),30.6,40.8,6.4,10.3,7.6,9.1,12.6,33.2,3.0,38.4,3.8,20.7
4,AL,2018,Blount,1009,POINT (-86.56784586 33.98070621),30.1,37.3,6.6,10.2,7.5,9.0,11.3,33.6,2.9,33.7,3.5,19.3


In [11]:
limited_df['CountyName'].nunique()

1839

In [12]:
ct_df = limited_df[limited_df['StateAbbr'] == 'CT']

In [13]:
ct_df['CountyName'].value_counts()

Fairfield     2
Hartford      2
Litchfield    2
Middlesex     2
New Haven     2
New London    2
Tolland       2
Windham       2
Name: CountyName, dtype: int64

In [14]:
ct_df

Unnamed: 0,StateAbbr,year,CountyName,CountyFIPS,Geolocation,ARTHRITIS_AdjPrev,BPHIGH_AdjPrev,CANCER_AdjPrev,CASTHMA_AdjPrev,CHD_AdjPrev,COPD_AdjPrev,DIABETES_AdjPrev,HIGHCHOL_AdjPrev,KIDNEY_AdjPrev,OBESITY_AdjPrev,STROKE_AdjPrev,TEETHLOST_AdjPrev
308,CT,2018,Fairfield,9001,POINT (-73.38942182 41.27166724),20.3,25.0,6.3,8.9,4.7,4.8,7.4,26.6,2.3,23.1,2.4,8.2
309,CT,2018,Hartford,9003,POINT (-72.73276698 41.80653444),23.2,29.0,6.3,10.2,5.1,5.2,9.0,29.0,2.5,28.9,2.7,10.0
310,CT,2018,Litchfield,9005,POINT (-73.24529195 41.79234069),22.1,26.5,6.6,10.0,5.0,5.2,7.0,27.7,2.2,23.5,2.4,9.5
311,CT,2018,Middlesex,9007,POINT (-72.53513114 41.46333347),23.0,26.7,6.6,9.6,4.9,4.9,7.2,27.6,2.2,27.0,2.4,8.0
312,CT,2018,New Haven,9009,POINT (-72.93263656 41.41052748),24.1,29.7,6.4,11.2,5.5,5.6,9.4,28.2,2.5,31.0,2.7,10.2
313,CT,2018,New London,9011,POINT (-72.10184166 41.48674009),24.7,27.7,6.4,10.1,5.5,5.8,8.7,26.7,2.4,30.8,2.6,9.9
314,CT,2018,Tolland,9013,POINT (-72.33645479 41.8545849),22.7,26.1,6.5,9.7,5.4,5.3,7.4,26.8,2.2,26.6,2.4,8.2
315,CT,2018,Windham,9015,POINT (-71.98753068 41.82994523),27.7,29.5,6.8,11.2,5.8,6.7,9.2,28.2,2.5,30.5,2.7,12.8
164,CT,2019,Fairfield,9001,POINT (-73.38942182 41.27166724),18.2,25.6,6.1,8.2,4.1,4.1,7.9,27.6,2.4,24.4,2.3,8.2
749,CT,2019,New Haven,9009,POINT (-72.93263656 41.41052748),21.5,29.3,6.2,10.6,4.6,5.1,9.6,28.9,2.6,32.3,2.6,10.2


In [15]:
states_file = ref_dir / 'states.json'
with open(states_file, 'r') as js_in:
    states = json.load(js_in)

In [16]:
cdc_locations = limited_df[['StateAbbr', 'CountyName']].drop_duplicates()
cdc_locations['STATE'] = cdc_locations['StateAbbr']
cdc_locations['STATE'].replace(states, inplace=True)
cdc_locations.rename({'StateAbbr': 'ABBR', 'CountyName': 'COUNTY'}, axis=1, inplace=True)
cdc_locations = cdc_locations[['ABBR', 'STATE', 'COUNTY']]
print(f'unique locations: {cdc_locations.shape[0]}')
cdc_locations.head()

unique locations: 3142


Unnamed: 0,ABBR,STATE,COUNTY
0,AL,Alabama,Autauga
1,AL,Alabama,Baldwin
2,AL,Alabama,Barbour
3,AL,Alabama,Bibb
4,AL,Alabama,Blount


In [17]:
loc_file = data_dir / 'cdc_locations.csv'
cdc_locations.to_csv(loc_file, index=False)

In [18]:
limited_df['STATE'] = limited_df['StateAbbr']
limited_df['STATE'].replace(states, inplace=True)
limited_df.rename({'StateAbbr': 'ABBR', 'CountyName': 'COUNTY'}, axis=1, inplace=True)

In [19]:
out_file = cdc_dir / 'cdc_outcomes_data.csv'
limited_df.to_csv(out_file, index=False)