In [1]:
## Standard Imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import json

In [2]:
# Load the filtered SDOH table from CSV (path is relative to the working
# directory) and preview the first rows.
sdoh_csv_path = 'filtered_SDOHs.csv'
df = pd.read_csv(sdoh_csv_path)
df.head()

Unnamed: 0,YEAR,STATEFIPS,ZIPCODE,ZCTA,POINT_ZIP,ACS_TOT_POP_WT_ZC,ACS_TOT_POP_US_ABOVE1_ZC,ACS_TOT_POP_ABOVE5_ZC,ACS_TOT_POP_ABOVE15_ZC,ACS_TOT_POP_ABOVE16_ZC,...,CEN_POPDENSITY_ZC,HIFLD_DIST_UC_ZP,POS_DIST_ED_ZP,POS_DIST_MEDSURG_ICU_ZP,POS_DIST_TRAUMA_ZP,POS_DIST_PED_ICU_ZP,POS_DIST_OBSTETRICS_ZP,POS_DIST_CLINIC_ZP,POS_DIST_ALC_ZP,CEN_AIAN_NH_IND
0,2020,6,90622,90620,1,46228,45660,43603,37830,37449,...,,1.67,1.16,1.16,6.86,7.66,1.62,3.19,6.51,0
1,2020,6,90620,90620,0,46228,45660,43603,37830,37449,...,,1.73,0.74,0.74,6.86,7.91,1.48,3.39,6.0,0
2,2020,6,90624,90621,1,35448,34931,33289,28342,27917,...,,1.38,0.81,0.81,6.32,7.89,2.84,3.48,7.29,0
3,2020,6,90621,90621,0,35448,34931,33289,28342,27917,...,,1.78,0.45,0.45,5.91,8.3,2.81,3.89,7.36,0
4,2020,6,90623,90623,0,15624,15600,14981,13344,13229,...,,0.93,1.97,1.97,5.7,9.57,0.24,4.21,4.65,0


In [3]:
# Total Orange County population; used later as the denominator when the
# per-row population weights are computed.
# NOTE(review): hard-coded value -- the source and reference year are not
# recorded in this notebook; confirm it matches the ACS vintage of the data.
ocPop = 3167000

# Short analysis-friendly column name -> source column in the SDOH table.
# Insertion order determines the column order of picked_data.
selected_columns = {
    'zcta': 'ZCTA',
    'tot_pop_zcta': 'ACS_TOT_POP_WT_ZC',
    'unemployed_civ': 'ACS_PCT_UNEMPLOY_ZC',
    'no_vehicle': 'ACS_PCT_HU_NO_VEH_ZC',
    'no_school_job': 'ACS_PCT_NO_WORK_NO_SCHL_16_19_ZC',
    'no_food_stamps': 'ACS_PCT_HH_NO_FD_STMP_BLW_POV_ZC',
    'distance_clinic': 'POS_DIST_CLINIC_ZP',
    'uninsured': 'ACS_PCT_UNINSURED_ZC',
}

# Copy each selected indicator into a fresh frame under its short name.
# The preview below shows that a ZCTA can appear on more than one row.
picked_data = pd.DataFrame({})
for short_name, source_column in selected_columns.items():
    picked_data[short_name] = df[source_column]
picked_data.head()

Unnamed: 0,zcta,tot_pop_zcta,unemployed_civ,no_vehicle,no_school_job,no_food_stamps,distance_clinic,uninsured
0,90620,46228,4.42,2.87,2.5,7.08,3.19,6.13
1,90620,46228,4.42,2.87,2.5,7.08,3.39,6.13
2,90621,35448,4.67,5.1,0.0,6.19,3.48,9.89
3,90621,35448,4.67,5.1,0.0,6.19,3.89,9.89
4,90623,15624,2.71,6.29,0.0,6.34,4.21,3.75


Economic Stability  
Percentage of civilian labor force that is unemployed (ages 16 and over, ZCTA level) 
#### (ACS_PCT_UNEMPLOY_ZC) (FS)
Neighborhood and Physical Environment 
Percentage of housing units with no vehicle available (ZCTA level) 
#### (ACS_PCT_HU_NO_VEH_ZC)
Education 
Percentage of teens and young adults (ages 16 to 19) who are unemployed and not in school (ZCTA level)
#### (ACS_PCT_NO_WORK_NO_SCHL_16_19_ZC) (HX)
Food 
Percentage of households not receiving food stamps/SNAP with income below the poverty level (ZCTA level)
#### (ACS_PCT_HH_NO_FD_STMP_BLW_POV_ZC) (HT)
Community and Social Context 
Distance in miles to the nearest health clinic (Federally Qualified Health Center [FQHC] or Rural Health Clinic [RHC]), calculated using population-weighted ZIP centroids
#### (POS_DIST_CLINIC_ZP) (LM)
Health Care System 
Percentage of population with no health insurance coverage (ZCTA level) 
#### (ACS_PCT_UNINSURED_ZC) (LC)


## Weighting the Data by Population

- Get the total population of Orange County (`ocPop`).
- Divide each ZCTA region's population by the total population of Orange County.
- This gives us the population weight for each region.
- Multiply the weight for each region by the statistic we want to use, then scale the product by 100 (as the code below does).
- We now have data that is weighted by population for each ZCTA region.

In [4]:
# Show the total Orange County population (ocPop) that each region's
# population is divided by to obtain its weight.
print(ocPop)

3167000


In [5]:
# Population weight per row: the row's tot_pop_zcta divided by the county
# total (ocPop).  Adds a 'weights' column to picked_data in place.
picked_data['weights'] = picked_data['tot_pop_zcta'].div(ocPop)

In [6]:
# Preview picked_data to confirm the 'weights' column was added.
picked_data.head()

Unnamed: 0,zcta,tot_pop_zcta,unemployed_civ,no_vehicle,no_school_job,no_food_stamps,distance_clinic,uninsured,weights
0,90620,46228,4.42,2.87,2.5,7.08,3.19,6.13,0.014597
1,90620,46228,4.42,2.87,2.5,7.08,3.39,6.13,0.014597
2,90621,35448,4.67,5.1,0.0,6.19,3.48,9.89,0.011193
3,90621,35448,4.67,5.1,0.0,6.19,3.89,9.89,0.011193
4,90623,15624,2.71,6.29,0.0,6.34,4.21,3.75,0.004933


In [7]:
# Output column -> picked_data indicator it is derived from.
# NOTE(review): 'w_umemployed' looks like a misspelling of 'w_unemployed';
# it is kept as-is because the name is written into the exported JSON --
# confirm with consumers of that file before renaming.
weighted_columns = {
    'w_umemployed': 'unemployed_civ',
    'w_no_vehicle': 'no_vehicle',
    'w_no_school_job': 'no_school_job',
    'w_no_food_stamps': 'no_food_stamps',
    'w_distance_clinic': 'distance_clinic',
    'w_uninsured': 'uninsured',
}

# Build the weighted table: keep the ZCTA identifier, then for every indicator
# store indicator * population weight * 100.
weighted_data = pd.DataFrame({})
weighted_data['zcta'] = picked_data['zcta']
for weighted_name, indicator in weighted_columns.items():
    weighted_data[weighted_name] = picked_data[indicator] * picked_data['weights'] * 100

In [8]:
# Preview the first rows of the population-weighted indicators.
weighted_data.head()

Unnamed: 0,zcta,w_umemployed,w_no_vehicle,w_no_school_job,w_no_food_stamps,w_distance_clinic,w_uninsured
0,90620,6.451776,4.189276,3.649195,10.33452,4.656373,8.947826
1,90620,6.451776,4.189276,3.649195,10.33452,4.948308,8.947826
2,90621,5.227097,5.708393,0.0,6.928422,3.895139,11.069805
3,90621,5.227097,5.708393,0.0,6.928422,4.354049,11.069805
4,90623,1.336945,3.103093,0.0,3.12776,2.076951,1.850016


In [23]:
# Serialize weighted_data to a JSON string: orient='records' emits a list with
# one object per row, and indent=1 pretty-prints it with one-space indentation.
json_obj = weighted_data.to_json(orient='records', indent=1)

In [22]:
# Write the serialized records to disk, overwriting any existing file.
# json_obj must already hold the string built from weighted_data.
output_path = 'weighted_sdoh_data.json'
with open(output_path, mode='w') as json_file:
    json_file.write(json_obj)