In [1]:
import pandas as pd
import numpy as np
from sodapy import Socrata
import os
import requests
import matplotlib.pyplot as plt
import pyproj
import datetime
import json
import sys

pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
%matplotlib inline

This code obtains the American Community Survey data from the Census API that is used in this study. We use 2016 data as covariates and the 2009 data for the propensity score analysis.

In [2]:
# Get 2016 ACS data from Census API
acs_results_2016 = requests.get('https://api.census.gov/data/2016/acs/acs5?get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B25031_001E,B06011_001E,B06009_005E&for=place:*&key=91aa6a44edb890abeea862f2d9befb5d520a5cc4')

In [6]:
# Convert 2016 request to data frame
data = acs_results_2016.text
city_df = pd.read_json(data)
city_df.columns = city_df.iloc[0]
city_df.drop([0], inplace = True) 

In [7]:
city_df.head()

Unnamed: 0,NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B25031_001E,B06011_001E,B06009_005E,state,place
1,"Abanda CDP, Alabama",151,0,24,18,0,71,138,13,0,32.9,-666666666,12908,0,1,100
2,"Abbeville city, Alabama",2627,81,72,52,41,1359,1386,1211,39,50.4,514,17653,116,1,124
3,"Adamsville city, Alabama",4422,168,95,63,91,2164,2147,2243,28,42.0,844,26402,301,1,460
4,"Addison town, Alabama",757,18,41,26,11,383,722,0,0,46.2,471,20765,64,1,484
5,"Akron town, Alabama",252,0,6,8,0,121,36,216,0,43.9,533,11875,5,1,676


The ACS data for the three counties included in this study (El Dorado, Boulder, and Clark counties) as well as Middleborough Township had to be obtained separately

In [8]:
el_dorado = requests.get('https://api.census.gov/data/2016/acs/acs5?get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B25031_001E,B06011_001E,B06009_005E&for=county:017&in=state:06&key=91aa6a44edb890abeea862f2d9befb5d520a5cc4')

In [9]:
ed_data = el_dorado.text
ed_df = pd.read_json(ed_data)
ed_df.columns = ed_df.iloc[0]
ed_df.drop([0], inplace = True) 
ed_df.rename(columns={'county':'place'}, inplace=True)

In [11]:
mb = requests.get('https://api.census.gov/data/2016/acs/acs5?get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B25031_001E,B06011_001E,B06009_005E&for=county%20subdivision:40850&in=state:25%20county:023&key=91aa6a44edb890abeea862f2d9befb5d520a5cc4')
mb_data = mb.text
mb_df = pd.read_json(mb_data)
mb_df.columns = mb_df.iloc[0]
mb_df.drop([0], inplace = True)
mb_df['place'] = mb_df['county'] + mb_df['county subdivision']
mb_df.drop(columns = ['county', 'county subdivision'], inplace = True)

In [12]:
boulder = requests.get('https://api.census.gov/data/2016/acs/acs5?get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B25031_001E,B06011_001E,B06009_005E&for=county:013&in=state:08&key=91aa6a44edb890abeea862f2d9befb5d520a5cc4')
b_data = boulder.text
b_df = pd.read_json(b_data)
b_df.columns = b_df.iloc[0]
b_df.drop([0], inplace = True) 
b_df.rename(columns={'county':'place'}, inplace=True)

In [13]:
clark = requests.get('https://api.census.gov/data/2016/acs/acs5?get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B25031_001E,B06011_001E,B06009_005E&for=county:011&in=state:53&key=91aa6a44edb890abeea862f2d9befb5d520a5cc4')
c_data = clark.text
c_df = pd.read_json(c_data)
c_df.columns = c_df.iloc[0]
c_df.drop([0], inplace = True) 
c_df.rename(columns={'county':'place'}, inplace=True)

In [15]:
total_df = pd.concat([city_df, ed_df, b_df, mb_df, c_df])

In [16]:
total_df.tail()

Unnamed: 0,NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B25031_001E,B06011_001E,B06009_005E,state,place
29574,"Yaurel comunidad, Puerto Rico",1255,0,33,27,80,580,635,395,1255,36.0,-666666666,,,72,88121
1,"El Dorado County, California",183000,4543,4617,4118,4554,91349,159165,1784,22868,45.2,1105,31086.0,27316.0,6,17
1,"Boulder County, Colorado",313961,11803,10105,9852,9696,157617,278048,2914,42914,36.0,1236,31927.0,64270.0,8,13
1,"Middleborough town, Plymouth County, Massachus...",24042,515,828,918,892,12088,22763,422,296,43.0,1111,36214.0,3413.0,25,2340850
1,"Clark County, Washington",450893,14121,14340,13967,14916,222759,381593,8486,39042,37.8,1022,30953.0,54925.0,53,11


In [17]:
# Rename variables
variables = {'B01001_001E':"population",'B01001_011E':"m_25_29", 'B01001_012E':'m_30_34', 
             'B01001_035E': "f_25_29", 'B01001_036E':"f_30_34",'B01001_002E':"total_male",'DP05_0066PE': "pct_hispanic",
             'B02001_002E': "total_white",'B02001_003E':"total_black",'B03001_003E': "total_hispanic", 'B01002_001E':"median_age",'B25031_001E':"median_gross_rent",
             'B06011_001E': "median_income",'B06009_005E':"total_bachelor"}
total_df.rename(columns = variables, inplace=True)


In [18]:
# Calculate percentage variables
cols = ['population', 'f_25_29', 'm_25_29', 'f_30_34', 'm_30_34', 'total_male', 'total_black', 'total_white', 
       'total_hispanic', 'total_bachelor']
total_df[cols] = total_df[cols].apply(pd.to_numeric, errors='coerce')

total_df['pct_25_34'] = (total_df['f_25_29'] + total_df['m_25_29'] + total_df['f_30_34']+ total_df['f_30_34'])/total_df['population']
total_df['pct_male'] = total_df['total_male']/total_df['population']
total_df['pct_black'] = total_df['total_black']/total_df['population']
total_df['pct_white'] = total_df['total_white']/total_df['population']
total_df['pct_hispanic'] = total_df['total_hispanic']/total_df['population']
total_df['pct_bachelor'] = total_df['total_bachelor']/total_df['population']

In [19]:
# Separate city and state from name variable
cs = total_df['NAME'].str.split(",", expand = True)
total_df['city'] = cs[0]
total_df['state_name'] = cs[1]

In [22]:
# Read state code data in for merging of dataframes
st = pd.read_table('state_code.txt', sep = '|')
st.rename(index = str, columns = {'STATE': 'state','STUSAB':'state_code', 'STATE_NAME': 'state_name'}, inplace = True)
st['state'] = st['state'].apply(lambda x: '{0:0>2}'.format(x))
st = st[['state', 'state_code']]

In [23]:
total_df = total_df.merge(st, how = 'left', on = 'state')

In [25]:
total_df.to_csv('city_data.csv', index= False)

In [26]:
total_df.tail()

Unnamed: 0,NAME,population,m_25_29,m_30_34,f_25_29,f_30_34,total_male,total_white,total_black,total_hispanic,median_age,median_gross_rent,median_income,total_bachelor,state,place,year,pct_25_34,pct_male,pct_black,pct_white,pct_hispanic,pct_bachelor,city,state_name,state_code
29573,"Yaurel comunidad, Puerto Rico",1255,0,33,27,80,580,635,395,1255,36.0,-666666666,,,72,88121,2016,0.149004,0.462151,0.314741,0.505976,1.0,,Yaurel comunidad,Puerto Rico,PR
29574,"El Dorado County, California",183000,4543,4617,4118,4554,91349,159165,1784,22868,45.2,1105,31086.0,27316.0,6,17,2016,0.097098,0.499175,0.009749,0.869754,0.124962,0.149268,El Dorado County,California,CA
29575,"Boulder County, Colorado",313961,11803,10105,9852,9696,157617,278048,2914,42914,36.0,1236,31927.0,64270.0,8,13,2016,0.130739,0.502027,0.009281,0.885613,0.136686,0.204707,Boulder County,Colorado,CO
29576,"Middleborough town, Plymouth County, Massachus...",24042,515,828,918,892,12088,22763,422,296,43.0,1111,36214.0,3413.0,25,2340850,2016,0.133808,0.502787,0.017553,0.946801,0.012312,0.14196,Middleborough town,Plymouth County,MA
29577,"Clark County, Washington",450893,14121,14340,13967,14916,222759,381593,8486,39042,37.8,1022,30953.0,54925.0,53,11,2016,0.128456,0.49404,0.01882,0.846305,0.086588,0.121814,Clark County,Washington,WA


In [27]:
# Get 2009 ACS data from Census API, note that 2009 data does not have median gross rent variable
acs_results_2009 = requests.get('https://api.census.gov/data/2009/acs5?key=91aa6a44edb890abeea862f2d9befb5d520a5cc4&get=B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B06011_001E,B06009_005E,NAME&for=place:*')
data = acs_results_2009.text
city_df = pd.read_json(data)
city_df.columns = city_df.iloc[0]
city_df.drop([0], inplace = True) 

el_dorado_09 = requests.get('https://api.census.gov/data/2009/acs5?key=91aa6a44edb890abeea862f2d9befb5d520a5cc4&get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B06011_001E,B06009_005E&for=county:017&in=state:06')
ed_data = el_dorado_09.text
ed_df = pd.read_json(ed_data)
ed_df.columns = ed_df.iloc[0]
ed_df.drop([0], inplace = True) 
ed_df.rename(columns={'county':'place'}, inplace=True)

mb_09 = requests.get('https://api.census.gov/data/2009/acs5?key=91aa6a44edb890abeea862f2d9befb5d520a5cc4&get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B06011_001E,B06009_005E&for=county%20subdivision:40850&in=state:25%20county:023')
mb_data = mb_09.text
mb_df = pd.read_json(mb_data)
mb_df.columns = mb_df.iloc[0]
mb_df.drop([0], inplace = True)
mb_df['place'] = mb_df['county'] + mb_df['county subdivision']
mb_df.drop(columns = ['county', 'county subdivision'], inplace = True)

boulder_09 = requests.get('https://api.census.gov/data/2009/acs5?key=91aa6a44edb890abeea862f2d9befb5d520a5cc4&get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B06011_001E,B06009_005E&for=county:013&in=state:08')
b_data = boulder_09.text
b_df = pd.read_json(b_data)
b_df.columns = b_df.iloc[0]
b_df.drop([0], inplace = True) 
b_df.rename(columns={'county':'place'}, inplace=True)
total_df = pd.concat([city_df, ed_df, b_df, mb_df])

clark_09 = requests.get('https://api.census.gov/data/2009/acs5?key=91aa6a44edb890abeea862f2d9befb5d520a5cc4&get=NAME,B01001_001E,B01001_011E,B01001_012E,B01001_035E,B01001_036E,B01001_002E,B02001_002E,B02001_003E,B03001_003E,B01002_001E,B06011_001E,B06009_005E&for=county:011&in=state:53')
c_data = clark_09.text
c_df = pd.read_json(c_data)
c_df.columns = c_df.iloc[0]
c_df.drop([0], inplace = True) 
c_df.rename(columns={'county':'place'}, inplace=True)
total_df = pd.concat([city_df, ed_df, b_df, c_df, mb_df])


variables = {'B01001_001E':"population_09",'B01001_011E':"m_25_29", 'B01001_012E':'m_30_34', 
             'B01001_035E': "f_25_29", 'B01001_036E':"f_30_34",'B01001_002E':"total_male",
             'B02001_002E': "total_white",'B02001_003E':"total_black",'B03001_003E': "total_hispanic", 'B01002_001E':"median_age_09",
             'B06011_001E': "median_income_09",'B06009_005E':"total_bachelor"}
total_df.rename(columns = variables, inplace=True)
total_df['year'] = 2009

In [28]:
total_df.head()

Unnamed: 0,population_09,total_male,m_25_29,m_30_34,f_25_29,f_30_34,median_age_09,total_white,total_black,total_hispanic,total_bachelor,median_income_09,NAME,place,state,year
1,64,36,0,4,1,6,39.5,40,0,10,7,35833,"Adak city, Alaska",65,2,2009
2,101,69,11,0,2,6,29.6,10,0,22,7,8688,"Akhiok city, Alaska",650,2,2009
3,579,309,3,27,19,11,26.3,22,0,0,9,11667,"Akiachak CDP, Alaska",760,2,2009
4,289,137,22,8,15,13,26.9,16,0,0,12,11058,"Akiak city, Alaska",870,2,2009
5,1210,813,139,139,49,8,37.6,77,275,144,116,20996,"Akutan city, Alaska",1090,2,2009


In [29]:
# Calculate percentage variables
cols = ['population_09', 'f_25_29', 'm_25_29', 'f_30_34', 'm_30_34', 'total_male', 'total_black', 'total_white', 
       'total_hispanic', 'total_bachelor']
total_df[cols] = total_df[cols].apply(pd.to_numeric, errors='coerce')

total_df['pct_25_34_09'] = (total_df['f_25_29'] + total_df['m_25_29'] + total_df['f_30_34']+ total_df['f_30_34'])/total_df['population_09']
total_df['pct_male_09'] = total_df['total_male']/total_df['population_09']
total_df['pct_black_09'] = total_df['total_black']/total_df['population_09']
total_df['pct_white_09'] = total_df['total_white']/total_df['population_09']
total_df['pct_hispanic_09'] = total_df['total_hispanic']/total_df['population_09']
total_df['pct_bachelor_09'] = total_df['total_bachelor']/total_df['population_09']

cols_to_keep = ['population_09', 'median_age_09', 'median_income_09', 'NAME', 'place', 'state', 'year', 'pct_25_34_09', 
                'pct_male_09', 'pct_black_09', 'pct_white_09', 'pct_hispanic_09', 'pct_bachelor_09']

total_df = total_df[cols_to_keep]

In [30]:
# Merge and save 2009 data
cs = total_df['NAME'].str.split(",", expand = True)
total_df['city'] = cs[0]
total_df['state_name'] = cs[1]
st = pd.read_table('state_code.txt', sep = '|')
st.rename(index = str, columns = {'STATE': 'state','STUSAB':'state_code', 'STATE_NAME': 'state_name'}, inplace = True)
st['state'] = st['state'].apply(lambda x: '{0:0>2}'.format(x))
st = st[['state', 'state_code']]
total_df = total_df.merge(st, how = 'left', on = 'state')
total_df.to_csv('city_data_2009.csv', index= False)