# 4. ACS Data

EPA data is collected, so now we have to compile the ACS data!

In [2]:
# set up path to app credentials - see exploration/README.md
# (exported as an environment variable for this kernel via the %env magic)
%env GOOGLE_APPLICATION_CREDENTIALS=../google_app_credentials.json

# set up bigquery client; `bq` is the handle used by the query cells below
from google.cloud import bigquery
bq = bigquery.Client()

env: GOOGLE_APPLICATION_CREDENTIALS=../google_app_credentials.json


In [3]:
# set up some dependencies
import json, time, random, csv
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Collect every distinct survey year present across the ACS CBSA tables,
# in ascending order; `years` drives the per-year loops further down.
years_sql = '''
    SELECT DISTINCT do_date AS year
    FROM `eosc410-project.data.acs_cbsa_20*`
    ORDER BY do_date ASC
'''
resp = bq.query(years_sql)
years = []
for row in resp:
    years.append(row["year"])

## Selecting Metrics for Assessing Societal Changes

The ACS collects a large number of metrics for each GeoID (in this case, each GeoID is a CBSA region). We're interested in the fields selected in the query below.

In [11]:
# Pull the ACS metrics of interest for every CBSA (geoid) and survey year into
# a single DataFrame, one row per (geoid, year).
#
# Fix: the rent-burden buckets previously summed rent_20_to_25_percent into
# rent_25_to_50_percent and never used rent_25_to_30_percent, so the 25-30%
# bucket was dropped and the 20-25% bucket was counted in the wrong group.
# The 25-30% bucket now goes into rent_25_to_50_percent and the 20-25% bucket
# into rent_under_25_percent.
#
# NOTE(review): median_structure_age is measured against the constant 2018 for
# every row, not against the row's own survey year - confirm this is intended.
# NOTE(review): the commute sums add buckets that look overlapping (e.g.
# commute_35_44_mins next to commute_35_39_mins and commute_40_44_mins) and a
# NULL in any operand makes the whole sum NULL - verify against the table
# schema for the years where the buckets differ.
resp = bq.query('''
    SELECT
        geo_id AS geoid,
        do_date AS year,
        
        # population
        total_pop,
        median_age,
        nonfamily_households,
        family_households,
        
        # education
        high_school_including_ged,
        graduate_professional_degree,
        some_college_and_associates_degree,

        # infrastructure and housing
        housing_units,
        occupied_housing_units,
        2018-CAST(median_year_structure_built AS INT64) AS median_structure_age,
        dwellings_1_units_detached + dwellings_1_units_attached + dwellings_2_units + dwellings_3_to_4_units AS dwellings_under_4_units,
        dwellings_5_to_9_units + dwellings_10_to_19_units AS dwellings_5_to_19_units,
        dwellings_20_to_49_units,
        dwellings_50_or_more_units,

        # commute
        walked_to_work,
        commuters_by_public_transportation,
        commuters_by_car_truck_van,
        commute_less_10_mins + commute_5_9_mins AS commute_under_10_mins,
        commute_10_14_mins + commute_15_19_mins + commute_20_24_mins + commute_25_29_mins AS commute_10_29_mins,
        # 2013- has different buckets
        commute_30_34_mins + commute_35_39_mins + commute_35_44_mins + commute_40_44_mins + commute_45_59_mins AS commute_30_59_mins,
        commute_60_more_mins + commute_60_89_mins + commute_90_more_mins AS commute_over_60_mins,

        # economic
        income_per_capita,
        gini_index, # 0 to 1, with 0 being perfect income equality
        poverty, # number of people in poverty
        rent_over_50_percent,
        rent_40_to_50_percent + rent_35_to_40_percent + rent_30_to_35_percent + rent_25_to_30_percent AS rent_25_to_50_percent,
        rent_20_to_25_percent + rent_15_to_20_percent + rent_10_to_15_percent + rent_under_10_percent AS rent_under_25_percent,
        median_rent,
    FROM `eosc410-project.data.acs_cbsa_20*`
    ORDER BY year
''')
data = resp.to_dataframe()

In [10]:
# Show the pulled frame, then persist it unchanged as the raw ACS export.
raw_csv_path = '../docs/_dataacs/raw.csv'
print(data)
data.to_csv(raw_csv_path)

geoid  year combined_id   total_pop  median_age  nonfamily_households  \
0     16820  2007   16820.acs    193336.0        35.2               29989.0   
1     12220  2007   12220.acs    130516.0        28.8               22784.0   
2     37900  2007   37900.acs    369721.0        37.5               49897.0   
3     40060  2007   40060.acs   1211608.0        37.1              155939.0   
4     22180  2007   22180.acs    348940.0        32.8               38351.0   
...     ...   ...         ...         ...         ...                   ...   
6194  31080  2018   31080.acs  13291486.0        37.1             1425767.0   
6195  42660  2018   42660.acs   3939363.0        37.0              555278.0   
6196  14460  2018   14460.acs   4875390.0        38.8              691687.0   
6197  41860  2018   41860.acs   4729484.0        39.1              603802.0   
6198  37980  2018   37980.acs   6096372.0        38.8              811592.0   

      family_households  high_school_including_ged  \
0  

Now we organize this data into formats similar to those of our aggregated EPA data.

In [26]:
# Build the "flat" timeseries: one row per year, with one column per
# (geoid, metric) pair named '<geoid>.epa.<metric>'.
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end: DataFrame.append was removed in pandas 2.0, and calling it inside the
# loop re-copied the whole frame on every iteration.
#
# NOTE(review): the ids are tagged '.epa.' even though this notebook handles
# ACS data (the raw export above shows ids such as '16820.acs'). The label is
# kept as-is so existing column names stay stable - confirm whether it should
# be '.acs.'.
metric_cols = [col for col in data.columns if col not in ('geoid', 'year')]
flat_rows = []
for y in years:
    print('%s: processing' % y)

    # records for this year only
    ms = data[data.year == y]

    # every row starts with its year; features are added below
    flat_row = {'year': y}

    # for each record, add each metric column as its own feature
    for record in ms.to_dict('records'):
        geoid = record['geoid']
        for col in metric_cols:
            flat_row['%s.epa.%s' % (geoid, col)] = record[col]

    # each row is a year of data
    flat_rows.append(flat_row)

timeseries_flat = pd.DataFrame(flat_rows)

print('exporting...')
timeseries_flat.to_csv('../docs/_dataacs/timeseries_flat.csv')
print('done')

2007: processing
2008: processing
2009: processing
2010: processing
2011: processing
2012: processing
2013: processing
2014: processing
2015: processing
2016: processing
2017: processing
2018: processing
exporting...
done


In [28]:
# Build the "vertical" timeseries: one row per (geoid, metric) pair, keyed by
# 'combined_id' = '<geoid>.epa.<metric>', with one column per year.
#
# Rows are collected in a list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# re-copied the whole frame on every iteration.
#
# NOTE(review): the ids are tagged '.epa.' even though this notebook handles
# ACS data (the raw export above shows ids such as '16820.acs'). The label is
# kept as-is so existing ids stay stable - confirm whether it should be
# '.acs.'.
metric_cols = [col for col in data.columns if col not in ('geoid', 'year')]
vert_rows = []
for geoid in data['geoid'].unique():
    # all years for this region
    ms = data[data.geoid == geoid]

    # one output row per metric, spanning all years
    for col in metric_cols:
        combined_id = '%s.epa.%s' % (geoid, col)
        vert_cols = {'combined_id': combined_id}

        # for each year with data for this region, add a column; years with
        # no record are left out of the dict and become NaN in the frame
        for y in years:
            vals = ms.loc[ms.year == y, col]
            if len(vals) > 0:
                vert_cols[y] = vals.iloc[0]
        vert_rows.append(vert_cols)

# 'combined_id' first, then the years in the order given by `years`; this also
# keeps the 'combined_id' column present when there are no rows at all
timeseries_vert = pd.DataFrame(vert_rows, columns=['combined_id'] + list(years))

print('exporting...')
timeseries_vert.to_csv('../docs/_dataacs/timeseries_vert.csv')
print('done')

exporting...
done
