# 1. ACS Boundaries

Our first step in this project is to understand what kind of regions we are working with.

We'll start off with CBSA boundaries, which are available from the official census.gov websites.

In [1]:
# utility functions for downloading things
import requests, zipfile, io
def download_url(url, save_path, timeout=60):
    """Download the zip archive at `url` and extract its contents into `save_path`.

    Args:
        url: HTTP(S) address of a zip archive.
        save_path: directory the archive members are extracted into.
        timeout: seconds to wait on the server before giving up
            (forwarded to requests.get).

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or an HTTP error status.
        zipfile.BadZipFile: if the response body is not a valid zip archive.
    """
    r = requests.get(url, timeout=timeout)
    # fail on 4xx/5xx here, instead of with a confusing BadZipFile on the error page
    r.raise_for_status()
    # the archive is held in memory; the context manager closes it once extracted
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(save_path)

In [2]:
# working with shapefiles
import shapefile as shp # pip install pyshp
import matplotlib.pyplot as plt

In [3]:
# set up path to app credentials - see exploration/README.md
%env GOOGLE_APPLICATION_CREDENTIALS=../google_app_credentials.json

# Set up the BigQuery client that the query cells below use as `bq`.
# Client() is constructed without arguments; presumably it picks up the
# credentials file named in GOOGLE_APPLICATION_CREDENTIALS above - TODO confirm.
from google.cloud import bigquery
bq = bigquery.Client()

env: GOOGLE_APPLICATION_CREDENTIALS=../google_app_credentials.json


We have ACS data available from 2007 to 2018:

In [4]:
# ask BigQuery for every distinct do_date across the acs_cbsa_* tables, oldest first
resp = bq.query('''
    SELECT DISTINCT do_date AS year
    FROM `eosc410-project.data.acs_cbsa_*`
    ORDER BY do_date ASC
''')

# pull the "year" column out of each result row, preserving the query's ordering
years = []
for row in resp:
    years.append(row["year"])
print(years)

['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']


Now we can download all the boundary data from [census.gov](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html):

In [5]:
def _cbsa_zip_url(y):
    """Return the census.gov TIGER/Line CBSA archive URL for the year string `y`."""
    if y == '2007': # 2007 has a special format...
        target = 'fe_%s_us_cbsa' % y
        return 'https://www2.census.gov/geo/tiger/TIGER%sFE/%s.zip' % (y, target)
    if y == '2010': # this single year has a slightly different path...
        return 'https://www2.census.gov/geo/tiger/TIGER2010/CBSA/2010/tl_2010_us_cbsa10.zip'

    target = 'tl_%s_us_cbsa' % y
    if y in ('2008', '2009'): # some random years have special formats too...
        return 'https://www2.census.gov/geo/tiger/TIGER%s/%s.zip' % (y, target)
    return 'https://www2.census.gov/geo/tiger/TIGER%s/CBSA/%s.zip' % (y, target)


# Every year goes through the same fetch + error handling, so one bad archive
# never aborts the remaining downloads and every failure is reported with its cause.
download_failures = []
for y in years:
    save_to = '../docs/_datatmp/acs_cbsa_%s' % y
    url = _cbsa_zip_url(y)

    print('fetching "%s" to "%s"' % (url, save_to))
    try:
        download_url(url, save_to)
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt the loop
        print('failed to fetch "%s": %r' % (url, err))
        download_failures.append(y)

if download_failures:
    print('Done, but these years failed to download:', download_failures)
else:
    print('Done!')

fetching "https://www2.census.gov/geo/tiger/TIGER2007FE/fe_2007_us_cbsa.zip" to "../docs/_datatmp/acs_cbsa_2007"
fetching "https://www2.census.gov/geo/tiger/TIGER2008/tl_2008_us_cbsa.zip" to "../docs/_datatmp/acs_cbsa_2008"
fetching "https://www2.census.gov/geo/tiger/TIGER2009/tl_2009_us_cbsa.zip" to "../docs/_datatmp/acs_cbsa_2009"
fetching "https://www2.census.gov/geo/tiger/TIGER2010/CBSA/2010/tl_2010_us_cbsa10.zip" to "../docs/_datatmp/acs_cbsa_2010"
fetching "https://www2.census.gov/geo/tiger/TIGER2011/CBSA/tl_2011_us_cbsa.zip" to "../docs/_datatmp/acs_cbsa_2011"
fetching "https://www2.census.gov/geo/tiger/TIGER2012/CBSA/tl_2012_us_cbsa.zip" to "../docs/_datatmp/acs_cbsa_2012"
fetching "https://www2.census.gov/geo/tiger/TIGER2013/CBSA/tl_2013_us_cbsa.zip" to "../docs/_datatmp/acs_cbsa_2013"
fetching "https://www2.census.gov/geo/tiger/TIGER2014/CBSA/tl_2014_us_cbsa.zip" to "../docs/_datatmp/acs_cbsa_2014"
fetching "https://www2.census.gov/geo/tiger/TIGER2015/CBSA/tl_2015_us_cbsa.zip

Let's see what we have here. CBSA boundaries shift from year to year, since they are regions anchored on "urban centers" of a certain number of people (see [definition](https://en.wikipedia.org/wiki/Core-based_statistical_area)). Let's make sure the boundaries still look reasonable and don't shift so dramatically that they become difficult to use:

In [6]:
def load_shapefile(y):
    """Open the downloaded CBSA boundary shapefile for the year string `y`.

    Returns a shp.Reader for that year's .shp file; 2007 and 2010 use
    different file names from every other year.
    """
    if y == '2007':
        path = '../docs/_datatmp/acs_cbsa_2007/fe_2007_us_cbsa.shp'
    elif y == '2010':
        # why??????
        path = '../docs/_datatmp/acs_cbsa_2010/tl_2010_us_cbsa10.shp'
    else:
        path = '../docs/_datatmp/acs_cbsa_%s/tl_%s_us_cbsa.shp' % (y, y)

    # random stackoverflow says these might be latin-1 encoding for some reason... it worked eh
    # https://stackoverflow.com/questions/5552555/unicodedecodeerror-invalid-continuation-byte
    return shp.Reader(path, encoding='latin-1')

# load years of shapes
cbsa_years = [load_shapefile(y) for y in years]
print(len(cbsa_years), 'years of data loaded')

# test files: shapeRecords() reads every shape and record of a year, so an
# unreadable or badly encoded file surfaces here instead of in a later cell
unreadable_years = []
for year, reader in zip(years, cbsa_years):
    try:
        reader.shapeRecords()
        print(year, 'okay')
    except Exception as err:
        print(year, 'failed to load records')
        # show the exception class (the old sys.exc_info() call needed an import
        # of `sys` that this notebook never makes)
        print(type(err))
        unreadable_years.append(year)

# only claim success when every year actually loaded
if unreadable_years:
    print('failed years:', unreadable_years)
else:
    print('all okay')

12 years of data loaded
2007 okay
2008 okay
2009 okay
2010 okay
2011 okay
2012 okay
2013 okay
2014 okay
2015 okay
2016 okay
2017 okay
2018 okay
all okay


In [3]:
import math

def _outline_xy(shape):
    """Return (xs, ys) for a pyshp shape with a NaN between its parts.

    A shape can consist of several rings (e.g. a mainland plus islands).
    `shape.parts` holds the index in `shape.points` where each ring starts;
    inserting NaN between rings makes matplotlib lift the pen there instead of
    drawing a straight line from the end of one ring to the start of the next.
    """
    points = shape.points
    # fall back to a single part covering all points when no parts are recorded
    starts = list(getattr(shape, 'parts', None) or [0])
    bounds = starts + [len(points)]

    xs, ys = [], []
    for start, end in zip(bounds[:-1], bounds[1:]):
        if xs:
            xs.append(float('nan'))
            ys.append(float('nan'))
        xs.extend(p[0] for p in points[start:end])
        ys.extend(p[1] for p in points[start:end])
    return xs, ys


plt.figure(figsize=(24,12))
columns = 4
# enough rows to fit one panel per year (3 rows for 12 years)
rows = math.ceil(len(cbsa_years) / columns)

# pair each reader with its year directly instead of tracking a separate counter
for subplot, (year, cbsa_year) in enumerate(zip(years, cbsa_years)):
    print('plotting year', year)

    plt.subplot(rows, columns, subplot+1)
    for shape in cbsa_year.shapeRecords():
        # one plot call per CBSA, so each region keeps a single colour
        x, y = _outline_xy(shape.shape)
        plt.plot(x,y)
    plt.title(year)

print('all years plotted')
plt.savefig('./figs/1/cbsa_all_years.png')

NameError: name 'cbsa_years' is not defined

Great! Let's dump these boundaries to GeoJSON to make them less painful to load. I had forgotten that [geopandas](https://geopandas.org/io.html) exists, so we'll leverage it in the next part.

In [10]:
from json import dumps

# Write one GeoJSON FeatureCollection per year next to that year's shapefile.
for year, reader in zip(years, cbsa_years):
    out = '../docs/_datatmp/acs_cbsa_%s/geojson.json' % year

    # adjusted from https://gist.github.com/agalea91/c0e0d1897d1d98a0029ac0baa02b9fca
    # skip the first entry of reader.fields (pyshp's DeletionFlag placeholder),
    # which has no matching value in the records
    field_names = [field[0] for field in reader.fields[1:]]

    features = []
    for sr in reader.shapeRecords():
        # Make sure everything is utf-8 compatible
        record = [r.decode('utf-8', 'ignore') if isinstance(r, bytes)
                  else r for r in sr.record]
        features.append(dict(type="Feature",
                             geometry=sr.shape.__geo_interface__,
                             properties=dict(zip(field_names, record))))

    # write the GeoJSON file; mode "w" truncates any previous export, so no
    # explicit remove is needed (the old os.remove call relied on an `os`
    # import that this notebook never makes)
    with open(out, "w") as geojson:
        geojson.write(dumps({"type": "FeatureCollection", "features": features}) + "\n")

print('Done!')

SyntaxError: invalid syntax (<ipython-input-10-cac3695805a4>, line 28)