In [19]:
import io
import multiprocessing as mul
import os
from pathlib import Path
import requests
import zipfile

from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as figf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import synapseclient
from synapseclient import Activity, Project, Folder, File, Table, Schema, as_table_columns
from tqdm import tqdm

# Keep every downloaded and intermediate file in a `.gscap` folder under the
# user's home directory.
data_dir = os.path.join(str(Path.home()), '.gscap')
# `exist_ok=True` replaces the check-then-create pattern, so re-running this
# cell (or racing another process) cannot fail on an already-existing folder.
os.makedirs(data_dir, exist_ok=True)

# `login()` is called without arguments, so credentials must come from the
# Synapse client's own configuration -- presumably a cached login; verify.
syn = synapseclient.Synapse()
syn.login()


def dpath(s):
    """Return the path of the file named `s` inside `data_dir`."""
    return os.path.join(data_dir, s)


init_notebook_mode(connected=True)
# Display the value of every expression statement in a cell, not only the last.
InteractiveShell.ast_node_interactivity = 'all'

def isnum(x):
    """Return True when `x` can be converted with `float()`, else False.

    `float()` raises `ValueError` for unparsable strings but `TypeError` for
    objects such as `None` or lists; both are treated as "not a number" so the
    predicate never raises on those inputs.
    """
    try:
        float(x)
    except (TypeError, ValueError):
        return False
    return True

def isstr(x):
    """Return True only when `x` is a `str` instance.

    The previous implementation tried `str(x)`, which succeeds for essentially
    every object, so it always answered True. Callers that guard
    `str(x).title()` with this predicate then turned missing values (NaN)
    into the literal text 'Nan'. Checking the type keeps non-string cells
    untouched.
    """
    return isinstance(x, str)

Welcome, Luke Waninger!



In [None]:
# Resolve the Synapse project that holds all GSCAP data, then store the
# `source_files` folder beneath it; `storage` and `source_files` are reused
# by later cells as parents for uploads.
gscap_project = Project(name='GSCAP Data')
storage = syn.get(gscap_project)

# NOTE(review): `downloadPath` is passed to the Folder constructor as-is;
# confirm the client treats it as a local path and not as an annotation.
source_folder = Folder(name='source_files', parent=storage, downloadPath=data_dir)
source_files = syn.store(source_folder)

### Zipcodes

In [None]:
# Download the Census Bureau's 2016 `zbp16totals` archive, upload the raw table
# to Synapse with provenance, and keep a trimmed frame `a` for later merges.
zname = 'zb16totals'
url =  'https://www2.census.gov/programs-surveys/cbp/datasets/2016/zbp16totals.zip'

# Fail loudly on a bad HTTP status: silently skipping the download would leave
# `a` undefined and surface later as a confusing NameError in the merge cell.
response = requests.get(url)
response.raise_for_status()

# Read every archive member into memory; the archive handle gets its own name
# instead of rebinding the in-memory buffer.
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
    members = [archive.read(member) for member in archive.namelist()]

# Each member overwrites `a` and the stored file, so only the last member
# survives -- presumably the archive holds a single CSV; verify.
for raw in members:
    a = pd.read_csv(io.BytesIO(raw))

    # Persist the untouched table locally, then upload it with its source URL.
    a.to_csv(dpath(zname))
    cbp_totals_syn = syn.setProvenance(
        syn.store(File(name=zname, path=dpath(zname), parent=source_files)),
        activity=Activity(used=[dict(name='US Census Bureau', url=url)])
    )

    # Drop the columns not needed downstream and align column names with the
    # other tables; chained calls avoid mutating the frame in place.
    a = (
        a.drop(columns=['name', 'empflag', 'emp_nf', 'emp', 'qp1_nf', 'qp1', 'ap_nf', 'ap', 'est'])
         .rename(columns={'cty_name':'county', 'zip':'zipcode'})
    )

    # Title-case the place names, leaving cells rejected by `isstr` untouched.
    a['city'] = [str(x).title() if isstr(x) else x for x in a['city']]
    a['county'] = [str(x).title() if isstr(x) else x for x in a['county']]

In [None]:
# Download the 2017 ZCTA gazetteer, upload a CSV copy to Synapse with
# provenance, and keep a (zipcode, lat, lon) frame `b` for later merges.
# NOTE(review): `zname` is never read below -- the file is saved and stored
# under `name` instead; confirm which file name is intended.
zname = '2017_national_zipcodes.csv'
url =  'http://www2.census.gov/geo/docs/maps-data/data/gazetteer/2017_Gazetteer/2017_Gaz_zcta_national.zip'

# Fail loudly on a bad HTTP status: silently skipping the download would leave
# `b` undefined and surface later as a confusing NameError in the merge cell.
response = requests.get(url)
response.raise_for_status()

# The archive is unpacked into the data directory; `t_path` is the extracted
# tab-separated file read below.
t_path = dpath('2017_Gaz_zcta_national.txt')
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
    archive.extractall(path=data_dir)

b = pd.read_csv(t_path, sep='\t')

# Save a CSV copy and upload it with its source URL.
name = '2017_Gaz_zcta_national.csv'
b.to_csv(dpath(name))

zips_syn = syn.setProvenance(
    syn.store(File(name=name, path=dpath(name), parent=source_files)),
    activity=Activity(used=[dict(name='US Census Bureau', url=url)])
)

# Header names are stripped of surrounding whitespace before selecting the
# three columns that are kept and renamed.
b.columns = [s.strip() for s in b.columns]
b = b.loc[:, ['GEOID', 'INTPTLAT', 'INTPTLONG']]
b.columns = ['zipcode', 'lat', 'lon']

In [None]:
# Load the state-code lookup table and upload it to the source folder.
# The CSV is read from the data directory, so it must already exist there
# before this cell runs.
sname = 'state_codes.csv'
state_codes_path = dpath(sname)

state_codes = pd.read_csv(state_codes_path)
state_codes_file = File(name=sname, path=state_codes_path, parent=source_files)
state_codes_syn = syn.store(state_codes_file)

In [None]:
# Join the business-pattern rows (`a`) with the gazetteer coordinates (`b`) on
# zipcode, then attach the full state name via the state abbreviation. Both
# joins use pandas' default inner join, so unmatched rows are dropped.
d = (
    a.merge(b, on='zipcode')
     .merge(state_codes, left_on='stabbr', right_on='Code')
     .drop(columns='Code')
     .rename(columns={'State':'state'})
)

# Round coordinates to five decimal places.
d['lat'] = d['lat'].round(5)
d['lon'] = d['lon'].round(5)

# Keep only the final columns, in presentation order.
final_columns = ['zipcode', 'city', 'county', 'state', 'stabbr', 'lat', 'lon']
d = d[final_columns]

In [None]:
# Fetch the timezone table from Synapse and attach each zipcode's timezone
# name to the collated frame (default inner join on `zipcode`).
timezones = syn.get('syn16810024')
tz = pd.read_csv(timezones.path).rename(columns={'zip':'zipcode'})

zip_timezones = tz[['zipcode', 'timezone']]
e = d.merge(zip_timezones, on=['zipcode'])

In [None]:
# Fetch the urban-area list from Synapse and split its NAME column at the
# first comma: the text before it becomes `city`, the text after it `stabbr`.
ua = syn.get('syn16816765')
ualist = pd.read_csv(ua.path)

# `str.find` returns -1 when there is no comma; the slices below then drop the
# last character for `city` and keep the whole name for `stabbr`.
comma_positions = [full_name.find(',') for full_name in ualist.NAME]

ualist['city'] = [
    full_name[:pos].strip()
    for full_name, pos in zip(ualist.NAME, comma_positions)
]
ualist['stabbr'] = [
    full_name[pos + 1:].strip()
    for full_name, pos in zip(ualist.NAME, comma_positions)
]

In [None]:
def ua_designation(args):
    """Classify a (city, state abbreviation) pair using the `ualist` table.

    Parameters
    ----------
    args : tuple
        `(city, stabbr)`; passed as one tuple so the function can be used
        directly with `Pool.map`.

    Returns
    -------
    str
        'rural' when no `ualist` row matches both values; otherwise
        'urban_area' when the first matching row's LSADC equals 75, and
        'urban_cluster' for any other LSADC value.
    """
    city, stabbr = args

    same_place = (ualist.city == city) & (ualist.stabbr == stabbr)
    matches = ualist.loc[same_place]

    # No match in the urban-area list means the place is considered rural.
    if len(matches) == 0:
        return 'rural'

    # Only the first matching row decides the designation.
    lsadc = matches.iloc[0].LSADC
    if lsadc == 75:
        return 'urban_area'
    return 'urban_cluster'


# Classify every (city, state abbreviation) pair in parallel, using one worker
# process per CPU core.
pool = mul.Pool(mul.cpu_count())
try:
    # `Pool.map` already returns a list in input order, so it can be assigned
    # to the column directly.
    e['geo_designation'] = pool.map(ua_designation, list(zip(e.city, e.stabbr)))
finally:
    # Always shut the workers down, even when a lookup raises; otherwise the
    # worker processes would linger for the lifetime of the kernel.
    pool.close()
    pool.join()

e.head()

In [None]:
# Count zipcodes per designation once and take both the labels and the bar
# heights from that same Series. Previously the labels came from `pd.unique`
# (order of first appearance) while the heights came from `pd.value_counts`
# (descending count), so bars could be paired with the wrong label.
designation_counts = e.geo_designation.value_counts()

iplot(go.Figure(
    [go.Bar(
        x=designation_counts.index,
        y=designation_counts.values,
        text=designation_counts.values,
        textposition='outside')
    ]
))

In [None]:
# Upload the collated frame `e` as a Synapse table named 'zipcodes' under the
# project, then attach provenance listing the Synapse entities it was derived
# from and the notebook that produced it.
zip_schema = Schema(name='zipcodes', columns=as_table_columns(e), parent=storage)
stored_table = syn.store(Table(zip_schema, e))

# NOTE(review): this URL points at a 'GSCAT' repository while the later store
# cell uses 'GSCAP' -- confirm which repository name is correct.
collation_activity = Activity(
    name='zipcode collation',
    description='Collecting zipcodes, counties, states, and geo-locations into a single table.',
    used=['syn16816617', 'syn16816613', 'syn16816612', 'syn16810024', 'syn16816765'],
    executed=[{
        'name': 'synapse_project_setup.ipynb',
        'url': 'https://github.com/lukeWaninger/GSCAT/blob/master/notebooks/synapse_project_setup.ipynb',
    }],
)

zips = syn.setProvenance(stored_table, activity=collation_activity)

### Adding in the timezone UTC offset

In [24]:
# Pull the whole zipcode table back from Synapse and replace the index the
# query result comes with by a fresh 0..n-1 index.
zip_query = syn.tableQuery('select * from syn17050200')
zips = zip_query.asDataFrame().reset_index(drop=True)
zips.head()

Unnamed: 0,zipcode,city,county,state,stabbr,lat,lon,timezone,geo_designation,utc_offset
0,1001,Agawam,Hampden,Massachusetts,MA,42.06237,-72.62575,America/New_York,rural,
1,1002,Amherst,Hampshire,Massachusetts,MA,42.36406,-72.45874,America/New_York,rural,
2,1003,Amherst,Hampshire,Massachusetts,MA,42.3897,-72.52401,America/New_York,rural,
3,1005,Barre,Worcester,Massachusetts,MA,42.41885,-72.1066,America/New_York,rural,
4,1007,Belchertown,Hampshire,Massachusetts,MA,42.27901,-72.40047,America/New_York,rural,


In [27]:
# Fixed UTC offset, in hours, for every timezone name expected in `zips`.
# NOTE(review): each zone has a single fixed value, so daylight saving time is
# not represented -- confirm that is intended.
tzmap = pd.DataFrame([
    # tz name, UTC offset
    ('America/Adak', -10),
    ('America/Anchorage', -9),
    ('America/Boise', -7),
    ('America/Chicago', -6),
    ('America/Creston', -7),
    ('America/Denver', -7),
    ('America/Detroit', -5),
    ('America/Edmonton', -7),
    ('America/Hermosillo', -7),
    ('America/Indiana/Indianapolis', -5),
    ('America/Indiana/Knox', -6),
    ('America/Indiana/Marengo', -5),
    ('America/Indiana/Petersburg', -5),
    ('America/Indiana/Tell_City', -6),
    ('America/Indiana/Vincennes', -5),
    ('America/Juneau', -9),
    ('America/Kentucky/Louisville', -5),
    ('America/Kentucky/Monticello', -5),
    ('America/Los_Angeles', -8),
    ('America/Matamoros', -6),
    ('America/Menominee', -6),
    ('America/Moncton', -4),
    ('America/Monterrey', -6),
    ('America/New_York', -5),
    ('America/Nome', -9),
    ('America/North_Dakota/Beulah', -6),
    ('America/North_Dakota/New_Salem', -6),
    ('America/Ojinaga', -7),
    ('America/Phoenix', -7),
    ('America/Regina', -6),
    ('America/Sitka', -9),
    ('America/Tijuana', -8),
    ('America/Toronto', -5),
    ('America/Vancouver', -8),
    ('America/Winnipeg', -6),
    ('America/Yakutat', -9),
    ('Pacific/Honolulu', -10)
], columns=['name', 'offset'])

# Look the offsets up with one vectorized `map` instead of scanning `tzmap`
# once per row. A timezone missing from `tzmap` now yields NaN (rather than an
# IndexError), which is what the null-count check in the next cell detects.
offset_by_tz = tzmap.set_index('name')['offset']
zips['utc_offset'] = zips['timezone'].map(offset_by_tz)
zips.head()

Unnamed: 0,zipcode,city,county,state,stabbr,lat,lon,timezone,geo_designation,utc_offset
0,1001,Agawam,Hampden,Massachusetts,MA,42.06237,-72.62575,America/New_York,rural,-5
1,1002,Amherst,Hampshire,Massachusetts,MA,42.36406,-72.45874,America/New_York,rural,-5
2,1003,Amherst,Hampshire,Massachusetts,MA,42.3897,-72.52401,America/New_York,rural,-5
3,1005,Barre,Worcester,Massachusetts,MA,42.41885,-72.1066,America/New_York,rural,-5
4,1007,Belchertown,Hampshire,Massachusetts,MA,42.27901,-72.40047,America/New_York,rural,-5


In [28]:
# Sanity check: number of rows without a UTC offset (expected to be 0).
int(zips['utc_offset'].isnull().sum())

0

In [29]:
# Upload the frame with the added `utc_offset` column as a Synapse table named
# 'zipcodes' under entity 'syn16816579', then attach provenance listing the
# Synapse entities it was derived from and the notebook that produced it.
zip_schema = Schema(name='zipcodes', columns=as_table_columns(zips), parent='syn16816579')
stored_table = syn.store(Table(zip_schema, zips))

# NOTE(review): the executed entry is named 'synapse_project_setup.ipynb' but
# its URL points at 'zipcodes.ipynb' -- confirm which notebook is meant.
collation_activity = Activity(
    name='zipcode collation',
    description='Collecting zipcodes, counties, states, and geo-locations into a single table.',
    used=['syn16816617', 'syn16816613', 'syn16816612', 'syn16810024', 'syn16816765'],
    executed=[{
        'name': 'synapse_project_setup.ipynb',
        'url': 'https://github.com/lukeWaninger/GSCAP/blob/master/notebooks/zipcodes.ipynb',
    }],
)

zips = syn.setProvenance(stored_table, activity=collation_activity)