In [21]:
import io
import multiprocessing as mul
import os
from pathlib import Path
import requests
import zipfile

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as figf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import synapseclient
from synapseclient import Activity, Project, Folder, File, Table, Schema, as_table_columns

# set the user's home directory as the data directory
data_dir = os.path.join(str(Path.home()), '.gscat')
if not os.path.exists(data_dir):
    os.mkdir(data_dir)
    
syn = synapseclient.Synapse()
syn.login()

storage = syn.get(Project(name='GSCAP Data'))
source_files = syn.store(Folder(name='source_files', parent=storage, downloadPath=data_dir))
dpath = lambda s: os.path.join(data_dir, s)

init_notebook_mode(connected=True)
InteractiveShell.ast_node_interactivity = 'all'

def isnum(x):
    try:
        float(x)
        return True
    except ValueError:
        return False

def isstr(x):
    try:
        str(x)
        return True
    except ValueError:
        return False

Welcome, Luke Waninger!



NameError: name 'InteractiveShell' is not defined

### Zipcodes

In [2]:
zname = 'zb16totals'
url =  'https://www2.census.gov/programs-surveys/cbp/datasets/2016/zbp16totals.zip'

response = requests.get(url)
if response.ok:
    t_path = dpath('2017_Gaz_zcta_national.txt')
    f = io.BytesIO(response.content)

    with zipfile.ZipFile(f, 'r') as f:
        contents = [
            dict(
                name=name[:name.find('.')],
                data=f.read(name),
                ext=name[name.find('.')+1:]
            )
            for name in f.namelist()
        ]

    for fi in contents:        
        with io.BytesIO(fi['data']) as fi_:
            a = pd.read_csv(fi_)  
        
        a.to_csv(dpath(zname))
        cbp_totals_syn = syn.setProvenance(
            syn.store(File(name=zname, path=dpath(zname), parent=source_files)),
            activity=Activity(used=[dict(name='US Census Bureau', url=url)])
        )
        
        a.drop(columns=['name', 'empflag', 'emp_nf', 'emp', 'qp1_nf', 'qp1', 'ap_nf', 'ap', 'est'], inplace=True)
        a.rename(columns={'cty_name':'county', 'zip':'zipcode'}, inplace=True)
        
        a.city = [str(x).title() if isstr(x) else x for x in a.city]
        a.county = [str(x).title() if isstr(x) else x for x in a.county]


##################################################
 Uploading file to Synapse storage 
##################################################



In [3]:
zname = '2017_national_zipcodes.csv'
url =  'http://www2.census.gov/geo/docs/maps-data/data/gazetteer/2017_Gazetteer/2017_Gaz_zcta_national.zip'

response = requests.get(url)
if response.ok:
    t_path = dpath('2017_Gaz_zcta_national.txt')
    f = io.BytesIO(response.content)

    with zipfile.ZipFile(f, 'r') as f:
        f.extractall(path=data_dir)

    b = pd.read_csv(t_path, sep='\t')
    
    name = '2017_Gaz_zcta_national.csv'    
    b.to_csv(dpath(name))
    
    zips_syn = syn.setProvenance(
        syn.store(File(name=name, path=dpath(name), parent=source_files)),
        activity=Activity(used=[dict(name='US Census Bureau', url=url)])
    )
    
    b.columns = [s.strip() for s in b.columns]
    b = b.loc[:, ['GEOID', 'INTPTLAT', 'INTPTLONG']]
    b.columns = ['zipcode', 'lat', 'lon']


##################################################
 Uploading file to Synapse storage 
##################################################



In [4]:
sname = 'state_codes.csv'
state_codes = pd.read_csv(dpath(sname))
state_codes_syn = syn.store(File(name=sname, path=dpath(sname), parent=source_files))

In [5]:
d = pd.merge(a, b, on='zipcode')
d = pd.merge(d, state_codes, left_on='stabbr', right_on='Code')
d = d.drop(columns='Code')
d = d.rename(columns={'State':'state'})

d.lat = np.round(d.lat, 5)
d.lon = np.round(d.lon, 5)

d = d[['zipcode', 'city', 'county', 'state', 'stabbr', 'lat', 'lon']]

In [6]:
timezones = syn.get('syn16810024')
tz = pd.read_csv(timezones.path)
tz = tz.rename(columns={'zip':'zipcode'})

e = pd.merge(d, tz.loc[:, ['zipcode', 'timezone']], on=['zipcode'])

Unnamed: 0,zipcode,city,county,state,stabbr,lat,lon,timezone
0,1001,Agawam,Hampden,Massachusetts,MA,42.06237,-72.62575,America/New_York
1,1002,Amherst,Hampshire,Massachusetts,MA,42.36406,-72.45874,America/New_York
2,1003,Amherst,Hampshire,Massachusetts,MA,42.3897,-72.52401,America/New_York
3,1005,Barre,Worcester,Massachusetts,MA,42.41885,-72.1066,America/New_York
4,1007,Belchertown,Hampshire,Massachusetts,MA,42.27901,-72.40047,America/New_York


In [16]:
ua = syn.get('syn16816765')
ualist = pd.read_csv(ua.path)

ualist['city']   = [x[:x.find(',')].strip() for x in ualist.NAME]
ualist['stabbr'] = [x[x.find(',')+1:].strip() for x in ualist.NAME]

Unnamed: 0,UACE,NAME,POP,HU,AREALAND,AREALANDSQMI,AREAWATER,AREAWATERSQMI,POPDEN,LSADC,city,stabbr
0,37,"Abbeville, LA",19824,8460,29222871,11.28,300497,0.12,1757.0,76,Abbeville,LA
1,64,"Abbeville, SC",5243,2578,11315197,4.37,19786,0.01,1200.1,76,Abbeville,SC
2,91,"Abbotsford, WI",3966,1616,5363441,2.07,13221,0.01,1915.2,76,Abbotsford,WI
3,118,"Aberdeen, MS",4666,2050,7416616,2.86,52732,0.02,1629.4,76,Aberdeen,MS
4,145,"Aberdeen, SD",25977,12114,33002447,12.74,247597,0.1,2038.6,76,Aberdeen,SD


In [18]:
def ua_designation(args):
    city, stabbr = args
    
    r = ualist.loc[(ualist.city == city) & (ualist.stabbr == stabbr)]
    d = 'rural'
    
    if len(r) > 0:
        d = r.iloc[0].LSADC
        d = 'urban_area' if d == 75 else 'urban_cluster'

    return d


pool = mul.Pool(mul.cpu_count())
e['geo_designation'] = list(pool.map(ua_designation, [(r.city, r.stabbr) for r in e.itertuples()]))
pool.close(); pool.join()

e.head()

Unnamed: 0,zipcode,city,county,state,stabbr,lat,lon,timezone,geo_designation
0,1001,Agawam,Hampden,Massachusetts,MA,42.06237,-72.62575,America/New_York,rural
1,1002,Amherst,Hampshire,Massachusetts,MA,42.36406,-72.45874,America/New_York,rural
2,1003,Amherst,Hampshire,Massachusetts,MA,42.3897,-72.52401,America/New_York,rural
3,1005,Barre,Worcester,Massachusetts,MA,42.41885,-72.1066,America/New_York,rural
4,1007,Belchertown,Hampshire,Massachusetts,MA,42.27901,-72.40047,America/New_York,rural


In [24]:
iplot(go.Figure(
    [go.Bar(
        x=pd.unique(e.geo_designation), 
        y=pd.value_counts(e.geo_designation),
        text=pd.value_counts(e.geo_designation),
        textposition='outside')
    ]
))

In [27]:
zips = syn.setProvenance(
    syn.store(Table(
        Schema(name='zipcodes', columns=as_table_columns(e), parent=storage), e)
    ),
    activity=Activity(
        name='zipcode collation',
        description='Collecting zipcodes, counties, states, and geo-locations into a single table.',
        used=['syn16816617', 'syn16816613', 'syn16816612', 'syn16810024', 'syn16816765'],
        executed=[dict(
            name='synapse_project_setup.ipynb', 
            url='https://github.com/lukeWaninger/GSCAT/blob/master/notebooks/synapse_project_setup.ipynb'
        )]
    )
)

### Weather cache

In [8]:
db_name = 'weather_cache.sqlite'
weather_cache = syn.setProvenance(
    syn.get(File(name=db_name, path=dpath(db_name), parent=storage)),
    activity=Activity(
        used=[dict(name='DarkSky API', url='https://darksky.net/dev/docs')],
        executed=[dict(name='weather.py', url='https://github.com/lukeWaninger/GSCAT/blob/master/weather.py')]
    )
)

### GoogleMaps cache