In [1]:
import io
import os
from pathlib import Path
import requests
import zipfile

import numpy as np
import pandas as pd
import synapseclient
from synapseclient import Activity, Project, Folder, File, Table, Schema, as_table_columns

# Local working directory: a hidden `.gscat` folder inside the user's home directory.
# It is created on first use and holds every file this notebook reads or writes.
data_dir = os.path.join(str(Path.home()), '.gscat')
if not os.path.exists(data_dir):
    os.mkdir(data_dir)


def dpath(s):
    """Return the path of the file named ``s`` inside ``data_dir``."""
    return os.path.join(data_dir, s)


# Open a Synapse session; every upload and download below goes through `syn`.
syn = synapseclient.Synapse()
syn.login()

# `storage` is the project entity; `source_files` is the folder that raw inputs are stored under.
storage = syn.get(Project(name='GSCAP Data'))
source_files = syn.store(
    Folder(name='source_files', parent=storage, downloadPath=data_dir)
)

def isnum(x):
    """Return True when ``x`` can be converted to a float, otherwise False.

    Parameters
    ----------
    x : object
        Any value; numbers and numeric strings such as ``"1.5"`` qualify.

    Returns
    -------
    bool
        True if ``float(x)`` succeeds, False if it raises ``ValueError`` or
        ``TypeError``.
    """
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: unparsable text such as "abc".
        # TypeError: objects float() does not accept at all (None, lists, ...).
        return False
    return True

def isstr(x):
    """Return True only when ``x`` is a ``str`` instance.

    Attempting ``str(x)`` and catching ``ValueError`` cannot work as a type
    check: ``str()`` succeeds for practically every object, so missing values
    (NaN, None) would be reported as strings and end up converted to text such
    as ``'Nan'`` by callers that title-case "strings".

    Parameters
    ----------
    x : object
        Value to test.

    Returns
    -------
    bool
        True if ``x`` is a ``str``, False for every other type.
    """
    return isinstance(x, str)

Welcome, Luke Waninger!



### Zipcodes

In [None]:
# Download the Census Bureau archive at `url`, store the parsed table in Synapse,
# and reduce it to the columns used further down (frame `a`).
zname = 'zb16totals'
url =  'https://www2.census.gov/programs-surveys/cbp/datasets/2016/zbp16totals.zip'

response = requests.get(url)
# Stop here on a failed download: silently skipping would leave `a` undefined
# (or stale from an earlier run) and only surface later, in the merge cell.
response.raise_for_status()

# Read every archive member into memory, keeping its base name and extension.
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
    contents = []
    for member in archive.namelist():
        # partition() splits on the first dot and keeps the whole name intact
        # when a member has no extension at all.
        stem, dot, ext = member.partition('.')
        contents.append(dict(name=stem, data=archive.read(member), ext=ext))

for fi in contents:
    with io.BytesIO(fi['data']) as fi_:
        a = pd.read_csv(fi_)

    # Store the table as parsed, recording the download URL as its provenance.
    a.to_csv(dpath(zname))
    cbp_totals_syn = syn.setProvenance(
        syn.store(File(name=zname, path=dpath(zname), parent=source_files)),
        activity=Activity(used=[dict(name='US Census Bureau', url=url)])
    )

    # Build a new frame instead of mutating in place, so re-running only this
    # part of the cell cannot fail on columns that were already dropped.
    a = (
        a.drop(columns=['name', 'empflag', 'emp_nf', 'emp', 'qp1_nf', 'qp1', 'ap_nf', 'ap', 'est'])
         .rename(columns={'cty_name': 'county', 'zip': 'zipcode'})
    )

    # Title-case real strings only; anything else (e.g. NaN) is left untouched
    # rather than being turned into the text 'Nan'.
    for col in ('city', 'county'):
        a[col] = [x.title() if isinstance(x, str) else x for x in a[col]]

In [3]:
# Download the 2017 Gazetteer ZCTA archive, store it in Synapse as CSV, and keep
# the GEOID / INTPTLAT / INTPTLONG columns as zipcode / lat / lon in frame `b`.
# NOTE(review): `zname` is not used in this cell; the upload below is named by
# `name` instead. Confirm which file name was intended.
zname = '2017_national_zipcodes.csv'
url =  'http://www2.census.gov/geo/docs/maps-data/data/gazetteer/2017_Gazetteer/2017_Gaz_zcta_national.zip'

response = requests.get(url)
# Stop here on a failed download: silently skipping would leave `b` undefined
# (or stale from an earlier run) and only surface later, in the merge cell.
response.raise_for_status()

# The archive is unpacked into data_dir; the tab-separated text file is read from there.
t_path = dpath('2017_Gaz_zcta_national.txt')
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
    archive.extractall(path=data_dir)

b = pd.read_csv(t_path, sep='\t')

# Store the table as parsed, recording the download URL as its provenance.
name = '2017_Gaz_zcta_national.csv'
b.to_csv(dpath(name))

zips_syn = syn.setProvenance(
    syn.store(File(name=name, path=dpath(name), parent=source_files)),
    activity=Activity(used=[dict(name='US Census Bureau', url=url)])
)

# Strip whitespace from the header names before selecting by name
# (presumably the raw header is padded; verify against the file).
b = (
    b.rename(columns=str.strip)
     .loc[:, ['GEOID', 'INTPTLAT', 'INTPTLONG']]
     .rename(columns={'GEOID': 'zipcode', 'INTPTLAT': 'lat', 'INTPTLONG': 'lon'})
)


##################################################
 Uploading file to Synapse storage 
##################################################



In [4]:
# Load the state-code lookup table from the local data directory and store the
# same file in the Synapse source folder.
sname = 'state_codes.csv'
state_codes_path = dpath(sname)

state_codes = pd.read_csv(state_codes_path)
state_codes_syn = syn.store(
    File(name=sname, path=state_codes_path, parent=source_files)
)

In [5]:
# Join the two zipcode frames, attach the state lookup via the state
# abbreviation, then tidy the result into frame `d`.
d = (
    a.merge(b, on='zipcode')
     .merge(state_codes, left_on='stabbr', right_on='Code')
     .drop(columns='Code')                # duplicate of `stabbr` after the join
     .rename(columns={'State': 'state'})
)

# Round both coordinates to five decimal places.
for coord in ('lat', 'lon'):
    d[coord] = np.round(d[coord], 5)

# Keep only the final columns, in their published order.
d = d.loc[:, ['zipcode', 'city', 'county', 'state', 'stabbr', 'lat', 'lon']]

In [12]:
# Fetch the timezone file from Synapse and add its `timezone` column to the
# collated frame, matching rows on zipcode.
timezones = syn.get('syn16810024')
tz = pd.read_csv(timezones.path).rename(columns={'zip': 'zipcode'})

e = d.merge(tz[['zipcode', 'timezone']], on=['zipcode'])

# Preview the first rows of the merged frame.
e.head()

In [None]:
# Store frame `e` as the Synapse table "zipcodes" under the project, then attach
# a provenance record listing the entities used and this notebook as the executor.
zipcode_table = Table(
    Schema(name='zipcodes', columns=as_table_columns(e), parent=storage),
    e
)
stored_zipcodes = syn.store(zipcode_table)

collation_activity = Activity(
    name='zipcode collation',
    description='Collecting zipcodes, counties, states, and geo-locations into a single table.',
    used=['syn16816617', 'syn16816613', 'syn16816612', 'syn16810024'],
    executed=[
        dict(
            name='synapse_project_setup.ipynb',
            url='https://github.com/lukeWaninger/GSCAT/blob/master/notebooks/synapse_project_setup.ipynb'
        )
    ]
)

zips = syn.setProvenance(stored_zipcodes, activity=collation_activity)

### Weather cache

In [17]:
# Store the local weather cache database in the Synapse project and record the
# API it was used with and the script that was executed as its provenance.
db_name = 'weather_cache.sqlite'
stored_weather_cache = syn.store(
    File(name=db_name, path=dpath(db_name), parent=storage)
)

weather_activity = Activity(
    used=[dict(name='DarkSky API', url='https://darksky.net/dev/docs')],
    executed=[dict(name='weather.py', url='https://github.com/lukeWaninger/GSCAT/blob/master/weather.py')]
)

weather_cache = syn.setProvenance(stored_weather_cache, activity=weather_activity)


##################################################
 Uploading file to Synapse storage 
##################################################







### GoogleMaps cache