In [74]:
import io
import os
from pathlib import Path
import requests
import zipfile

import pandas as pd
import synapseclient
from synapseclient import Activity, Project, Folder, File

# Keep every downloaded/intermediate file in a hidden `.mhealth` folder inside
# the user's home directory.
data_dir = os.path.join(str(Path.home()), '.mhealth')
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(data_dir, exist_ok=True)

# No credentials are passed to login(), so none are hard-coded in the notebook.
syn = synapseclient.Synapse()
syn.login()

# `storage` is the Synapse project; `source_files` is the folder under it that
# receives the raw downloads stored by the cells below.
storage = syn.get(Project(name='GSCAP Data'))
source_files = syn.store(Folder(name='source_files', parent=storage, downloadPath=data_dir))


def dpath(s):
    """Return the path of the file named `s` inside `data_dir`."""
    return os.path.join(data_dir, s)

Welcome, Luke Waninger!



### Get the national zipcode listing

In [79]:
link = 'http://www2.census.gov/econ2016/CB/sector00/CB1600CZ11.zip'


def _split_geo_ttl(title):
    """Split a geo title into a `(city, state)` pair.

    The place is the text between the first '(' and the first ')', and the
    first comma inside it separates city from state (e.g. a title shaped like
    '... (MOODY, AL)' yields ('Moody', 'AL')). Both parts are stripped so the
    state does not keep the whitespace that follows the comma; the city is
    title-cased.
    """
    place = title[title.find('(') + 1:title.find(')')]
    city, _, state = place.partition(',')
    return city.strip().title(), state.strip()


# Fail loudly on a bad download: silently skipping would leave `a` undefined
# (or stale from an earlier run) for the merge further down.
response = requests.get(link)
response.raise_for_status()

# Read every archive member into memory; nothing is extracted to disk here.
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
    contents = [
        dict(
            name=member[:member.find('.')],
            data=archive.read(member),
            ext=member[member.find('.')+1:]
        )
        for member in archive.namelist()
    ]

for fi in contents:
    # The source file is pipe-delimited.
    with io.BytesIO(fi['data']) as fi_:
        raw = pd.read_csv(fi_, sep='|')

    # Store the untouched table in Synapse, recording where it came from.
    name = 'CB1600CZ11.csv'
    raw.to_csv(dpath(name))
    syn.setProvenance(
        syn.store(File(name=name, path=dpath(name), parent=source_files)),
        activity=Activity(used=[dict(name='US Census Bureau', url=link)])
    )

    # Normalise the header and keep only the two columns needed downstream.
    # Every step returns a new frame, so nothing is written into a slice.
    titles = (
        raw
        .rename(columns=lambda s: s.strip().lower())
        .loc[:, ['zipcode', 'geo_ttl']]
    )
    places = [_split_geo_ttl(ti) for ti in titles['geo_ttl']]
    a = (
        titles
        .assign(
            city=[city for city, _ in places],
            state=[state for _, state in places],
        )
        .drop(columns=['geo_ttl'])
    )


##################################################
 Uploading file to Synapse storage 
##################################################



In [80]:
# NOTE(review): `zname` is not used in this cell; kept in case a later cell reads it.
zname = '2017_national_zipcodes.csv'
url =  'http://www2.census.gov/geo/docs/maps-data/data/gazetteer/2017_Gazetteer/2017_Gaz_zcta_national.zip'

# Fail loudly on a bad download: silently skipping would leave `b` undefined
# (or stale from an earlier run) for the merge in the next cell.
response = requests.get(url)
response.raise_for_status()

# Unpack the archive into the data directory, then read the tab-separated
# gazetteer table from the extracted text file.
t_path = dpath('2017_Gaz_zcta_national.txt')
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
    archive.extractall(path=data_dir)

gazetteer_raw = pd.read_csv(t_path, sep='\t')

# Store the untouched table in Synapse. The provenance must point at this
# cell's `url`, not at the `link` left over from the previous cell.
name = '2017_Gaz_zcta_national.csv'
gazetteer_raw.to_csv(dpath(name))
syn.setProvenance(
    syn.store(File(name=name, path=dpath(name), parent=source_files)),
    activity=Activity(used=[dict(name='US Census Bureau', url=url)])
)

# Strip stray whitespace from the header (taken from this frame itself, not
# from an undefined `t`), keep the id and coordinate columns, and give them
# the names used by the merge.
b = (
    gazetteer_raw
    .rename(columns=str.strip)
    .loc[:, ['GEOID', 'INTPTLAT', 'INTPTLONG']]
    .rename(columns={'GEOID': 'zipcode', 'INTPTLAT': 'lat', 'INTPTLONG': 'lon'})
)


##################################################
 Uploading file to Synapse storage 
##################################################



In [81]:
# Join the city/state table `a` with the coordinate table `b` on their shared
# `zipcode` column (pandas' default join type), then preview the first rows.
c = a.merge(b, on='zipcode')
c.head(3)

Unnamed: 0,zipcode,city,state,lat,lon
0,35004,Moody,AL,33.603431,-86.493783
1,35005,Adamsville,AL,33.59595,-87.000649
2,35006,Adger,AL,33.422751,-87.209751


### Weather cache

In [None]:
# Attach provenance to the weather cache database: the DarkSky API is recorded
# as the source that was used and weather.py as the code that was executed.
# NOTE(review): `syn_project` is not defined in the cells shown here (the
# project handle created earlier is named `storage`) — confirm the intended parent.
db_name = "weather_cache.sqlite"

weather_cache_entity = syn.get(
    File(name=db_name, path=dpath(db_name), parent=syn_project)
)
weather_cache_activity = Activity(
    used=[dict(name='DarkSky API', url='https://darksky.net/dev/docs')],
    executed=[dict(name='weather.py', url='https://github.com/lukeWaninger/mHealthFeaturization/blob/master/weather.py')]
)
weather_cache = syn.setProvenance(weather_cache_entity, activity=weather_cache_activity)

### GoogleMaps cache