In [3]:
import sys, os, json
sys.path.append('/Users/benjamin/Desktop/repos/chi-data/backend') 
sys.path.append('/Users/benjamin/Desktop/repos/chi-data/backend/aws') 
from s3 import S3
from dynamo import DynamoConn
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import s3fs
import decimal
from time import time
import requests as r
import geopandas as gpd
from shapely.geometry import mapping, shape
from config import cook_tracts, chicago_tracts, msa_tracts
# Service handles. `arrow_s3fs` is the filesystem later handed to pyarrow when
# reading parquet; `S3` and `DynamoConn` come from the project-local `s3` and
# `dynamo` modules imported above.
arrow_s3fs = s3fs.S3FileSystem()
s3 = S3()
d = DynamoConn()

# Boundary label -> tract collection for every geography to process.
boundaries = {
    # 'chicago-zillow-opposite': None,
    'chicago': chicago_tracts,
}

# Statistic column code -> readable label. Only the uncommented entries are
# processed; the code is also used as a parquet column name further down.
stats = {
    # 'S000': 'total_jobs',
    # 'SA01': 'age_group_1',
    # 'SA02': 'age_group_2',
    # 'SA03': 'age_group_3',
    # 'SE01': 'salary_group_1',
    # 'SE02': 'salary_group_2',
    'SE03': 'salary_group_3',
    # 'SI01': 'industry_group_1',
    # 'SI02': 'industry_group_2',
    # 'SI03': 'industry_group_3'
}

# One descriptor per (boundary, stat) pair, keyed by the output dataset name.
# This is deliberately a plain `for` loop: `boundary` and `stat` remain defined
# afterwards, and later cells read `stat` to choose the column to load.
dataset_names = {}
for boundary, boundary_tracts in boundaries.items():
    for stat, stat_name in stats.items():
        dataset_key = 'final-jobs-eigs-%s-%s' % (boundary, stat)
        dataset_names[dataset_key] = {
            'dataset': 'JT00',
            'stat': stat,
            'stat_name': stat_name,
            'boundaries': boundary_tracts,
            # NOTE(review): fixed to 'chicago' regardless of the loop's
            # `boundary` value -- confirm this is intended before enabling
            # additional boundaries.
            'boundary': 'chicago',
        }


# Fetch the two boundary GeoJSON documents and cache them next to the notebook
# so geopandas can read them from disk.
# NOTE(review): "ZillowNeighborhaoods" looks misspelled, but it has to match the
# object key actually stored in the bucket -- verify before "correcting" it.
boundary_downloads = [
    ('https://s3.amazonaws.com/chicago.bnroths.com/data/boundaries/ZillowNeighborhaoods-IL.json', 'zillow.json'),
    ('https://s3.amazonaws.com/chicago.bnroths.com/data/boundaries/Boundaries+-+Census+Tracts+-+2010.json', 'tracts.json'),
]

for boundary_url, local_path in boundary_downloads:
    response = r.get(boundary_url)
    # Fail here with the HTTP status instead of letting an error page reach
    # the JSON decoder and surface as an unrelated parse error.
    response.raise_for_status()
    res = response.json()
    # JSON is text: a text-mode handle plus json.dump behaves the same on
    # Python 2 and also works on Python 3, where writing str to 'wb' raises.
    with open(local_path, 'w') as f:
        json.dump(res, f)

# Load the two boundary layers cached on disk by the download step.
tracts = gpd.read_file('tracts.json')
neighborhoods = gpd.read_file('zillow.json')

# Represent each tract by its centroid. The centroid column is added to
# `tracts` itself; `tracts_center` is the same data with that column acting as
# the active geometry.
tracts['centroid_column'] = tracts.centroid
tracts_center = tracts.set_geometry('centroid_column')

# Inner "within" join: a tract is kept only when its centroid falls inside a
# neighborhood polygon, and it picks up that neighborhood's attributes.
neighborhoods_w_tracts = gpd.sjoin(
    tracts_center,
    neighborhoods,
    how="inner",
    op='within',
)

# Slim lookup table: tract id (index) -> neighborhood name.
lookup_columns = ['geoid10', 'Name']
neighborhoods_w_tracts_small = neighborhoods_w_tracts[lookup_columns].set_index('geoid10')


In [10]:
# Read one year of origin-destination parquet files from S3 into a DataFrame.
year = 2004
job_type = 'JT00'

# Each template is filled with (year, job_type, year). There is a "main" and an
# "aux" file for each of the il / in / wi prefixes (presumably Illinois,
# Indiana and Wisconsin -- confirm).
# NOTE(review): the il files carry an extra "lehd_" in their name compared with
# the in/wi files; confirm this matches the objects in the bucket.
od_path_templates = [
    'bnroths/chicago-data/lehd_od/year=%s/il_lehd_od_main_%s_%s.parquet',
    'bnroths/chicago-data/lehd_od/year=%s/il_lehd_od_aux_%s_%s.parquet',

    'bnroths/chicago-data/lehd_od/year=%s/in_od_main_%s_%s.parquet',
    'bnroths/chicago-data/lehd_od/year=%s/in_od_aux_%s_%s.parquet',

    'bnroths/chicago-data/lehd_od/year=%s/wi_od_main_%s_%s.parquet',
    'bnroths/chicago-data/lehd_od/year=%s/wi_od_aux_%s_%s.parquet',
]
od_paths = [template % (year, job_type, year) for template in od_path_templates]

ds = pq.ParquetDataset(
    path_or_paths=od_paths,
    filesystem=arrow_s3fs,
    validate_schema=False,
)

# `stat` is not set in this cell: it is the value left behind by the
# `for stat in stats` loop in the setup cell, so that cell must run first.
wanted_columns = ['w_tract', 'h_tract', stat]
table = ds.read(columns=wanted_columns)
df = table.to_pandas()



In [11]:
# Label both ends of every flow with a neighborhood name. The lookup table is
# indexed by tract id, so each join matches on the frame's current index:
# first the work tract (-> w_hood), then the home tract (-> h_hood).
flows_with_work_hood = (
    df.set_index('w_tract')
    .join(neighborhoods_w_tracts_small)
    .rename(columns={'Name': 'w_hood'})
)
final_df = (
    flows_with_work_hood.set_index('h_tract')
    .join(neighborhoods_w_tracts_small, lsuffix='left')
    .rename(columns={'Name': 'h_hood'})
)

# Drop flows whose home neighborhood never appears as a work neighborhood.
# One `isin` mask replaces filtering the whole frame once per name. NaN is
# kept out of the filter list because `isin` would match missing names; rows
# with a missing neighborhood are left in place, exactly as an elementwise
# `!=` comparison leaves them.
diff1 = set(final_df.h_hood) - set(final_df.w_hood)
home_only_hoods = [hood for hood in diff1 if pd.notnull(hood)]
final_df = final_df[~final_df.h_hood.isin(home_only_hoods)]

# Same pruning in the other direction, computed on the already-filtered frame.
# NOTE(review): one pass per direction does not guarantee the two name sets end
# up identical -- confirm whether downstream code needs a strictly square matrix.
diff2 = set(final_df.w_hood) - set(final_df.h_hood)
work_only_hoods = [hood for hood in diff2 if pd.notnull(hood)]
final_df = final_df[~final_df.w_hood.isin(work_only_hoods)]


In [12]:
# Sanity check: first rows of the labelled flows and the statistic column in
# use. The parenthesized single-argument form prints the same text on Python 2
# and is also valid syntax on Python 3.
print(final_df.head())
print(stat)

             SE03    w_hood h_hood
10001040202     1  The Loop    NaN
10001040203     0       NaN    NaN
10001041100     0       NaN    NaN
10001041701     0       NaN    NaN
10001043100     0       NaN    NaN
SE03


In [13]:
# Home-by-work matrix: one row per home neighborhood, one column per work
# neighborhood, each cell the summed `stat` value for that pair. Pairs that
# never occur are filled with 0.
pivot = final_df.pivot_table(
    values=stat,
    index=['h_hood'],
    columns=['w_hood'],
    aggfunc=np.sum,
    fill_value=0,
)

In [14]:
# Show the first five home-neighborhood rows of the matrix.
pivot.head(5)

w_hood,Albany Park,Altgeld Gardens,Andersonville,Arcadia Terrace,Archer Heights,Ashburn,Avalon Park,Avondale,Back of the Yards,Belmont Central,...,West Rogers Park,West Town,West Woodlawn,Wicker Park,Wildwood,Wolf Lake,Woodlawn,Wrightwood,Wrightwood Neighbors,Wrigleyville
h_hood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albany Park,38,1,3,2,22,1,0,19,11,4,...,13,69,0,0,0,1,2,0,3,4
Altgeld Gardens,0,2,0,0,5,0,0,0,4,0,...,0,1,0,0,0,0,0,0,0,0
Andersonville,3,0,42,1,2,0,0,8,1,1,...,3,23,0,0,0,0,1,0,0,1
Arcadia Terrace,3,0,0,7,1,0,0,2,3,0,...,4,9,0,0,0,1,0,0,2,0
Archer Heights,6,0,1,0,122,1,1,8,29,2,...,1,34,0,2,0,4,0,0,0,3
