In [1]:
import numpy as np
import pandas as pd
import geopandas as gp
import os
import subprocess
import shapely as shp

# make sure we are at the top of the repo
wd = subprocess.check_output('git rev-parse --show-toplevel', shell = True)
os.chdir(wd[:-1]) #-1 removes \n

In [2]:
# read in LEHD OD data
lehd = pd.read_csv('data/lehd_od_2014.csv',
 dtype = {'w_geocode':'string',
          'h_geocode':'string'})

# keep only what we need
lehd = lehd[['w_geocode','h_geocode']]

In [3]:
# read in nyc census block shapefiles
nycshp = gp.read_file('data/nycb2010_16c/nycb2010.shp')

# set up cooridnate system
nycshp = nycshp.to_crs(epsg=4326) # epsg=4326: lat/on | 26918: NAD83/UTM zone 18N | epsg=2263 is US feet

# we only need manhattan 
manshp = nycshp[nycshp.BoroName == 'Manhattan']

# add state code 36 and leading digits for county code 06
manshp['BCTCB2010'] = '3606' + manshp['BCTCB2010']

# keep only what we need
manshp = manshp[['BCTCB2010','geometry']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
# keep only LEHD with work and home both in manhattan
lehd = lehd[np.in1d(lehd.h_geocode, manshp.BCTCB2010) & np.in1d(lehd.w_geocode, manshp.BCTCB2010)]
lehd.index = range(len(lehd))

In [5]:
# find the distance between work and home
work = lehd.merge(manshp, how = 'left', left_on = 'w_geocode', right_on = 'BCTCB2010')
work = gp.GeoDataFrame(work, crs = manshp.crs, geometry = 'geometry')
home = lehd.merge(manshp, how = 'left', left_on = 'h_geocode', right_on = 'BCTCB2010')
home = gp.GeoDataFrame(home, crs = manshp.crs, geometry = 'geometry')

lehd['distance'] = work.distance(home)

In [6]:
# aggregate LEHD data to home geocode level
lehd = lehd.groupby('h_geocode').agg('mean')

In [7]:
# create and write a shapefile with census block and average commute distance
cb_dist = manshp.merge(lehd, left_on = 'BCTCB2010', right_index = True)
cb_dist.columns = ['cb','geometry','commute_dist_avg']

if not os.path.exists('data/cb_dist/'):
    os.makedirs('data/cb_dist/')
    
cb_dist.to_file('data/cb_dist/cb_dist.shp')