In [1]:
import numpy as np
import pandas as pd
import geopandas as gp
import os
import subprocess
import pylab as pl
import shapely as shp


%pylab inline

# make sure we are at the top of the repo
wd = subprocess.check_output('git rev-parse --show-toplevel', shell = True)
os.chdir(wd[:-1]) #-1 removes \n

Populating the interactive namespace from numpy and matplotlib


In [2]:
# read in nyc census block shapefiles
nycshp = gp.read_file('data/nycb2010_16c/nycb2010.shp', )

# we only need manhattan 
manshp = nycshp[nycshp.BoroName == 'Manhattan']

In [3]:
manshp.head()

Unnamed: 0,BCTCB2010,BoroCode,BoroName,CB2010,CT2010,Shape_Area,Shape_Leng,geometry
3120,10005000003,1,Manhattan,3,500,183471.102412,6870.140044,(POLYGON ((978645.7485961914 188780.5050048828...
3121,10002011000,1,Manhattan,1000,201,127311.991692,1566.229697,"POLYGON ((988376.7305908203 199328.6176147461,..."
3122,10002021000,1,Manhattan,1000,202,57115.914029,1187.88386,"POLYGON ((989804.9920043945 199396.6488037109,..."
3123,10007001000,1,Manhattan,1000,700,71173.193979,1130.724601,"POLYGON ((981572.016784668 197495.8228149414, ..."
3124,10009001000,1,Manhattan,1000,900,147868.288795,1596.659767,"POLYGON ((981514.6381835938 196417.3043823242,..."


In [4]:
# read in LEHD OD data
lehd = pd.read_csv('data/lehd_od_2014.csv',
 dtype = {'w_geocode':'string',
          'h_geocode':'string'})

In [5]:
# read in bikeshare data
bks = pd.read_csv('data/citibike_201401.csv')

In [6]:
# group rides by start station

# get number of rides per station
bks_count = bks.groupby('start station id', as_index = False)[['tripduration']].count()
bks_count.columns = ['start station id', 'n_rides']

# get station locations
bks_locs = bks.groupby('start station id', as_index = False)[['start station latitude','start station longitude']].first()


# merge
bks_clean = bks_count.merge(bks_locs, on = 'start station id')
bks_clean.columns = ['id','n_rides','lat','lon']
bks_clean.head()

Unnamed: 0,id,n_rides,lat,lon
0,72,924,40.767272,-73.993929
1,79,617,40.719116,-74.006667
2,82,356,40.711174,-74.000165
3,83,444,40.683826,-73.976323
4,116,1919,40.741776,-74.001497


In [7]:
# convert bike share data to points data
crs = manshp.crs
geometry = [shp.geometry.Point(xy) for xy in zip(bks_clean.lat,bks_clean.lon)]
bks_geo = gp.GeoDataFrame(bks_clean, crs=crs, geometry=geometry)

bks_geo.head()

Unnamed: 0,id,n_rides,lat,lon,geometry
0,72,924,40.767272,-73.993929,POINT (40.76727216 -73.99392888)
1,79,617,40.719116,-74.006667,POINT (40.71911552 -74.00666661)
2,82,356,40.711174,-74.000165,POINT (40.71117416 -74.00016545)
3,83,444,40.683826,-73.976323,POINT (40.68382604 -73.97632328)
4,116,1919,40.741776,-74.001497,POINT (40.74177603 -74.00149746)


In [8]:
# correct units for geometry. Not sure why they weren't the same in the first place since the projection (crs) was the same
manshp = manshp.to_crs(epsg=4326)
bks_geo = bks_geo.to_crs(epsg=4326)


In [None]:
# combine bks_geo and manshp somehow. What block does the point fall in? What is the average block info for blocks intersecting 
# buffer around station? Couldn't get any to work.



In [None]:
# this should eventually replace manshp with whatever comes out of spacial join/merge/whatever

# clean LEHD geocodes to match census blocks from manshp
lehd['w_geo'] = lehd['w_geocode'].str[4:]
lehd['h_geo'] = lehd['h_geocode'].str[4:]

# prepare manhattan data for merge
manshp['w_geo'] = manshp['BCTCB2010']
manshp['h_geo'] = manshp['BCTCB2010']

# merge on home geocode
#df = manshp.merge(lehd, on = ['h_geo'])