In [1]:
import numpy as np
import pandas as pd
import geopandas as gp
import os
import subprocess
import shapely as shp
from datetime import datetime

from itertools import compress

# make sure we are at the top of the repo
wd = subprocess.check_output('git rev-parse --show-toplevel', shell = True)
os.chdir(wd[:-1]) #-1 removes \n

In [2]:
# read in nyc census block shapefiles
nycshp = gp.read_file('data/nycb2010_16c/nycb2010.shp')

# set up cooridnate system
nycshp = nycshp.to_crs(epsg=4326) # epsg=4326: lat/on | 26918: NAD83/UTM zone 18N | epsg=2263 is US feet

# we only need manhattan 
manshp = nycshp[nycshp.BoroName == 'Manhattan']

# add state code 36 and leading digits for county code 06
manshp['BCTCB2010'] = '3606' + manshp['BCTCB2010']

# keep only what we need
manshp = manshp[['BCTCB2010','geometry']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
# read in 2014 citibike data
citi = pd.DataFrame()
for m in list(compress(os.listdir('data'), [('citibike' in i) for i in os.listdir('data')])):
    citinew = pd.read_csv('data/'+m)
    citi = citi.append(citinew, ignore_index = True)
    


In [4]:
#Convert starttime to timpstamp format
citi['timestamp'] = pd.to_datetime(citi["starttime"])

#Pull day of week from timstamp (Mon, Tues etc)
citi['dow'] = citi['timestamp'].dt.dayofweek 

#Only keep weekdays
citi = citi[citi['dow'] <= 4]

In [5]:
citi.index = citi.timestamp

In [6]:
#Keep only morning commute hours, e.g 5am to 12pm

citi = citi.between_time('5:00:00','12:00:00', include_start=True, include_end=True)

In [7]:

citi = citi.reset_index(drop = True)


In [8]:
# dataframe for stations    
citi_docks = citi[['start station id','start station name','start station latitude','start station longitude']]
citi_docks.columns = ['station_id','name','lat','lon']
citi_docks.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)


In [9]:
# even after dropping duplicates there are some duplicated station ids. Drop them.
dups = citi_docks[citi_docks.station_id.duplicated(keep = False)].station_id.unique()
citi_docks = citi_docks[[i not in dups for i in citi_docks.station_id]]

In [10]:
# dataframe for rides per station
citi_rides = citi[['start station id', 'tripduration']] # keep duration because we need a column to count with
citi_rides.columns = ['station_id','n_rides']
citi_rides = citi_rides.groupby('station_id').count()

In [11]:
# merge number of rides into station detail table
citi_docks = citi_docks.merge(citi_rides, how = 'left', left_on = 'station_id', right_index = True)

In [12]:
# combine lat and lon to one column
citi_docks['lonlat']=zip(citi_docks.lon, citi_docks.lat)
# Create Point Geometry for based on lonlat column
citi_docks['geometry']=citi_docks[['lonlat']].applymap(lambda x:shp.geometry.Point(x))
citi_docks = citi_docks[['station_id', 'n_rides', 'geometry']]
citi_docks.head()

Unnamed: 0,station_id,n_rides,geometry
0,379,22104,POINT (-73.99160000000001 40.749156)
1,474,7429,POINT (-73.98683077 40.7451677)
2,539,2939,POINT (-73.96024116 40.71534825)
3,2023,2437,POINT (-73.97031366 40.75968085)
5,352,5664,POINT (-73.97722478999999 40.76340613)


In [13]:
manshp.head()

Unnamed: 0,BCTCB2010,geometry
3120,360610005000003,(POLYGON ((-74.02020686710533 40.6848338331104...
3121,360610002011000,POLYGON ((-73.98511406569527 40.71378675143526...
3122,360610002021000,"POLYGON ((-73.9799619880113 40.71397269933503,..."
3123,360610007001000,POLYGON ((-74.00965928671496 40.70875672251924...
3124,360610009001000,POLYGON ((-74.00986580893488 40.70579642669662...


In [14]:
# check for each citibike stations to see what CB they belong to
a = []
for i in range(len(citi_docks)):
    # includes stations not in Manhattan, return 0 for CB. 
    try: 
        a.append(manshp[manshp.geometry.intersects(citi_docks.iloc[i].geometry)].BCTCB2010.values[0])
    except: 
        a.append(0)
    print '\r',"%",str((i+1)*100./len(citi_docks))[:4],
citi_docks['BCTCB2010']=a

% 100.


In [15]:
# get n_rides per census block
cb_rides = citi_docks.groupby('BCTCB2010', as_index = False).agg('sum')

# drop CB 0 which is non-Manhattan rides
cb_rides = cb_rides[cb_rides.index != 0]

# keep only what we need
cb_rides = cb_rides[['BCTCB2010', 'n_rides']]


In [16]:
# expore final data
cb_rides.to_csv('data/cb_rides.csv', index = False)

In [17]:
len(citi)

1979465