In [1]:
sc

<pyspark.context.SparkContext at 0x10e1fbd90>

In [2]:
import datetime
import operator
import os
import sys
import time
from operator import add
import numpy as np
import matplotlib.path as mplPath

In [3]:
def geojson_create(filename,data):
    import json
    coordinatesList = {}
    with open ('block-groups-polygons-simple.geojson') as dataFile:
        blockData = json.load(dataFile)
    count = 0
    for i in data:
        for block in blockData['features']:
            if i == block['properties']['OBJECTID']:
                coordinatesList[count] = [block['geometry'],block['properties']]
                count+=1

    template =         '''         { "type" : "Feature",
            "id" : %s,
            "properties" : %s,
            "geometry" : %s
            },
        '''

    # the head of the geojson file
    output =         '''     { "type" : "FeatureCollection",
        "features" : [
        '''

    for k,v in coordinatesList.iteritems():
        output += template % (k,json.dumps(v[1]),json.dumps(v[0]))

    # the tail of the geojson file
    output +=         '''         ]
    }
        '''

    # opens an geoJSON file to write the output to
    outFileHandle = open(filename+".geojson", "w")
    outFileHandle.write(output)
    outFileHandle.close()

In [4]:
def indexZones(shapeFilename):
    import rtree
    import fiona.crs
    import geopandas as gpd
    index = rtree.Rtree()
    zones = gpd.read_file(shapeFilename).to_crs(fiona.crs.from_epsg(2263))
    for idx,geometry in enumerate(zones.geometry):
        index.insert(idx, geometry.bounds)
    return (index, zones)

def find_Block(p, index, zones):
    match = index.intersection((p.x, p.y, p.x, p.y))
    for idx in match:
        z = mplPath.Path(np.array(zones.geometry[idx].exterior))
        if z.contains_point(np.array(p)):
            return zones['OBJECTID'][idx]
    return -1

def find_Borough(p, index, zones):
    match = index.intersection((p.x, p.y, p.x, p.y))
    for idx in match:
        if any(map(lambda x: x.contains(p), zones.geometry[idx])):
            return zones['boroname'][idx]
    return -1

In [10]:
def pick_mapToZone(parts):
    import pyproj
    import shapely.geometry as geom
    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)    
    index, zones = indexZones('block-groups-polygons-simple.geojson')
    index2, zones2 = indexZones('boroughs.geojson')
    for line in parts:
        if line.startswith('vendor_id'): continue 
        fields = line.strip('').split(',')
        if fields ==['']: continue
        if all((fields[5],fields[6],fields[9],fields[10])) and float(fields[4])<=2:
            pickup_location  = geom.Point(proj(float(fields[5]), float(fields[6])))
            pickup_block = find_Block(pickup_location, index, zones)
            pickup_borough = find_Borough(pickup_location, index2, zones2)
            if pickup_block>=0 and pickup_borough>0 :#np.array(pickup_block.exterior)
                yield (pickup_block,pickup_borough)

In [11]:
def drop_mapToZone(parts):
    import pyproj
    import shapely.geometry as geom
    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)    
    index, zones = indexZones('block-groups-polygons-simple.geojson')
    index2, zones2 = indexZones('boroughs.geojson')
    for line in parts:
        if line.startswith('vendor_id'): continue 
        fields = line.strip('').split(',')
        if fields ==['']: continue
        if all((fields[5],fields[6],fields[9],fields[10])) and float(fields[4])<=2:
            drop_location  = geom.Point(proj(float(fields[9]), float(fields[10])))
            drop_block = find_Block(drop_location, index, zones)
            drop_borough = find_Borough(drop_location, index2, zones2)
            if drop_block>=0 and drop_borough>0 :
                yield (drop_block,drop_borough)

In [12]:
if __name__=='__main__':
    
    trips = sc.textFile('../yellow_tripdata_2012-07.csv')
#     trips = sc.textFile('../yellow_tripdata_2014-07.csv')

    pickup = sc.parallelize(pick_mapToZone(trips.take(1000)))
    pickup = pickup.map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y)
    pickup_M = pickup.filter(lambda x: x[0][1] == "Manhattan").takeOrdered(10, lambda x: -x[1])
    pickup_Q = pickup.filter(lambda x: x[0][1] == "Queens").takeOrdered(10, lambda x: -x[1])
    pickup_Bx = pickup.filter(lambda x: x[0][1] == "Bronx").takeOrdered(10, lambda x: -x[1])
    pickup_Bk = pickup.filter(lambda x: x[0][1] == "Brooklyn").takeOrdered(10, lambda x: -x[1])
    pickup_SI = pickup.filter(lambda x: x[0][1] == "Staten Island").takeOrdered(10, lambda x: -x[1])
    pickup_all = sc.parallelize(pickup_M+pickup_Q+pickup_Bx+pickup_Bk+pickup_SI)

    dropoff = sc.parallelize(drop_mapToZone(trips.take(5000)))
    dropoff = pickup.map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y)
    dropoff_M = dropoff.filter(lambda x: x[0][1] == "Manhattan").takeOrdered(10, lambda x: -x[1])
    dropoff_Q = dropoff.filter(lambda x: x[0][1] == "Queens").takeOrdered(10, lambda x: -x[1])
    dropoff_Bx = dropoff.filter(lambda x: x[0][1] == "Bronx").takeOrdered(10, lambda x: -x[1])
    dropoff_Bk = dropoff.filter(lambda x: x[0][1] == "Brooklyn").takeOrdered(10, lambda x: -x[1])
    dropoff_SI = dropoff.filter(lambda x: x[0][1] == "Staten Island").takeOrdered(10, lambda x: -x[1])
    dropoff_all = sc.parallelize(pickup_M+pickup_Q+pickup_Bx+pickup_Bk+pickup_SI)
    print dropoff_all.collect()

[((9245, u'Manhattan'), 9), ((9052, u'Manhattan'), 9), ((9560, u'Manhattan'), 7), ((9394, u'Manhattan'), 7), ((9612, u'Manhattan'), 6), ((9085, u'Manhattan'), 5), ((9564, u'Manhattan'), 5), ((9650, u'Manhattan'), 5), ((9593, u'Manhattan'), 4), ((9509, u'Manhattan'), 4), ((3147, u'Queens'), 1), ((3431, u'Queens'), 1), ((2815, u'Queens'), 1), ((2281, u'Queens'), 1), ((5439, u'Bronx'), 1), ((5888, u'Bronx'), 1), ((12860, u'Brooklyn'), 1), ((12756, u'Brooklyn'), 1), ((11224, u'Brooklyn'), 1), ((11292, u'Brooklyn'), 1), ((12856, u'Brooklyn'), 1), ((12556, u'Brooklyn'), 1), ((12608, u'Brooklyn'), 1), ((12061, u'Brooklyn'), 1), ((12054, u'Brooklyn'), 1), ((12158, u'Brooklyn'), 1)]


In [18]:
geojson_create("Pickup_Map",pickup_all.map(lambda x: x[0][0]).collect())
geojson_create("Dropoff_Map",dropoff_all.map(lambda x: x[0][0]).collect())