In [None]:
# This is the project code.

In [2]:
import pyspark
from delta import *

# Session builder with the Delta Lake SQL extension and catalog registered.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DF2_Practice")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the Delta helper finish the builder configuration, then start
# (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Notebook display settings.
# Eager evaluation is OK for exploration, not great for performance.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [59]:
import pandas as pd
import json
import numpy as np
import shapely
from shapely import Point
from shapely.geometry import mapping, shape

## Reading in data

In [50]:
# Load the taxi trip sample into a Spark DataFrame.
taxiDataDf = spark.read.csv(
    "sample.csv",      # path
    sep=",",           # separator
    header=True,       # file has header row
    inferSchema=True,  # spark tries to infer data types
)

In [51]:
# Render the loaded trips to check the header names and the parsed values.
display(taxiDataDf)

medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,CMT,1,N,2013-01-01 15:11:48,2013-01-01 15:18:10,4,382,1.0,-73.978165,40.757977,-73.989838,40.751171
0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-06 00:18:35,2013-01-06 00:22:54,1,259,1.5,-74.006683,40.731781,-73.994499,40.75066
0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-05 18:49:41,2013-01-05 18:54:23,1,282,1.1,-74.004707,40.73777,-74.009834,40.726002
DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:54:15,2013-01-07 23:58:20,2,244,0.7,-73.974602,40.759945,-73.984734,40.759388
DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:25:03,2013-01-07 23:34:24,1,560,2.1,-73.97625,40.748528,-74.002586,40.747868
20D9ECB2CA0767CF7A01564DF2844A3E,598CCE5B9C1918568DEE71F43CF26CD2,CMT,1,N,2013-01-07 15:27:48,2013-01-07 15:38:37,1,648,1.7,-73.966743,40.764252,-73.983322,40.743763
496644932DF3932605C22C7926FF0FE0,513189AD756FF14FE670D10B92FAF04C,CMT,1,N,2013-01-08 11:01:15,2013-01-08 11:08:14,1,418,0.8,-73.995804,40.743977,-74.007416,40.744343
0B57B9633A2FECD3D3B1944AFC7471CF,CCD4367B417ED6634D986F573A552A62,CMT,1,N,2013-01-07 12:39:18,2013-01-07 13:10:56,3,1898,10.7,-73.989937,40.756775,-73.86525,40.77063
2C0E91FF20A856C891483ED63589F982,1DA2F6543A62B8ED934771661A9D2FA0,CMT,1,N,2013-01-07 18:15:47,2013-01-07 18:20:47,1,299,0.8,-73.980072,40.743137,-73.982712,40.735336
2D4B95E2FA7B2E85118EC5CA4570FA58,CD2F522EEE1FF5F5A8D8B679E23576B3,CMT,1,N,2013-01-07 15:33:28,2013-01-07 15:49:26,2,957,2.5,-73.977936,40.786983,-73.952919,40.80637


In [79]:
# Load the borough boundaries. The encoding is given explicitly because
# GeoJSON is UTF-8 and open() would otherwise use the platform's locale
# encoding, which can differ (e.g. on Windows).
with open('nyc-boroughs.geojson', encoding='utf-8') as file:
    boroughs = json.load(file)

# Each feature carries a 'properties' dict (borough attributes) and a
# 'geometry' dict (type + coordinates).
features = boroughs['features']

properties = [feature['properties'] for feature in features]
geometry = [feature['geometry'] for feature in features]

# One row per feature; both frames share the same positional index, so a
# column-wise concat lines attributes up with their geometry.
properties_df = pd.DataFrame(properties)
geometry_df = pd.DataFrame(geometry)
borough_df = pd.concat([properties_df, geometry_df], axis=1)

In [80]:
# Show the combined borough table (property columns followed by geometry columns).
borough_df

Unnamed: 0,boroughCode,borough,@id,type,coordinates
0,5,Staten Island,http://nyc.pediacities.com/Resource/Borough/St...,Polygon,"[[[-74.05050806403247, 40.566422034160816], [-..."
1,5,Staten Island,http://nyc.pediacities.com/Resource/Borough/St...,Polygon,"[[[-74.05314036821109, 40.577702715545755], [-..."
2,5,Staten Island,http://nyc.pediacities.com/Resource/Borough/St...,Polygon,"[[[-74.15945602438188, 40.641448333324036], [-..."
3,5,Staten Island,http://nyc.pediacities.com/Resource/Borough/St...,Polygon,"[[[-74.08221272914938, 40.64828016229008], [-7..."
4,4,Queens,http://nyc.pediacities.com/Resource/Borough/Qu...,Polygon,"[[[-73.83668274106708, 40.59494669701581], [-7..."
...,...,...,...,...,...
99,2,Bronx,http://nyc.pediacities.com/Resource/Borough/Bronx,Polygon,"[[[-73.78103351104956, 40.87648400204773], [-7..."
100,2,Bronx,http://nyc.pediacities.com/Resource/Borough/Bronx,Polygon,"[[[-73.78650554049729, 40.8809401344792], [-73..."
101,2,Bronx,http://nyc.pediacities.com/Resource/Borough/Bronx,Polygon,"[[[-73.87294860352858, 40.90444102266826], [-7..."
102,2,Bronx,http://nyc.pediacities.com/Resource/Borough/Bronx,Polygon,"[[[-73.80518266940541, 40.815271916427264], [-..."


In [81]:
# Create the 'area' column as float: polygon areas are fractional, and an
# integer column (plain 0) would have to be upcast on the first float write,
# which recent pandas versions warn about.
borough_df['area'] = 0.0

In [91]:
from shapely import Polygon

# Compute the area of every borough part in a single column assignment.
# This replaces a row-by-row iterrows()/.at loop, so the column is float
# regardless of how 'area' was initialised and the cell is safe to re-run.
#
# Only the first ring of each geometry (coordinates[0]) is used, as before;
# any further rings are ignored.
# NOTE(review): coordinates look like lon/lat degrees, so the area is in
# squared degrees -- suitable for ranking parts by size, not as a physical area.
borough_df['area'] = borough_df['coordinates'].map(
    lambda rings: Polygon(rings[0]).area
)

# Largest parts first.
borough_df.sort_values(by=['area'], ascending=False)

Unnamed: 0,boroughCode,borough,@id,type,coordinates,area
23,4,Queens,http://nyc.pediacities.com/Resource/Borough/Qu...,Polygon,"[[[-73.89145055584646, 40.776372179016676], [-...",2.719387e-02
41,3,Brooklyn,http://nyc.pediacities.com/Resource/Borough/Br...,Polygon,"[[[-73.95439555417089, 40.739114772522505], [-...",1.870559e-02
3,5,Staten Island,http://nyc.pediacities.com/Resource/Borough/St...,Polygon,"[[[-74.08221272914938, 40.64828016229008], [-7...",1.603548e-02
101,2,Bronx,http://nyc.pediacities.com/Resource/Borough/Bronx,Polygon,"[[[-73.87294860352858, 40.90444102266826], [-7...",1.140587e-02
72,1,Manhattan,http://nyc.pediacities.com/Resource/Borough/Ma...,Polygon,"[[[-73.92640556921117, 40.87762147653734], [-7...",5.859078e-03
...,...,...,...,...,...,...
60,1,Manhattan,http://nyc.pediacities.com/Resource/Borough/Ma...,Polygon,"[[[-73.99476934191568, 40.70395289495355], [-7...",1.540025e-08
83,2,Bronx,http://nyc.pediacities.com/Resource/Borough/Bronx,Polygon,"[[[-73.78245880016405, 40.84397215208452], [-7...",9.327276e-09
13,4,Queens,http://nyc.pediacities.com/Resource/Borough/Qu...,Polygon,"[[[-73.73944380843434, 40.59702754555849], [-7...",8.313199e-09
84,2,Bronx,http://nyc.pediacities.com/Resource/Borough/Bronx,Polygon,"[[[-73.78203383698589, 40.844093822799046], [-...",8.237252e-09


In [92]:
def findingBorough(x_coordinate, y_coordinate):
    """Return the borough whose polygon contains the given point.

    Scans the rows of ``borough_df`` in order and returns the ``borough``
    value of the first row whose polygon contains the point.

    Parameters
    ----------
    x_coordinate : float or None
        X value of the point. Assumed to be a longitude, matching the
        [x, y] order of the pairs stored in ``borough_df['coordinates']``.
    y_coordinate : float or None
        Y value of the point (assumed latitude).

    Returns
    -------
    str or None
        The borough name, or ``None`` when either coordinate is missing or
        no polygon contains the point.
    """
    # A missing coordinate cannot be located; return None rather than fail.
    if x_coordinate is None or y_coordinate is None:
        return None

    # The point is the same for every row, so build it once.
    point = Point(x_coordinate, y_coordinate)

    for _, row in borough_df.iterrows():
        # Only the first ring of the geometry is used as the polygon shell.
        polygon = Polygon(row['coordinates'][0])
        if polygon.contains(point):
            # Read the name from the matched row itself; the previous lookup
            # used an undefined index variable and raised NameError.
            return row['borough']

    return None
        

In [93]:
from pyspark.sql.functions import col, lit

# Add empty placeholder columns for the pickup and drop-off borough.
# The null literal is cast to string because a bare lit(None) produces a
# NullType (void) column, which cannot later hold borough names and is not
# supported by writers such as Parquet/Delta.
taxiDataDf = (
    taxiDataDf
    .withColumn("startBorough", lit(None).cast("string"))
    .withColumn("endBorough", lit(None).cast("string"))
)
display(taxiDataDf)

medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,startBorough,endBorough
89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,CMT,1,N,2013-01-01 15:11:48,2013-01-01 15:18:10,4,382,1.0,-73.978165,40.757977,-73.989838,40.751171,,
0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-06 00:18:35,2013-01-06 00:22:54,1,259,1.5,-74.006683,40.731781,-73.994499,40.75066,,
0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-05 18:49:41,2013-01-05 18:54:23,1,282,1.1,-74.004707,40.73777,-74.009834,40.726002,,
DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:54:15,2013-01-07 23:58:20,2,244,0.7,-73.974602,40.759945,-73.984734,40.759388,,
DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:25:03,2013-01-07 23:34:24,1,560,2.1,-73.97625,40.748528,-74.002586,40.747868,,
20D9ECB2CA0767CF7A01564DF2844A3E,598CCE5B9C1918568DEE71F43CF26CD2,CMT,1,N,2013-01-07 15:27:48,2013-01-07 15:38:37,1,648,1.7,-73.966743,40.764252,-73.983322,40.743763,,
496644932DF3932605C22C7926FF0FE0,513189AD756FF14FE670D10B92FAF04C,CMT,1,N,2013-01-08 11:01:15,2013-01-08 11:08:14,1,418,0.8,-73.995804,40.743977,-74.007416,40.744343,,
0B57B9633A2FECD3D3B1944AFC7471CF,CCD4367B417ED6634D986F573A552A62,CMT,1,N,2013-01-07 12:39:18,2013-01-07 13:10:56,3,1898,10.7,-73.989937,40.756775,-73.86525,40.77063,,
2C0E91FF20A856C891483ED63589F982,1DA2F6543A62B8ED934771661A9D2FA0,CMT,1,N,2013-01-07 18:15:47,2013-01-07 18:20:47,1,299,0.8,-73.980072,40.743137,-73.982712,40.735336,,
2D4B95E2FA7B2E85118EC5CA4570FA58,CD2F522EEE1FF5F5A8D8B679E23576B3,CMT,1,N,2013-01-07 15:33:28,2013-01-07 15:49:26,2,957,2.5,-73.977936,40.786983,-73.952919,40.80637,,


## Query 1: Utilization

Utilization is measured per taxi/driver. It can be derived from the idle
time of each taxi. We will elaborate on this later.

In [15]:
# TODO: implement Query 1 (utilization, based on idle time per taxi).

## Query 2: Average next trip time

The average time it takes for a taxi to find its next fare (trip), per destination borough. This can be computed by finding the time difference, e.g. in seconds, between a trip's drop-off and the next trip's pick-up.


In [16]:
# TODO: implement Query 2 (average time to the next trip per destination borough).

## Query 3: Trips started in one borough

The number of trips that started and ended within the same borough.

In [17]:
# TODO: implement Query 3 (trips that started and ended in the same borough).

## Query 4: Trips between different boroughs

The number of trips that started in one borough and ended in a different one.

In [18]:
# TODO: implement Query 4 (trips that started in one borough and ended in another).