In [1]:
# little trick to make spark work locally
import findspark

findspark.init()

In [6]:
# Create SparkSession
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("blogpost").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()

26/01/06 13:54:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [8]:
sdf_taxi_data = spark.read.csv("../data/taxi_trip_data.csv", header=True, inferSchema=True)
sdf_taxi_data.show(5)

[Stage 3:>                                                          (0 + 1) / 1]

+---------+-------------------+-------------------+---------------+-------------+---------+------------------+------------+-----------+-----+-------+----------+------------+-------------+------------+------------------+-------------------+
|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|rate_code|store_and_fwd_flag|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|imp_surcharge|total_amount|pickup_location_id|dropoff_location_id|
+---------+-------------------+-------------------+---------------+-------------+---------+------------------+------------+-----------+-----+-------+----------+------------+-------------+------------+------------------+-------------------+
|        2|2018-02-01 15:19:54|2018-02-01 15:21:39|              2|         0.12|        1|                 N|           2|        3.0|  0.0|    0.5|       0.0|         0.0|          0.3|         3.8|               237|                237|
|        4|2018-11-08 16:28:57|2018-11-0

                                                                                

In [42]:
sdf_taxi_data.printSchema()

root
 |-- vendor_id: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- imp_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)



In [33]:
import pyspark.sql.functions as F
from pyspark.sql import Window

In [10]:
sdf_taxi_data.count()

1000000

In [31]:
sdf_taxi_zone_geo = spark.read.csv("../data/taxi_zone_geo.csv", header=True, inferSchema=True)
sdf_taxi_zone_geo.show(5)

+-------+--------------------+-------+
|zone_id|           zone_name|borough|
+-------+--------------------+-------+
|      1|      Newark Airport|    EWR|
|      3|Allerton/Pelham G...|  Bronx|
|     18|        Bedford Park|  Bronx|
|     20|             Belmont|  Bronx|
|     31|          Bronx Park|  Bronx|
+-------+--------------------+-------+
only showing top 5 rows



In [87]:
sdf_prepared_data = (
    sdf_taxi_data
    .dropDuplicates()
    # keep only the evening rides in a time range
    .filter(F.date_format(F.col("pickup_datetime"), "MMyyyy") > "032017")
    .filter(F.date_format(F.col("pickup_datetime"), "MMyyyy") <= "112018")
    .filter(F.date_format(F.col("dropoff_datetime"), "HH") >= "17")
    .filter(F.date_format(F.col("dropoff_datetime"), "HH") <= "23")
    # bind the fare amount to a range
    .withColumn(
        "fare_amount",
        F.when(F.col("fare_amount") < 0, F.lit(0))
        .when(F.col("fare_amount") > 50, F.lit(50))
        .otherwise(F.col("fare_amount"))
    )
    # remove rides to and from the airport
    .join(
        sdf_taxi_zone_geo
        .withColumnRenamed("zone_id", "pickup_location_id"),
        on="pickup_location_id"
    )
    .withColumnRenamed("zone_name", "pickup_zone_name")
    .withColumnRenamed("borough", "pickup_borough")
    .filter(~F.lower(F.col("pickup_zone_name")).like("%airport%"))
    .join(
        sdf_taxi_zone_geo
        .withColumnRenamed("zone_id", "dropoff_location_id"),
        on="dropoff_location_id"
    )
    .withColumnRenamed("zone_name", "dropoff_zone_name")
    .withColumnRenamed("borough", "dropoff_borough")
    .filter(~F.lower(F.col("dropoff_zone_name")).like("%airport%"))
    # take the first 3 rides per day for each pickup location
    .withColumn(
        "row_number",
        F.row_number()
        .over(
            Window
            .partitionBy("pickup_location_id", F.date_format(F.col("pickup_datetime"), "ddMMyyyy"))
            .orderBy(F.asc("pickup_datetime"))
        )
    )
    .filter(F.col("row_number") < 4)
    .withColumn("month", F.month(F.col("pickup_datetime")))
    .withColumn("day_of_week", F.dayofweek(F.col("pickup_datetime")))
    .withColumn("day_of_month", F.dayofmonth(F.col("pickup_datetime")))
    .withColumn("store_and_fwd_flag", F.when(F.col("store_and_fwd_flag") == "N", 0).otherwise(1))
    .withColumn("random_number", F.rand())
)

sdf_training = sdf_prepared_data.filter(F.col("random_number") > 0.2)
sdf_test = sdf_prepared_data.filter(F.col("random_number") <= 0.2)

In [51]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

In [None]:
feature_cols = [
    "passenger_count",
    "trip_distance",
    "rate_code",
    "store_and_fwd_flag",
    "payment_type",
    "fare_amount",
    "tolls_amount",
    "imp_surcharge",
    "month",
    "day_of_week",
    "day_of_month",
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

rf = RandomForestRegressor(
    labelCol="tip_amount",
    featuresCol="features",
    predictionCol="prediction",
    numTrees=10,
    maxDepth=4,
    featureSubsetStrategy="auto",
    seed=42,
    bootstrap=True,
)

pipeline = Pipeline(stages=[assembler, rf])

model = pipeline.fit(sdf_training)

                                                                                

In [81]:
importances = model.stages[-1].featureImportances
importance_dict = dict(zip(feature_cols, importances, strict=False))
importance_dict

{'passenger_count': np.float64(0.0005650769679928612),
 'trip_distance': np.float64(0.03947873128856189),
 'rate_code': np.float64(0.057175622843374456),
 'store_and_fwd_flag': np.float64(0.0004073739239823727),
 'payment_type': np.float64(0.6493802005042605),
 'fare_amount': np.float64(0.1994133028403176),
 'tolls_amount': np.float64(0.039535605550588784),
 'imp_surcharge': np.float64(0.0023625643643575838),
 'month': np.float64(0.008104483413638217),
 'day_of_week': np.float64(9.70230411804262e-07),
 'day_of_month': np.float64(0.0035760680725137753)}

In [82]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator()
evaluator.setPredictionCol("prediction")
evaluator.setLabelCol("tip_amount")

RegressionEvaluator_3a1915527587

In [83]:
sdf_train_predictions = model.transform(sdf_training)
sdf_train_predictions.show(10)

                                                                                

+-------------------+------------------+---------+-------------------+-------------------+---------------+-------------+---------+------------------+------------+-----------+-----+-------+----------+------------+-------------+------------+--------------------+--------------+--------------------+---------------+----------+-----+-----------+------------+-------------------+--------------------+--------------------+
|dropoff_location_id|pickup_location_id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|rate_code|store_and_fwd_flag|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|imp_surcharge|total_amount|    pickup_zone_name|pickup_borough|   dropoff_zone_name|dropoff_borough|row_number|month|day_of_week|day_of_month|      random_number|            features|          prediction|
+-------------------+------------------+---------+-------------------+-------------------+---------------+-------------+---------+------------------+------------+----

In [84]:
print(evaluator.evaluate(sdf_train_predictions, {evaluator.metricName: "rmse"}))
print(evaluator.evaluate(sdf_train_predictions, {evaluator.metricName: "r2"}))
print(evaluator.evaluate(sdf_train_predictions, {evaluator.metricName: "mae"}))

                                                                                

3.2560588765354903


                                                                                

0.339770764064412


                                                                                

1.6862356593009664


In [85]:
sdf_test_predictions = model.transform(sdf_test)
sdf_test_predictions.show(10)

[Stage 418:>                                                        (0 + 1) / 1]

+-------------------+------------------+---------+-------------------+-------------------+---------------+-------------+---------+------------------+------------+-----------+-----+-------+----------+------------+-------------+------------+--------------------+--------------+--------------------+---------------+----------+-----+-----------+------------+--------------------+--------------------+------------------+
|dropoff_location_id|pickup_location_id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|rate_code|store_and_fwd_flag|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|imp_surcharge|total_amount|    pickup_zone_name|pickup_borough|   dropoff_zone_name|dropoff_borough|row_number|month|day_of_week|day_of_month|       random_number|            features|        prediction|
+-------------------+------------------+---------+-------------------+-------------------+---------------+-------------+---------+------------------+------------+------

                                                                                

In [86]:
print(evaluator.evaluate(sdf_test_predictions, {evaluator.metricName: "rmse"}))
print(evaluator.evaluate(sdf_test_predictions, {evaluator.metricName: "r2"}))
print(evaluator.evaluate(sdf_test_predictions, {evaluator.metricName: "mae"}))

                                                                                

2.9652940767437475


                                                                                

0.3725774538671087


                                                                                

1.6625858077619693


In [None]:
model.write().save("../data/model")