In [1]:
# findspark.init() makes the local Spark installation's pyspark importable,
# so it has to run before the pyspark imports in the cells below.
import findspark
findspark.init()

In [2]:
# Build (or reuse) the Spark session used by every cell below.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # single local worker thread
    .master("local[1]")
    .appName("tip_amount_model")
    # bind the driver to the loopback interface
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/08 08:55:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/08 08:55:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
import pyspark.sql.functions as F
from pyspark.sql import Window

# Load both CSV inputs: the first row is the header and column types are inferred by Spark.
sdf_taxi_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/taxi_trip_data.csv")
)
sdf_taxi_zone_geo = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/taxi_zone_geo.csv")
)

# Build the modelling dataset from the raw trips:
#   1. drop exact duplicate rows,
#   2. keep pickups from 2017-04 through 2018-11 whose drop-off hour is 17-23,
#   3. attach pickup / drop-off zone names and drop rides touching an airport zone,
#   4. keep the first 3 rides per pickup location and calendar day,
#   5. derive calendar features, re-encode store_and_fwd_flag, and add the
#      random column used for the train/test split.
sdf_prepared_data = (
    sdf_taxi_data
    .dropDuplicates()
    # date window on the pickup month: strictly after 2017-03, up to and including 2018-11
    .filter(F.date_format(F.col("pickup_datetime"), "yyyyMM") > "201703")
    .filter(F.date_format(F.col("pickup_datetime"), "yyyyMM") <= "201811")
    # evening rides: the hour filter is applied to the drop-off time (17:00-23:59)
    .filter(F.date_format(F.col("dropoff_datetime"), "HH") >= "17")
    .filter(F.date_format(F.col("dropoff_datetime"), "HH") <= "23")
    # remove rides to and from the airport
    # (inner joins: trips whose location id has no match in the zone table are dropped too)
    .join(
        sdf_taxi_zone_geo
        .withColumnRenamed("zone_id", "pickup_location_id"),
        on="pickup_location_id"
    )
    .withColumnRenamed("zone_name", "pickup_zone_name")
    .withColumnRenamed("borough", "pickup_borough")
    .filter(~F.lower(F.col("pickup_zone_name")).like("%airport%"))
    .join(
        sdf_taxi_zone_geo
        .withColumnRenamed("zone_id", "dropoff_location_id"),
        on="dropoff_location_id"
    )
    .withColumnRenamed("zone_name", "dropoff_zone_name")
    .withColumnRenamed("borough", "dropoff_borough")
    .filter(~F.lower(F.col("dropoff_zone_name")).like("%airport%"))
    # take the first 3 rides per day for each pickup location
    .withColumn(
        "row_number",
        F.row_number()
        .over(
            Window
            .partitionBy("pickup_location_id", F.date_format(F.col("pickup_datetime"), "ddMMyyyy"))
            .orderBy(F.asc("pickup_datetime"))
        )
    )
    .filter(F.col("row_number") < 4)
    # features engineering
    .withColumn("month", F.month(F.col("pickup_datetime")))
    .withColumn("day_of_week", F.dayofweek(F.col("pickup_datetime")))
    .withColumn("day_of_month", F.dayofmonth(F.col("pickup_datetime")))
    # "N" -> 0, anything else (including null) -> 1
    .withColumn("store_and_fwd_flag", F.when(F.col("store_and_fwd_flag") == "N", 0).otherwise(1))
    # Fixed seed (same value as the random forest) so the train/test split is not
    # redrawn on every kernel restart. NOTE: Spark documents rand() as
    # non-deterministic in the general case (values follow partitioning / row
    # order), so persist this frame if the split must be identical across actions.
    .withColumn("random_number", F.rand(seed=42))
)

# Split on the random column: draws of at most 0.2 are held out for testing,
# all remaining rows are used for training.
sdf_test = sdf_prepared_data.where(F.col("random_number") <= 0.2)
sdf_training = sdf_prepared_data.where(F.col("random_number") > 0.2)

                                                                                

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

In [5]:
# Columns fed to the model, in the order they are packed into the feature vector.
feature_cols = (
    # trip attributes
    ["passenger_count", "trip_distance", "rate_code", "store_and_fwd_flag"]
    # payment and charges
    + ["payment_type", "fare_amount", "tolls_amount", "imp_surcharge"]
    # calendar features
    + ["month", "day_of_week", "day_of_month"]
)

# Stage 1: pack the selected input columns into a single vector column "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Stage 2: random forest regressor that predicts tip_amount from that vector.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="tip_amount",
    predictionCol="prediction",
    # small forest: 10 trees, each at most 4 levels deep
    numTrees=10,
    maxDepth=4,
    bootstrap=True,
    featureSubsetStrategy="auto",
    seed=42,
)

# Chain both stages and fit them on the training rows.
pipeline = Pipeline(stages=[assembler, rf])
model = pipeline.fit(sdf_training)

                                                                                

In [6]:
# The fitted forest is the last pipeline stage; pair each input column with
# its importance score, in feature-vector order.
importances = model.stages[-1].featureImportances
importance_dict = {
    column_name: score
    for column_name, score in zip(feature_cols, importances, strict=False)
}
importance_dict

{'passenger_count': np.float64(0.0005283424469808088),
 'trip_distance': np.float64(0.04446343766282397),
 'rate_code': np.float64(0.05088027237885443),
 'store_and_fwd_flag': np.float64(0.0),
 'payment_type': np.float64(0.6481006882797419),
 'fare_amount': np.float64(0.22206275782788149),
 'tolls_amount': np.float64(0.029572308694520134),
 'imp_surcharge': np.float64(0.0006402185642648316),
 'month': np.float64(0.0022455171205088627),
 'day_of_week': np.float64(1.5729102149322334e-05),
 'day_of_month': np.float64(0.001490727922274248)}

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "prediction" column against the "tip_amount" label.
# The setters return the evaluator itself, so the cell still displays it.
evaluator = RegressionEvaluator()
evaluator.setPredictionCol("prediction").setLabelCol("tip_amount")

RegressionEvaluator_b03aee9178ae

In [8]:
# Score the model on the training rows and print RMSE, R2 and MAE (in that order).
sdf_train_predictions = model.transform(sdf_training)
for train_metric in ("rmse", "r2", "mae"):
    print(evaluator.evaluate(sdf_train_predictions, {evaluator.metricName: train_metric}))

                                                                                

3.122559857683818


                                                                                

0.35981139615648905


[Stage 49:>                                                         (0 + 1) / 1]

1.6291385557044495


                                                                                

In [9]:
# Score the model on the held-out rows and print RMSE, R2 and MAE (in that order).
sdf_test_predictions = model.transform(sdf_test)
for test_metric in ("rmse", "r2", "mae"):
    print(evaluator.evaluate(sdf_test_predictions, {evaluator.metricName: test_metric}))

                                                                                

3.1718932814766116


                                                                                

0.35899696919640944


                                                                                

1.6342744075269313


In [10]:
# Persist the fitted pipeline, replacing any model already saved at this path.
(
    model.write()
    .overwrite()
    .save("../data/model")
)

In [11]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()