In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Obtain (or reuse) a Spark session running in local mode on all cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [3]:
from pipeline_oriented_analytics import Phase
# This notebook runs the prediction phase; the phase name becomes a
# directory component of the input feature path below.
phase = Phase.predict

# Relative locations used by the following cells.
model_path = '../model/trip_duration_min'
data_to_predict_path = f'../data/processed/{phase.name}/features'
predicted_data_path = '../data/reporting/trip_durations'

In [4]:
from pyspark.ml import PipelineModel
from pipeline_oriented_analytics.dataframe import ParquetDataFrame
from pipeline_oriented_analytics.transformer import DropColumns, SaveToParquet

# Load the previously saved pipeline model from disk.
model = PipelineModel.load(model_path)

# Stages run in order: the loaded model, removal of three columns,
# then the SaveToParquet stage pointed at the reporting path.
prediction_stages = [
    model,
    DropColumns(inputCols=['features', 'pickup_cell_6_idx', 'dropoff_cell_6_idx']),
    SaveToParquet(predicted_data_path),
]
prediction_pipeline = PipelineModel(prediction_stages)

# Input features for the prediction phase.
features_df = ParquetDataFrame(data_to_predict_path, spark)
predicted_df = prediction_pipeline.transform(features_df)

# Preview two rows, then shut the Spark session down.
predicted_df.show(2)
spark.stop()

+--------------+-------------+---------+---------------+--------+-----+------------+-----------+----+--------------------+---------------------+-----------------+
|dropoff_cell_6|pickup_cell_6|       id|passenger_count|distance|month|day_of_month|day_of_week|hour|requests_pickup_cell|requests_dropoff_cell|       prediction|
+--------------+-------------+---------+---------------+--------+-----+------------+-----------+----+--------------------+---------------------+-----------------+
|          89c3|         89c3|id1949435|              1|    1.35|    6|          30|          5|  21|                  53|                   53|7.223529411764706|
|          89c3|         89c3|id3246513|              1|   10.82|    6|          30|          5|  21|                  53|                   53|            103.0|
+--------------+-------------+---------+---------------+--------+-----+------------+-----------+----+--------------------+---------------------+-----------------+
only showing top 2 rows