In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Notebook-wide Spark session. "local[*]" runs Spark inside this process;
# the memory values are the same settings the session was configured with before.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [8]:
from pipeline_oriented_analytics.pipe import Pipe, IF
from pipeline_oriented_analytics.transformer import *
from pipeline_oriented_analytics.transformer.feature import *
from typing import List, Dict
from pipeline_oriented_analytics.dataframe import CsvDataFrame, ParquetDataFrame
from pipeline_oriented_analytics import Phase

# Phase selector: `phase.name` is interpolated into the
# ../data/processed/<phase>/inputs and ../data/processed/<phase>/features paths
# used by the pipeline cell. Swap the two assignments to process the train phase.
#phase = Phase.train
phase = Phase.predict

def in_train_phase(df: DataFrame) -> bool:
    """Tell whether ``df`` carries a ``'duration_sec'`` column.

    Used as the predicate of the pipeline's ``IF`` stage. The decision is made
    from the frame's columns, not from the notebook-level ``phase`` variable.
    NOTE(review): presumably ``duration_sec`` is only present in training
    inputs -- confirm against the input data.

    :param df: frame whose ``columns`` are inspected.
    :return: ``True`` if a column named ``'duration_sec'`` exists, else ``False``.
    """
    label_column = 'duration_sec'
    return any(name == label_column for name in df.columns)

# Stages of the feature pipeline, handed to Pipe as one list
# (presumably applied in list order -- confirm in Pipe).
feature_stages = [
    # Calendar features taken from the pickup timestamp.
    Time(
        'pickup_datetime',
        [
            Time.Feature.month,
            Time.Feature.day_of_month,
            Time.Feature.day_of_week,
            Time.Feature.hour,
        ],
    ),
    # Helper column '15_min_before'; presumably pickup time shifted back 15 minutes -- TODO confirm.
    AddMinutes(-15, 'pickup_datetime', '15_min_before'),
    # Request counts per pickup cell and per dropoff cell, written to the named output columns.
    RequestCount(15, 'pickup_cell_6', '15_min_before', 'requests_pickup_cell'),
    RequestCount(15, 'dropoff_cell_6', '15_min_before', 'requests_dropoff_cell'),
    # Only when the frame has 'duration_sec': derive 'duration_min', then drop the source column.
    # (An IF.Predicate.has_column('duration_sec') predicate was previously tried here.)
    IF(in_train_phase, then=[
        Duration(Duration.Unit.minute, 'duration_sec', 'duration_min'),
        DropColumns(inputCols=['duration_sec']),
    ]),
    # Timestamp columns are dropped before the features are written.
    DropColumns(inputCols=['pickup_datetime', '15_min_before']),
    SaveToParquet(f'../data/processed/{phase.name}/features'),
]

feature_pipeline = Pipe(feature_stages)
phase_inputs_df = ParquetDataFrame(f'../data/processed/{phase.name}/inputs', spark)
features_df = feature_pipeline.transform(phase_inputs_df)

# Report how many feature rows were produced for the selected phase, then preview two of them.
saved_row_count = features_df.count()
print(f'Saved {saved_row_count} rows {phase.name} features')
features_df.show(n=2)

Saved 999 rows predict features
+--------------+---------------------+-------------+--------------------+---------+---------------+--------+-----+------------+-----------+----+
|dropoff_cell_6|requests_dropoff_cell|pickup_cell_6|requests_pickup_cell|       id|passenger_count|distance|month|day_of_month|day_of_week|hour|
+--------------+---------------------+-------------+--------------------+---------+---------------+--------+-----+------------+-----------+----+
|          89c3|                   67|         89c3|                  67|id0868004|              2|    2.69|    6|          30|          5|  21|
|          89c3|                   67|         89c3|                  67|id2736277|              1|    4.43|    6|          30|          5|  21|
+--------------+---------------------+-------------+--------------------+---------+---------------+--------+-----+------------+-----------+----+
only showing top 2 rows

