In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

from pipeline_oriented_analytics.dataframe import ParquetDataFrame

In [2]:
# Build (or reuse) the Spark session for this notebook: master is local[*],
# with the driver-memory setting at 4g and the executor-memory setting at 1g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [13]:
# Quick look at the processed training inputs.
# The path has no placeholders, so it is a plain string rather than an f-string.
ParquetDataFrame('../data/processed/train/inputs', spark).show()

+---------+-------------------+------------+-------------+--------------+--------+
|       id|    pickup_datetime|duration_sec|pickup_cell_8|dropoff_cell_8|distance|
+---------+-------------------+------------+-------------+--------------+--------+
|id1090265|2016-02-09 21:30:46|         234|        89c25|         89c25|    1.12|
|id0429543|2016-05-26 18:12:10|         600|        89c25|         89c25|    1.77|
|id0588876|2016-03-12 23:59:16|        1501|        89c25|         89c25|    8.49|
|id1046728|2016-04-16 03:18:55|        1124|        89c25|         89c25|    5.66|
|id2457924|2016-03-02 22:53:19|         223|        89c25|         89c25|    1.12|
|id1997531|2016-04-03 18:43:29|        2358|        89c25|         89c25|    8.01|
|id1160205|2016-01-16 01:33:09|         758|        89c25|         89c25|    2.16|
|id3824700|2016-04-13 22:35:04|         459|        89c25|         89c25|    3.26|
|id2614722|2016-05-11 18:53:28|         287|        89c25|         89c25|     0.5|
|id3

In [8]:
from pipeline_oriented_analytics.pipe import Pipe, IF
from pipeline_oriented_analytics.transformer import *
from pipeline_oriented_analytics.transformer.feature import *
from typing import List, Dict
from pipeline_oriented_analytics.dataframe import CsvDataFrame, ParquetDataFrame
from pipeline_oriented_analytics import Phase

# `phase.name` is interpolated into the input and output paths below, and
# `phase.is_train()` is the condition handed to IF for the nested stages.
# Set this to Phase.train to build the training feature set instead.
phase = Phase.predict

stages = [
    Time(
        'pickup_datetime',
        [Time.Feature.month, Time.Feature.day_of_month, Time.Feature.day_of_week, Time.Feature.hour],
    ),
    AddMinutes(-15, 'pickup_datetime', '15_min_before'),
    RequestCount(15, 'pickup_cell_8', '15_min_before', 'requests_pickup_cell'),
    RequestCount(15, 'dropoff_cell_8', '15_min_before', 'requests_dropoff_cell'),
    IF(phase.is_train(), then=[
        Duration(Duration.Unit.minute, 'duration_sec', 'duration_min'),
        # NOTE(review): this names 'duration_min', the same column name passed to
        # Duration on the previous line - confirm 'duration_sec' was not intended.
        DropColumns(inputCols=['duration_min']),
        # NOTE(review): DropOutliers is called with an empty string - confirm
        # which column it is supposed to operate on.
        DropOutliers(''),
    ]),
    DropColumns(inputCols=['pickup_datetime', '15_min_before']),
    SaveToParquet(f'../data/processed/{phase.name}/features'),
]

feature_pipe = Pipe(stages)
inputs_df = ParquetDataFrame(f'../data/processed/{phase.name}/inputs', spark)
features_df = feature_pipe.transform(inputs_df)

# Report how many rows the pipeline produced, then preview the first two.
row_count = features_df.count()
print(f'Saved {row_count} rows {phase.name} features')
features_df.show(2)

Saved 625134 rows predict features
+--------------+-------------+---------+--------+-----+------------+-----------+----+--------------------+---------------------+
|dropoff_cell_6|pickup_cell_6|       id|distance|month|day_of_month|day_of_week|hour|requests_pickup_cell|requests_dropoff_cell|
+--------------+-------------+---------+--------+-----+------------+-----------+----+--------------------+---------------------+
|          89c3|         89c3|id0557539|     1.7|    1|           2|          7|  22|                  39|                   39|
|          89c3|         89c3|id0482177|    2.48|    1|           2|          7|  22|                  39|                   39|
+--------------+-------------+---------+--------+-----+------------+-----------+----+--------------------+---------------------+
only showing top 2 rows

