In [8]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [9]:
# Get the active Spark session, or create one with a local master.
# NOTE(review): with a local[*] master there are presumably no separate executor
# processes, so spark.executor.memory may have no effect here - confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [11]:
from pipeline_oriented_analytics.pipe import Pipe
from pipeline_oriented_analytics.transformer import *
from typing import List, Dict
from pipeline_oriented_analytics.dataframe import CsvDataFrame, ParquetDataFrame
from pipeline_oriented_analytics import Phase

# Phase of this run: it decides which raw CSV is read, whether the label column
# is kept, and the output folder name. Switch the two lines below to build the
# training inputs instead.
#phase = Phase.train
phase = Phase.predict

# Raw CSV columns that are always selected.
variables = ['id', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
# Raw CSV label column(s); only selected when the phase is not predict.
labels = ['trip_duration']
lables = labels  # alias for the original (misspelled) name, kept for backward compatibility
# Raw column name -> name used after RenameColumns.
column_names = {'pickup_longitude': 'pickup_lon', 'pickup_latitude': 'pickup_lat', 'dropoff_longitude': 'dropoff_lon', 'dropoff_latitude': 'dropoff_lat', 'trip_duration': 'duration_sec'}
# Target column types. NormalizeColumnTypes runs after RenameColumns, so the keys
# must be the renamed column names.
variable_types = {'id': 'string', 'pickup_datetime': 'timestamp', 'pickup_lon': 'double', 'pickup_lat': 'double', 'dropoff_lon': 'double', 'dropoff_lat': 'double'}
# Fix: the label is renamed 'trip_duration' -> 'duration_sec' before types are
# normalized, so the type map must use 'duration_sec' (it used to say 'duration',
# which is not a column at that point in the pipeline).
label_types = {'duration_sec': 'int'}

if phase.is_predict():
    columns = variables
    column_types = variable_types
    data_path = '../data/raw/test.csv'
else:
    columns = variables + labels
    column_types = {**variable_types, **label_types}
    data_path = '../data/raw/train.csv'

df = Pipe([
    SelectColumns(columns),
    RenameColumns(column_names),
    NormalizeColumnTypes(column_types),
    # Cell tokens for pickup and dropoff coordinates, built with first argument 6 and 14.
    CellToken(6, 'pickup_lat', 'pickup_lon', 'pickup_cell_6'),
    CellToken(6, 'dropoff_lat', 'dropoff_lon', 'dropoff_cell_6'),
    CellToken(14, 'pickup_lat', 'pickup_lon', 'pickup_cell_14'),
    CellToken(14, 'dropoff_lat', 'dropoff_lon', 'dropoff_cell_14'),
    # Left join with the stored distance matrix on the *_cell_14 token pair.
    Join(['pickup_cell_14', 'dropoff_cell_14'], Join.Method.left, ParquetDataFrame('../data/processed/distance_matrix', spark)),
    # Raw coordinates and the *_cell_14 join keys are dropped before saving.
    DropColumns(inputCols=['pickup_lat', 'pickup_lon', 'dropoff_lon', 'dropoff_lat', 'pickup_cell_14', 'dropoff_cell_14']),
    SaveToParquet(f'../data/processed/{phase.name}/inputs'),
]).transform(CsvDataFrame(data_path, spark))

# NOTE(review): df.count() is a separate Spark action after the save; it may
# re-evaluate the pipeline unless the returned frame is cached or re-read - confirm.
print(f'Saved {df.count()} rows of {phase.name} inputs')

Saved 625134 rows of predict inputs


In [12]:
# Row count per pickup_cell_6 value, most frequent first; prints at most 100 rows.
(
    df
    .select('pickup_cell_6')
    .groupBy('pickup_cell_6')
    .count()
    .orderBy(f.desc('count'))
    .show(100)
)

+-------------+------+
|pickup_cell_6| count|
+-------------+------+
|         89c3|625112|
|         89e9|     7|
|         89b7|     6|
|         8835|     1|
|         89d1|     1|
|         89fb|     1|
|         898d|     1|
|         808f|     1|
|         89bf|     1|
|         89e3|     1|
|         89c1|     1|
|         89dd|     1|
+-------------+------+

