In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Local Spark session: master "local[*]", 4g of driver memory, 1g of executor memory.
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [3]:
from pyspark.ml import PipelineModel
from pipeline_oriented_analytics.transformer import *
from pipeline_oriented_analytics.dataframe import *
from typing import List, Dict

# Raw columns kept from the trip CSV files.
column_names = [
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]

# Raw coordinate column name -> shortened name used by the rest of the pipeline.
column_new_names = {
    'pickup_longitude': 'pickup_lon',
    'pickup_latitude': 'pickup_lat',
    'dropoff_longitude': 'dropoff_lon',
    'dropoff_latitude': 'dropoff_lat',
}

# Every renamed coordinate column is normalized to 'double'.
column_types = {short_name: 'double' for short_name in column_new_names.values()}

# Level passed to CellId; it is also embedded in the cell column names below.
level = 18
pickup_cell = 'pickup_cell_{}'.format(level)
dropoff_cell = 'dropoff_cell_{}'.format(level)

def PREPARE_TRIP_DATA(level: int, column_names: List[str], column_new_names: Dict[str, str], column_types: Dict[str, str]) -> PipelineModel:
    """Build the pipeline that prepares raw trip rows for the distance matrix.

    The stages run in order: SelectColumns, RenameColumns, NormalizeColumnTypes,
    then one CellId stage for the pickup coordinates and one for the dropoff
    coordinates.

    The cell column names are derived from ``level`` (``pickup_cell_<level>`` and
    ``dropoff_cell_<level>``) instead of being read from notebook globals, so the
    column label always matches the level that was actually used.

    Args:
        level: Level passed to both CellId stages and embedded in the names of
            the two cell columns.
        column_names: Column names passed to SelectColumns.
        column_new_names: Mapping of original column name to new column name,
            passed to RenameColumns. It must produce 'pickup_lat', 'pickup_lon',
            'dropoff_lat' and 'dropoff_lon', which the CellId stages reference.
        column_types: Mapping of column name to type name, passed to
            NormalizeColumnTypes.

    Returns:
        A PipelineModel wrapping the five stages above.
    """
    return PipelineModel([
        SelectColumns(column_names),
        RenameColumns(column_new_names),
        NormalizeColumnTypes(column_types),
        CellId(level, 'pickup_lat', 'pickup_lon', f'pickup_cell_{level}'),
        CellId(level, 'dropoff_lat', 'dropoff_lon', f'dropoff_cell_{level}')
    ])

# Run the shared preparation pipeline over the test trips first; the result is
# handed to the Union stage below (presumably appended to the train trips -- confirm
# against the Union transformer).
prepared_test_trips = PREPARE_TRIP_DATA(
    level, column_names, column_new_names, column_types
).transform(CsvDataFrame('../data/raw/test.csv', spark))

# Stages applied to the train trips, in order.
distance_matrix_stages = [
    PREPARE_TRIP_DATA(level, column_names, column_new_names, column_types),
    Union(prepared_test_trips),
    SelectColumns([pickup_cell, dropoff_cell]),
    DropDuplicates(),
    SphereDistance(pickup_cell, dropoff_cell),
    SaveToParquet('../data/processed/distance_matrix'),
]

train_trips = CsvDataFrame('../data/raw/train.csv', spark)
df = PipelineModel(distance_matrix_stages).transform(train_trips)

In [4]:
# Preview the first two rows of the pipeline result.
df.show(2)
# Print the column names and types of the result.
df.printSchema()
# Last expression in the cell, so the row count is rendered as the cell's output.
df.count()

+--------------+---------------+--------+
|pickup_cell_18|dropoff_cell_18|distance|
+--------------+---------------+--------+
|    89c259ae67|     89c258ec75|    3.05|
|    89c259bd67|     89c258596f|    3.55|
+--------------+---------------+--------+
only showing top 2 rows

root
 |-- pickup_cell_18: string (nullable = true)
 |-- dropoff_cell_18: string (nullable = true)
 |-- distance: float (nullable = true)



2000986

In [1]:
# Depends on `df` built by the pipeline cell earlier in this notebook. Executed on
# its own in a fresh kernel it raises NameError, so run the preceding cells first.
df.count()

NameError: name 'df' is not defined