In [4]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [5]:
# Local Spark session (master "local[*]"); driver and executor memory are set
# explicitly rather than left at Spark's defaults.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [None]:
from pyspark.ml import PipelineModel
from pipeline_oriented_analytics.transformer import *
from pipeline_oriented_analytics.dataframe import *
from typing import List, Dict

# Columns kept from the raw trip CSV files.
column_names = [
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]

# Raw coordinate column name -> shortened name used downstream.
column_new_names = {
    'pickup_longitude': 'pickup_lon',
    'pickup_latitude': 'pickup_lat',
    'dropoff_longitude': 'dropoff_lon',
    'dropoff_latitude': 'dropoff_lat',
}

# Every renamed coordinate column is given the same target type.
column_types = dict.fromkeys(
    ('pickup_lon', 'pickup_lat', 'dropoff_lon', 'dropoff_lat'),
    'double',
)

# Cell level handed to the CellId stages.
level = 18

def prepare_trip_data(level: int, column_names: List[str], column_new_names: Dict[str, str], column_types: Dict[str, str]) -> PipelineModel:
    """Build the pipeline that prepares raw trip records for cell-based processing.

    The stages run in this order: select columns, rename columns, normalize
    column types, then add a cell id for the pickup coordinates and one for
    the dropoff coordinates.

    Args:
        level: Cell level forwarded to both ``CellId`` stages.
        column_names: Columns passed to ``SelectColumns``.
        column_new_names: Mapping passed to ``RenameColumns``. After renaming,
            the data must expose ``pickup_lat``, ``pickup_lon``,
            ``dropoff_lat`` and ``dropoff_lon``, because the ``CellId`` stages
            read those names.
        column_types: Mapping passed to ``NormalizeColumnTypes``.

    Returns:
        A ``PipelineModel`` wrapping the five stages.
    """
    return PipelineModel([
        SelectColumns(column_names),
        RenameColumns(column_new_names),
        NormalizeColumnTypes(column_types),
        # NOTE(review): presumably CellId appends the level to the output
        # column name (e.g. 'pickup_cell' -> 'pickup_cell_18') - confirm.
        CellId(level, 'pickup_lat', 'pickup_lon', 'pickup_cell'),
        CellId(level, 'dropoff_lat', 'dropoff_lon', 'dropoff_cell')
    ])


# Cell column names are derived from `level` instead of hard-coding the '_18'
# suffix, so they stay in sync if the level changes.
# NOTE(review): assumes CellId names its output '<base>_<level>' - confirm.
pickup_cell_col = 'pickup_cell_{}'.format(level)
dropoff_cell_col = 'dropoff_cell_{}'.format(level)

# Test trips prepared with the same stages as the training trips; the result is
# handed to the Union stage below (presumably appended to the training rows).
prepared_test_trips = prepare_trip_data(level, column_names, column_new_names, column_types).transform(
    CsvDataFrame('../data/raw/test.csv', spark)
)

df = PipelineModel([
    prepare_trip_data(level, column_names, column_new_names, column_types),
    Union(prepared_test_trips),
    SelectColumns([pickup_cell_col, dropoff_cell_col]),
    DropDuplicates(),
    SphereDistance(pickup_cell_col, dropoff_cell_col),
    SaveToParquet('../data/processed/distance_matrix')
]).transform(CsvDataFrame('../data/raw/train.csv', spark)).cache()


# Sanity checks: a couple of rows, the schema, and the row count. count() is
# the cell's last expression, so the notebook displays its value.
df.show(2)
df.printSchema()
df.count()

In [None]:
# Print the first five rows of the result.
df.show(n=5)

In [None]:
# Write the result as parquet, replacing anything already at this location.
df.write.mode('overwrite').parquet('../data/processed/distance_matrix')

In [None]:
# Read the saved parquet output back and print its first five rows.
saved_distances = spark.read.parquet('../data/processed/distance_matrix')
saved_distances.show(5)