In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Create (or reuse) the notebook's Spark session: local master plus explicit
# driver/executor memory settings.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [None]:
from pyspark.ml import PipelineModel
from pipeline_oriented_analytics.transformer import *
from typing import List, Dict

# Columns kept from the raw training CSV.
column_names = [
    'id',
    'pickup_datetime',
    'dropoff_datetime',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'trip_duration',
]

# Raw column name -> shorter name used after loading.
column_new_names = {
    'pickup_longitude': 'pickup_lon',
    'pickup_latitude': 'pickup_lat',
    'dropoff_longitude': 'dropoff_lon',
    'dropoff_latitude': 'dropoff_lat',
    'trip_duration': 'duration',
}

# Target type per column, keyed by the column's name *after* renaming.
column_types = {
    'id': 'string',
    'pickup_datetime': 'timestamp',
    'dropoff_datetime': 'timestamp',
    'pickup_lon': 'double',
    'pickup_lat': 'double',
    'dropoff_lon': 'double',
    'dropoff_lat': 'double',
    'duration': 'int',
}

def load_trips(column_names: List[str], column_new_names: Dict[str, str], column_types: Dict[str, str]) -> PipelineModel:
    """Build the pipeline that prepares a raw trips DataFrame.

    The returned model applies three stages in order: ``SelectColumns``,
    ``RenameColumns`` and ``NormalizeColumnTypes``.

    Args:
        column_names: Column names handed to ``SelectColumns``.
        column_new_names: Mapping handed to ``RenameColumns``. Callers in this
            notebook pass ``{original_name: new_name}``.
        column_types: Mapping handed to ``NormalizeColumnTypes``. Callers in this
            notebook pass ``{column_name: type_name}`` keyed by the renamed columns.

    Returns:
        A ``PipelineModel`` wrapping the three stages; call ``.transform(df)`` on it.
    """
    return PipelineModel([
        SelectColumns(column_names),
        RenameColumns(column_new_names),
        NormalizeColumnTypes(column_types)
    ])

# Load the training trips with the shared loader defined above instead of
# repeating its three stages inline.
train_df = (
    load_trips(column_names, column_new_names, column_types)
    .transform(CsvDataFrame('../data/raw/train.csv', spark))
)

# Inspect the frame that was just built. The original cell inspected `df`, a
# name that is not assigned until a much later cell, so it failed on a fresh
# kernel. `show` and `printSchema` print their output; the row count goes last
# because a notebook only displays the value of a cell's final expression.
train_df.show(5)
train_df.printSchema()
train_df.count()

In [None]:
# Earliest pickups in the training data. Uses `train_df` (built in the loading
# cell): `df` is only assigned by a later cell and has no `pickup_datetime` column.
train_df.sort(f.asc('pickup_datetime')).show(5)

In [None]:
# Latest pickups in the training data. Uses `train_df` (built in the loading
# cell): `df` is only assigned by a later cell and has no `pickup_datetime` column.
train_df.sort(f.desc('pickup_datetime')).show(5)

In [None]:
# Number of training trips per (year, month) of pickup, in chronological order.
# Reads `train_df` (built in the loading cell) rather than `df`, which is only
# assigned by a later cell and has no `pickup_datetime` column. The unused `day`
# column and the intermediate `select` were dropped; the grouped result is the same.
(
    train_df
    .withColumn('year', f.year('pickup_datetime'))
    .withColumn('month', f.month('pickup_datetime'))
    .groupby('year', 'month')
    .count()
    .sort(f.asc('year'), f.asc('month'))
    .show(400)
)

In [None]:
# The column lists in scope here were written for the training file and name
# 'dropoff_datetime' and 'trip_duration'. The test-specific lists defined later
# in this notebook omit both, so they are presumably absent from test.csv
# (NOTE(review): confirm against the file). Derive test variants by removing
# those train-only columns instead of passing the training lists unchanged.
TRAIN_ONLY_COLUMNS = {'dropoff_datetime', 'trip_duration'}
# Names of the train-only columns after renaming (column_types is keyed by these).
train_only_renamed = {column_new_names.get(name, name) for name in TRAIN_ONLY_COLUMNS}

test_column_names = [name for name in column_names if name not in TRAIN_ONLY_COLUMNS]
test_column_new_names = {
    old: new for old, new in column_new_names.items() if old not in TRAIN_ONLY_COLUMNS
}
test_column_types = {
    name: type_name for name, type_name in column_types.items() if name not in train_only_renamed
}

test_df = (
    load_trips(test_column_names, test_column_new_names, test_column_types)
    .transform(CsvDataFrame('../data/raw/test.csv', spark))
)

# `show` and `printSchema` print their output; the row count goes last because
# a notebook only displays the value of a cell's final expression.
test_df.show(2)
test_df.printSchema()
test_df.count()

In [None]:
# Re-read the test CSV without the loading pipeline so its raw columns can be inspected.
raw_test_csv_path = '../data/raw/test.csv'
test_df = CsvDataFrame(raw_test_csv_path, spark)

In [None]:
# Preview the test rows first (`show` prints its table), then leave the row count
# as the final expression so the notebook displays it instead of discarding it.
test_df.show(2)
test_df.count()


In [None]:
# Column configuration for the test file: same as the training one minus
# 'dropoff_datetime' and 'trip_duration'.
# NOTE: this rebinds the notebook-wide `column_names`, `column_new_names` and
# `column_types`, so cells run after this one see the test variants.
column_names = [
    'id',
    'pickup_datetime',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
column_new_names = {
    'pickup_longitude': 'pickup_lon',
    'pickup_latitude': 'pickup_lat',
    'dropoff_longitude': 'dropoff_lon',
    'dropoff_latitude': 'dropoff_lat',
}
column_types = {
    'id': 'string',
    'pickup_datetime': 'timestamp',
    'pickup_lon': 'double',
    'pickup_lat': 'double',
    'dropoff_lon': 'double',
    'dropoff_lat': 'double',
}

test_df = (
    load_trips(column_names, column_new_names, column_types)
    .transform(CsvDataFrame('../data/raw/test.csv', spark))
)

# `show` and `printSchema` print their output; the row count goes last because
# a notebook only displays the value of a cell's final expression.
test_df.show(2)
test_df.printSchema()
test_df.count()

In [None]:
# Show the five test trips with the most recent pickup time, ordering by the
# pickup time converted to a Unix timestamp, newest first.
pickup_epoch_newest_first = f.unix_timestamp('pickup_datetime').desc()
test_df.sort(pickup_epoch_newest_first).show(5)

In [6]:
from pyspark.ml import PipelineModel
from pipeline_oriented_analytics.transformer import *
from pipeline_oriented_analytics.dataframe import *
from typing import List, Dict

def load_trips(column_names: List[str], column_new_names: Dict[str, str], column_types: Dict[str, str]) -> PipelineModel:
    """Build the pipeline that prepares a raw trips DataFrame.

    The returned model applies three stages in order: ``SelectColumns``,
    ``RenameColumns`` and ``NormalizeColumnTypes``.

    Args:
        column_names: Column names handed to ``SelectColumns``.
        column_new_names: Mapping handed to ``RenameColumns``. Callers in this
            notebook pass ``{original_name: new_name}``.
        column_types: Mapping handed to ``NormalizeColumnTypes``. Callers in this
            notebook pass ``{column_name: type_name}`` keyed by the renamed columns.

    Returns:
        A ``PipelineModel`` wrapping the three stages; call ``.transform(df)`` on it.
    """
    return PipelineModel([
        SelectColumns(column_names),
        RenameColumns(column_new_names),
        NormalizeColumnTypes(column_types)
    ])

# Column configuration for the cell-pair extraction: only the pickup time and
# the four coordinates are selected.
column_names = [
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]

# Raw coordinate column name -> shorter name.
column_new_names = {
    'pickup_longitude': 'pickup_lon',
    'pickup_latitude': 'pickup_lat',
    'dropoff_longitude': 'dropoff_lon',
    'dropoff_latitude': 'dropoff_lat',
}

# Target type per renamed coordinate column ('pickup_datetime' has no entry here).
column_types = {
    'pickup_lon': 'double',
    'pickup_lat': 'double',
    'dropoff_lon': 'double',
    'dropoff_lat': 'double',
}

# Test trips, loaded and tagged with pickup/dropoff cell ids. This frame is what
# the `Union` stage below is given.
test_cell_stages = [
    load_trips(column_names, column_new_names, column_types),
    CellId(18, 'pickup_lat', 'pickup_lon', 'pickup_cell'),
    CellId(18, 'dropoff_lat', 'dropoff_lon', 'dropoff_cell'),
]
test_trips_with_cells = PipelineModel(test_cell_stages).transform(
    CsvDataFrame('../data/raw/test.csv', spark)
)

# Training trips go through the same load + cell-id stages, then the `Union`
# stage (presumably appending the test rows -- confirm in the transformer),
# after which only the two cell columns are kept and duplicates are dropped.
train_cell_stages = [
    load_trips(column_names, column_new_names, column_types),
    CellId(18, 'pickup_lat', 'pickup_lon', 'pickup_cell'),
    CellId(18, 'dropoff_lat', 'dropoff_lon', 'dropoff_cell'),
    Union(test_trips_with_cells),
    SelectColumns(['pickup_cell_18', 'dropoff_cell_18']),
    DropDuplicates(),
]
df = PipelineModel(train_cell_stages).transform(
    CsvDataFrame('../data/raw/train.csv', spark)
)


# Preview, schema, and finally the row count (displayed as the cell's result).
df.show(2)
df.printSchema()
df.count()

+--------------+---------------+
|pickup_cell_18|dropoff_cell_18|
+--------------+---------------+
|    89c259ae67|     89c258ec75|
|    89c259bd67|     89c258596f|
+--------------+---------------+
only showing top 2 rows

root
 |-- pickup_cell_18: string (nullable = true)
 |-- dropoff_cell_18: string (nullable = true)



2000986

In [4]:
# Row count of `df`. When the notebook is run top to bottom, `df` is the
# de-duplicated (pickup_cell_18, dropoff_cell_18) frame built in the previous cell.
# NOTE(review): the recorded output below (9248) differs from the count recorded
# for the previous cell (2000986), and this cell's execution number (4) precedes
# that cell's (6) -- the output looks like it comes from an earlier run; confirm.
df.count()

9248