# Working with RDDs

In [1]:
import pyspark
from datetime import datetime
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session running locally under the "NYTaxi" app name.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("NYTaxi")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/27 19:10:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df_green = spark.read.parquet('data/pq/green/*/*')

                                                                                

We want to implement the following, but this time with RDDs instead of the convenient API that DataFrames provide.

``` sql
SELECT 
    date_trunc('hour', lpep_pickup_datetime) AS hour, 
    PULocationID AS zone,

    SUM(total_amount) AS amount,
    COUNT(1) AS number_records
FROM
    green
WHERE
    lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
```

In [4]:
df_green.rdd

MapPartitionsRDD[7] at javaToPython at NativeMethodAccessorImpl.java:0

In [5]:
df_green.rdd.take(5)

                                                                                

[Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), lpep_dropoff_datetime=datetime.datetime(2020, 1, 23, 13, 38, 16), store_and_fwd_flag='N', RatecodeID=1, PULocationID=74, DOLocationID=130, passenger_count=1, trip_distance=12.77, fare_amount=36.0, extra=0.0, mta_tax=0.5, tip_amount=2.05, tolls_amount=6.12, ehail_fee=None, improvement_surcharge=0.3, total_amount=44.97, payment_type=1, trip_type=1, congestion_surcharge=0.0),
 Row(VendorID=None, lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), lpep_dropoff_datetime=datetime.datetime(2020, 1, 20, 15, 46), store_and_fwd_flag=None, RatecodeID=None, PULocationID=67, DOLocationID=39, passenger_count=None, trip_distance=8.0, fare_amount=29.9, extra=2.75, mta_tax=0.5, tip_amount=0.0, tolls_amount=0.0, ehail_fee=None, improvement_surcharge=0.3, total_amount=33.45, payment_type=None, trip_type=None, congestion_surcharge=None),
 Row(VendorID=2, lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41)

In [6]:
# Keep just the three columns the query touches, then drop down to the RDD API
rdd = (
    df_green
    .select("lpep_pickup_datetime", "PULocationID", "total_amount")
    .rdd
)

In [7]:
rdd.take(5)

[Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 20, 15, 9), PULocationID=67, total_amount=33.45),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 15, 20, 23, 41), PULocationID=260, total_amount=8.3),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 5, 16, 32, 26), PULocationID=82, total_amount=8.3),
 Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 29, 19, 22, 42), PULocationID=166, total_amount=12.74)]

## Implementing the `WHERE` Clause with `filter()`

In [8]:
# Inclusive lower bound of the WHERE clause: 2020-01-01 00:00:00
start_date = datetime(year=2020, month=1, day=1)


def filter_date_outliers(row):
    """Return True when the row's pickup time is on or after ``start_date``.

    Mirrors ``WHERE lpep_pickup_datetime >= '2020-01-01 00:00:00'``. A missing
    (``None``) pickup time yields False instead of raising ``TypeError``, which
    matches SQL, where a comparison against NULL never satisfies WHERE.

    Args:
        row: Any object exposing a ``lpep_pickup_datetime`` attribute
            (a ``datetime`` or ``None``).

    Returns:
        bool: True to keep the row, False to drop it.
    """
    pickup = row.lpep_pickup_datetime
    return pickup is not None and pickup >= start_date

# An inline lambda would also work here, e.g.:
#   rdd.filter(lambda row: row.lpep_pickup_datetime >= start_date)
rdd.filter(filter_date_outliers)

PythonRDD[16] at RDD at PythonRDD.scala:53

## Preparing for Grouping with `map()` and Implementing the `GROUP BY` Clause with `reduceByKey()`

In [9]:
sample_row = rdd.take(1)[0]
sample_row

Row(lpep_pickup_datetime=datetime.datetime(2020, 1, 23, 13, 10, 15), PULocationID=74, total_amount=44.97)

In [11]:
# We only want the hour (see `date_trunc` in the SQL query)
sample_row.lpep_pickup_datetime.replace(minute=0, second=0, microsecond=0)

datetime.datetime(2020, 1, 23, 13, 0)

In [12]:
def prepare_for_grouping(row):
    """Turn one trip row into a ``(key, value)`` pair for per-key aggregation.

    The key is ``(hour, zone)``: the pickup time truncated to the hour (the
    counterpart of ``date_trunc('hour', ...)`` in the SQL query) and the pickup
    location id. The value is ``(amount, count)``: the row's total amount and a
    count of 1.
    """
    pickup_hour = row.lpep_pickup_datetime.replace(minute=0, second=0, microsecond=0)
    return ((pickup_hour, row.PULocationID), (row.total_amount, 1))

In [14]:
# Preview the keyed records: filter first, then map each row to (key, value)
(
    rdd
    .filter(filter_date_outliers)
    .map(prepare_for_grouping)
    .take(5)
)

[((datetime.datetime(2020, 1, 23, 13, 0), 74), (44.97, 1)),
 ((datetime.datetime(2020, 1, 20, 15, 0), 67), (33.45, 1)),
 ((datetime.datetime(2020, 1, 15, 20, 0), 260), (8.3, 1)),
 ((datetime.datetime(2020, 1, 5, 16, 0), 82), (8.3, 1)),
 ((datetime.datetime(2020, 1, 29, 19, 0), 166), (12.74, 1))]

In [16]:
def custom_calc(left_value, right_value):
    """Merge two ``(amount, count)`` pairs by adding them element-wise.

    Used as the reducer for ``reduceByKey``: both arguments and the result
    share the same ``(amount, count)`` shape.
    """
    (amount_a, count_a), (amount_b, count_b) = left_value, right_value
    return (amount_a + amount_b, count_a + count_b)

In [17]:
# Aggregate per (hour, zone) key and peek at the first few results
(
    rdd
    .filter(filter_date_outliers)
    .map(prepare_for_grouping)
    .reduceByKey(custom_calc)
    .take(5)
)

                                                                                

[((datetime.datetime(2020, 1, 15, 20, 0), 260), (163.90000000000003, 14)),
 ((datetime.datetime(2020, 1, 29, 19, 0), 166), (695.0099999999999, 45)),
 ((datetime.datetime(2020, 1, 16, 8, 0), 41), (736.1399999999996, 54)),
 ((datetime.datetime(2020, 1, 4, 20, 0), 129), (583.27, 38)),
 ((datetime.datetime(2020, 1, 2, 8, 0), 66), (197.69, 10))]

That took considerably longer, because `reduceByKey` had to go through and aggregate the entire dataset before it could return even five results! Now let's just format the output (the nested tuples aren't very nice).

In [18]:
def unwrap(row):
    """Flatten ``((hour, zone), (amount, count))`` into ``(hour, zone, amount, count)``."""
    key = row[0]
    hour, zone = key[0], key[1]
    value = row[1]
    amount, count = value[0], value[1]
    return (hour, zone, amount, count)

In [19]:
# Same aggregation as before, with each result flattened into a plain 4-tuple
(
    rdd
    .filter(filter_date_outliers)
    .map(prepare_for_grouping)
    .reduceByKey(custom_calc)
    .map(unwrap)
    .take(5)
)

                                                                                

[(datetime.datetime(2020, 1, 15, 20, 0), 260, 163.90000000000003, 14),
 (datetime.datetime(2020, 1, 29, 19, 0), 166, 695.0099999999999, 45),
 (datetime.datetime(2020, 1, 16, 8, 0), 41, 736.1399999999996, 54),
 (datetime.datetime(2020, 1, 4, 20, 0), 129, 583.27, 38),
 (datetime.datetime(2020, 1, 2, 8, 0), 66, 197.69, 10)]

And finally turn it into a DataFrame.

In [21]:
# Convert the flattened tuples to a DataFrame (no schema supplied) and display it
(
    rdd
    .filter(filter_date_outliers)
    .map(prepare_for_grouping)
    .reduceByKey(custom_calc)
    .map(unwrap)
    .toDF()
    .show()
)

                                                                                

+-------------------+---+------------------+---+
|                 _1| _2|                _3| _4|
+-------------------+---+------------------+---+
|2020-01-15 20:00:00|260|163.90000000000003| 14|
|2020-01-29 19:00:00|166| 695.0099999999999| 45|
|2020-01-16 08:00:00| 41| 736.1399999999996| 54|
|2020-01-04 20:00:00|129|            583.27| 38|
|2020-01-02 08:00:00| 66|            197.69| 10|
|2020-01-03 09:00:00| 61|            142.21|  9|
|2020-01-17 21:00:00|236|              33.6|  4|
|2020-01-12 12:00:00| 82|            290.41| 14|
|2020-01-28 16:00:00|197| 831.4399999999998| 18|
|2020-01-10 22:00:00| 95| 407.7100000000002| 37|
|2020-01-10 01:00:00|215|            109.69|  2|
|2020-01-07 18:00:00| 25| 554.2900000000001| 37|
|2020-01-18 07:00:00| 55|              48.3|  1|
|2020-01-28 09:00:00|166| 473.0200000000002| 36|
|2020-01-12 15:00:00| 82| 265.7900000000001| 29|
|2020-01-10 20:00:00| 66|            405.88| 21|
|2020-01-31 15:00:00| 43|345.58000000000004| 19|
|2020-01-31 21:00:00

Oh no! Unsurprisingly, we have lost our column names (they are now `_1` through `_4`) and our schema! Fret not, as we will restore them with named tuples 😎