In [47]:
import findspark
# Make the local Spark installation importable as a regular Python package;
# this has to run before `pyspark` is imported.
findspark.init()

In [48]:
import pyspark
from pyspark.sql import SparkSession

In [49]:
# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [50]:
# Read every file matching data/raw/green/*/* as CSV, treating the first row
# as the header and letting Spark infer the column types.
df_green = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/raw/green/*/*')
)

In [51]:
# Expose the DataFrame to Spark SQL under the name `green`.
# createOrReplaceTempView supersedes the deprecated registerTempTable and,
# like it, replaces any existing view of the same name, so the cell is
# safe to re-run.
df_green.createOrReplaceTempView('green')



In [63]:
# Aggregate the `green` view per pickup zone and pickup hour: sum of
# total_amount and row count, restricted to pickups on or after 2020-01-01.
# `GROUP BY 1, 2` refers to the first two SELECT columns (zone, hour).
df_green_revenue = spark.sql("""
SELECT 
    -- Reveneue grouping 
    PULocationID AS zone,
    date_trunc('hour', lpep_pickup_datetime) AS hour, 

    SUM(total_amount) AS amount,
    count(*) as number_records
FROM
    green
WHERE
    lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [64]:
# Print a preview of the aggregated green revenue rows (zone, hour, amount,
# number_records).
df_green_revenue.show()

+----+-------------------+------------------+--------------+
|zone|               hour|            amount|number_records|
+----+-------------------+------------------+--------------+
|   7|2020-01-01 00:00:00| 769.7299999999997|            45|
| 217|2020-01-01 01:00:00|             12.55|             2|
|  82|2020-01-01 03:00:00| 401.1400000000002|            34|
| 247|2020-01-01 05:00:00|35.400000000000006|             3|
|  80|2020-01-01 09:00:00|            554.73|            15|
|  72|2020-01-01 09:00:00|               6.3|             1|
| 167|2020-01-01 11:00:00|               0.0|             1|
| 131|2020-01-01 18:00:00|              10.3|             1|
|   7|2020-01-01 19:00:00|            182.14|            17|
|  95|2020-01-02 13:00:00|372.05000000000007|            22|
|  74|2020-01-02 04:00:00|             43.55|             3|
|  52|2020-01-02 06:00:00|             35.17|             1|
|  42|2020-01-02 11:00:00| 503.5600000000002|            32|
| 243|2020-01-02 11:00:0

In [54]:
# Persist the hourly green revenue as Parquet, replacing any previous output
# at the same path.
(
    df_green_revenue.write
    .mode('overwrite')
    .parquet('data/report/revenue/green')
)

In [65]:
# Read every file matching data/raw/yellow/*/* as CSV, treating the first row
# as the header and letting Spark infer the column types.
df_yellow = spark.read.csv('data/raw/yellow/*/*', header=True, inferSchema=True)

# Expose the DataFrame to Spark SQL under the name `yellow`.
# createOrReplaceTempView supersedes the deprecated registerTempTable and,
# like it, replaces any existing view of the same name, so the cell is
# safe to re-run.
df_yellow.createOrReplaceTempView('yellow')

In [66]:
# Aggregate the `yellow` view per pickup zone and pickup hour: sum of
# total_amount and row count, restricted to pickups on or after 2020-01-01.
# `GROUP BY 1, 2` refers to the first two SELECT columns (zone, hour).
df_yellow_revenue = spark.sql("""
SELECT 
    -- Reveneue grouping 
    PULocationID AS zone,
    date_trunc('hour', tpep_pickup_datetime) AS hour, 

    SUM(total_amount) AS amount,
    count(*) as number_records
FROM
    yellow
WHERE
    tpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [67]:
# Print a preview of the aggregated yellow revenue rows (zone, hour, amount,
# number_records).
df_yellow_revenue.show()

+----+-------------------+------------------+--------------+
|zone|               hour|            amount|number_records|
+----+-------------------+------------------+--------------+
|   7|2020-01-01 00:00:00| 455.1700000000002|            38|
| 249|2020-01-01 01:00:00| 5515.590000000013|           272|
| 217|2020-01-01 01:00:00|41.400000000000006|             3|
|  82|2020-01-01 03:00:00|211.70000000000005|            14|
| 247|2020-01-01 05:00:00|               8.8|             1|
|  79|2020-01-01 07:00:00|2021.6899999999978|           106|
| 163|2020-01-01 07:00:00|1355.3999999999999|            45|
| 217|2020-01-01 07:00:00|              22.3|             1|
|  80|2020-01-01 09:00:00|             81.92|             4|
|  24|2020-01-01 10:00:00|            198.02|            11|
|  79|2020-01-01 14:00:00|2402.3199999999997|           154|
| 131|2020-01-01 18:00:00|              31.0|             1|
|   7|2020-01-01 19:00:00|              64.5|             5|
| 148|2020-01-01 21:00:0

In [58]:
# Persist the hourly yellow revenue as Parquet, replacing any previous output
# at the same path.
(
    df_yellow_revenue.write
    .mode('overwrite')
    .parquet('data/report/revenue/yellow')
)

In [68]:
# Prefix the metric columns with the dataset name so both frames can be joined
# without column-name clashes; `hour` and `zone` keep their names and serve as
# the join keys.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_records', 'green_number_records')
)

df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_records', 'yellow_number_records')
)

In [69]:
# Outer join on (hour, zone): an hour/zone pair that exists in only one of the
# two frames is kept, with nulls in the other frame's columns.
df_join = df_green_revenue_tmp.join(
    df_yellow_revenue_tmp,
    on=['hour', 'zone'],
    how='outer',
)

In [70]:
# Print a preview of the joined green/yellow revenue rows.
df_join.show()

+-------------------+----+------------+--------------------+-------------+---------------------+
|               hour|zone|green_amount|green_number_records|yellow_amount|yellow_number_records|
+-------------------+----+------------+--------------------+-------------+---------------------+
|2020-01-01 18:00:00|   1|        null|                null|       400.41|                    3|
|2020-01-01 19:00:00|   1|        null|                null|        96.35|                    1|
|2020-01-02 01:00:00|   1|        null|                null|        84.36|                    1|
|2020-01-02 11:00:00|   1|        null|                null|       215.66|                    2|
|2020-01-03 13:00:00|   1|        null|                null|       428.27|                    4|
|2020-01-03 18:00:00|   1|        null|                null|       226.26|                    2|
|2020-01-05 03:00:00|   1|        null|                null|         90.3|                    1|
|2020-01-05 08:00:00|   1|    

In [71]:
# Persist the combined green/yellow revenue as Parquet.
# mode='overwrite' matches the green and yellow report writes; without it the
# writer's default mode raises an error when the target path already exists,
# so re-running the notebook would fail on this cell.
df_join.write.parquet('data/report/revenue/total', mode='overwrite')