In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import count_distinct
from pyspark.sql.functions import count
from pyspark.sql.functions import round
from pyspark.sql.functions import sum


spark = SparkSession.builder\
    .master("local[*]")\
    .appName("SparkFirst")\
    .config("spark.executor.memory", "10g")\
    .config("spark.executor.cores", 5)\
    .config("spark.dynamicAllocation.enabled", "true")\
    .config("spark.dynamicAllocation.maxExecutors", 5)\
    .config("spark.shuffle.service.enabled", "true")\
.getOrCreate()

In [2]:
df = spark.read.option('header', 'true').csv('yellow_tripdata_2020-01.csv')

In [4]:
df.createOrReplaceTempView('taxi_service')

sql = """
WITH
clear_date_from_table
AS
(
	SELECT date(tpep_pickup_datetime) AS date
	FROM taxi_service ts 
),
date_count
AS
(
	SELECT count(date) AS count_date,
	date AS date
	FROM clear_date_from_table
	GROUP BY date
),
info_zero
AS
(
	SELECT max(total_amount) AS zero_max_amm,
	min(total_amount) AS zero_min_amm,
	date(tpep_pickup_datetime) AS date,
	count(passenger_count) AS percentage_zero
	FROM taxi_service ts 
	WHERE passenger_count = 0
	GROUP BY date
),
info_1p
AS
(
	SELECT max(total_amount) AS max_amm_1p,
	min(total_amount) AS min_amm_1p,
	date(tpep_pickup_datetime) AS date,
	count(passenger_count) AS percentage_1p
	FROM taxi_service ts 
	WHERE passenger_count = 1
	GROUP BY date
),
info_2p
AS
(
	SELECT max(total_amount) AS max_amm_2p,
	min(total_amount) AS min_amm_2p,
	date(tpep_pickup_datetime) AS date,
	count(passenger_count) AS percentage_2p
	FROM taxi_service ts 
	WHERE passenger_count = 2
	GROUP BY date
),
info_3p
AS
(
	SELECT max(total_amount) AS max_amm_3p,
	min(total_amount) AS min_amm_3p,
	date(tpep_pickup_datetime) AS date,
	count(passenger_count) AS percentage_3p
	FROM taxi_service ts 
	WHERE passenger_count = 3
	GROUP BY date
),
info_4p_plus
AS
(
	SELECT max(total_amount) AS max_amm_4p_plus,
	min(total_amount) AS min_amm_4p_plus,
	date(tpep_pickup_datetime) AS date,
	count(passenger_count) AS percentage_4p_plus
	FROM taxi_service ts 
	WHERE passenger_count > 3
	GROUP BY date
)
SELECT date,
COALESCE(round((CAST(percentage_zero AS decimal) / count_date) * 100), 0) AS percentage_zero,
COALESCE(zero_max_amm, 0) AS zero_max_amm,
COALESCE(zero_min_amm, 0) AS zero_min_amm,
COALESCE(round((CAST(percentage_1p AS decimal) / count_date) * 100), 0) AS percentage_1p,
COALESCE(max_amm_1p, 0) AS max_amm_1p,
COALESCE(min_amm_1p, 0) AS min_amm_1p,
COALESCE(round((CAST(percentage_2p AS decimal) / count_date) * 100), 0) AS percentage_2p,
COALESCE(max_amm_2p, 0) AS max_amm_2p,
COALESCE(min_amm_2p, 0) AS min_amm_2p,
COALESCE(round((CAST(percentage_3p AS decimal) / count_date) * 100), 0) AS percentage_3p,
COALESCE(max_amm_3p, 0) AS max_amm_3p,
COALESCE(min_amm_3p, 0) AS min_amm_3p,
COALESCE(round((CAST(percentage_4p_plus AS decimal) / count_date) * 100), 0) AS percentage_4p_plus,
COALESCE(max_amm_4p_plus, 0) AS max_amm_4p_plus,
COALESCE(min_amm_4p_plus, 0) AS min_amm_4p_plus
FROM date_count 
LEFT JOIN info_1p USING(date)
LEFT JOIN info_2p USING(date)
LEFT JOIN info_3p USING(date)
LEFT JOIN info_4p_plus USING(date)
LEFT JOIN info_zero USING(date)
"""

df2 = spark.sql(sql)

df2.createOrReplaceTempView('taxi_service_done')

df2.show(100)

+----------+---------------+------------+------------+-------------+----------+----------+-------------+----------+----------+-------------+----------+----------+------------------+---------------+---------------+
|      date|percentage_zero|zero_max_amm|zero_min_amm|percentage_1p|max_amm_1p|min_amm_1p|percentage_2p|max_amm_2p|min_amm_2p|percentage_3p|max_amm_3p|min_amm_3p|percentage_4p_plus|max_amm_4p_plus|min_amm_4p_plus|
+----------+---------------+------------+------------+-------------+----------+----------+-------------+----------+----------+-------------+----------+----------+------------------+---------------+---------------+
|2009-01-01|              0|           0|           0|           79|       8.8|      10.3|            5|      31.3|      31.3|            5|      13.8|      13.8|                11|            9.3|          61.42|
|2019-12-18|              0|           0|           0|           50|      2.81|      2.81|            0|         0|         0|            0|    