In [0]:
%run "../includes/configuration"

In [0]:
# Read the race_results Parquet data stored under presentation_folder_path.
# NOTE(review): presentation_folder_path is presumably defined by the configuration
# notebook pulled in via %run at the top — confirm.
race_results_df = (
    spark.read
    .parquet(f"{presentation_folder_path}/race_results")
)

In [0]:
# Render the freshly loaded DataFrame to inspect its contents before aggregating.
display(race_results_df)

In [0]:
# Keep only the rows whose race_year is 2020; used as the demo data set below.
demo_df = (
    race_results_df
    .filter("race_year = 2020")
)

In [0]:
# Render the filtered (race_year = 2020) subset.
display(demo_df)

#### Simple Agg Func

In [0]:
from pyspark.sql.functions import count, countDistinct, sum

In [0]:
# Whole-frame aggregates (no grouping): total number of rows and the number of
# distinct race_name values in demo_df.
(
    demo_df
    .select(
        count("*"),
        countDistinct("race_name"),
    )
    .show()
)

In [0]:
# Total points and number of distinct races for a single driver.
# The output columns are named with .alias() directly on the aggregate expressions
# instead of renaming Spark's auto-generated names ("sum(points)",
# "count(DISTINCT race_name)") afterwards: withColumnRenamed is a silent no-op when
# the old name does not match, so the rename could be skipped without any error.
demo_df.filter("driver_name = 'Lewis Hamilton'") \
    .select(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    ) \
    .show()

#### Group By

In [0]:
# Intentional failure demo: the grouped-data sum("points") returns a DataFrame, and a
# DataFrame has no countDistinct() method, so chaining a second aggregate raises
# AttributeError. The error is caught and printed so the demonstration stays visible
# while a full "Run All" can still continue past this cell.
try:
    demo_df \
        .groupBy("driver_name") \
        .sum("points") \
        .countDistinct("race_name") \
        .show()
except AttributeError as err:
    print(f"Expected failure: {err}")

- `groupBy` returns a grouped data object, and the aggregate methods (such as `sum`) can only be called on that grouped data object.
  So, after calling one aggregate function, you cannot chain another aggregate function onto the result, because the first aggregate function returns a DataFrame.
- To work around this, use `agg()`, which lets us apply multiple aggregate functions at once. It uses the built-in aggregate functions (the ones imported from `pyspark.sql.functions`) rather than the methods on the grouped data object.

In [0]:
# Per-driver aggregates computed in a single agg() call; each aggregate expression
# is aliased so the result has readable column names.
(
    demo_df
    .groupBy("driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .show()
)

#### Window Functions

In [0]:
# Rebind demo_df to the rows whose race_year is 2019 or 2020, so the window-function
# example below has more than one year to partition by.
demo_df = (
    race_results_df
    .filter("race_year in (2019, 2020)")
)

In [0]:
# One row per (race_year, driver_name) with the driver's summed points and the
# number of distinct races in that year.
demo_grouped_df = (
    demo_df
    .groupBy("race_year", "driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
)

In [0]:
# Render the per-year, per-driver aggregates that the window function will rank.
display(demo_grouped_df)

##### Step 1 - First, create a window specification: how to partition the rows and in what order to sort them within each partition.
##### Step 2 - Then, specify which function to apply over that window to produce the output.

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

# Step 1: window specification — one partition per race_year, rows within each
# partition ordered by total_points descending.
driverRankSpec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"))
)

# Step 2: add a "rank" column evaluated over that window and print up to 100 rows.
(
    demo_grouped_df
    .withColumn("rank", rank().over(driverRankSpec))
    .show(100)
)