## Demo of simple aggregation functions in Spark


In [0]:
%run "../includes/configurations"

In [0]:
# Read the race_results parquet dataset into a DataFrame.
# NOTE(review): presentation_folder_path is presumably defined by the
# "../includes/configurations" notebook run earlier — confirm.
race_results_path = f"{presentation_folder_path}/race_results"
race_results_df = spark.read.parquet(race_results_path)

# Show how many rows were loaded.
display(race_results_df.count())

In [0]:
from pyspark.sql.functions import count, countDistinct, sum

# count("*") counts every row, whereas count(<column>) only counts rows in
# which that column is not null; show both results one after the other.
for counted_column in ("*", "race_name"):
    race_results_df.select(count(counted_column)).show()

In [0]:
# Number of distinct races held in the 2020 season.
(
    race_results_df
    .filter("race_year = 2020")
    .select(countDistinct("race_name"))
    .show()
)

# Number of distinct seasons (race_year values) in the whole dataset.
(
    race_results_df
    .select(countDistinct("race_year"))
    .show()
)


In [0]:
# Total points scored in 2020, first by Lewis Hamilton and then by Max Verstappen.
# Each filter is a SQL expression string; the same sum("points") aggregation is
# applied to the rows that match it.
driver_filters_2020 = (
    "race_year = 2020 and driver_name = 'Lewis Hamilton'",
    "race_year = 2020 and driver_name = 'Max Verstappen'",
)
for driver_filter in driver_filters_2020:
    race_results_df.filter(driver_filter).select(sum("points")).show()

In [0]:
# Total points and number of distinct races for Lewis Hamilton in 2020.
# The result columns are named with .alias() on the aggregate expressions
# themselves rather than by renaming Spark's auto-generated column names
# ("sum(points)", "count(DISTINCT race_name)") afterwards: withColumnRenamed is
# a silent no-op when the given name is not in the schema, so any difference in
# the generated name would leave the columns un-renamed without raising an error.
race_results_df.filter("race_year = 2020 and driver_name = 'Lewis Hamilton'") \
    .select(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    ) \
    .show()

#### Using Group By method

In [0]:
from pyspark.sql.functions import desc, avg
# Top 10 drivers of 2019 by total points.
# Calling an aggregation method such as .sum() directly on the grouped data
# returns a DataFrame immediately, so only one aggregation can be applied this
# way. The result is ordered on the generated "sum(points)" column and that
# column is then renamed to total_points.
(
    race_results_df
    .filter("race_year = 2019")
    .groupBy("driver_name")
    .sum("points")
    .orderBy(desc("sum(points)"))
    .withColumnRenamed("sum(points)", 'total_points')
    .limit(10)
    .show()
)

In [0]:
# .agg() accepts several aggregate expressions at once, which is how more than
# one aggregation is applied after a group-by.
# Here: total points and number of distinct races per driver for 2020,
# keeping the 10 drivers with the most points.
(
    race_results_df
    .filter("race_year = 2020")
    .groupBy("driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .orderBy(desc("total_points"))
    .limit(10)
    .show()
)



In [0]:
# Total points and number of distinct races per team for 2020, computed with
# two aggregate expressions in a single .agg() call; the 10 teams with the most
# points are shown.
(
    race_results_df
    .filter("race_year = 2020")
    .groupBy("team")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .orderBy(desc("total_points"))
    .limit(10)
    .show()
)



#### Window Functions in Spark

In [0]:
# Per-season driver totals for 2019 and 2020: one row per (race_year, driver_name)
# with the driver's total points and number of distinct races, ordered by
# total points descending.
demo_groupby_df = (
    race_results_df
    .filter("race_year in (2019, 2020)")
    .groupBy("race_year", "driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .orderBy(desc("total_points"))
)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

# Window specification: rows are partitioned by race_year and, within each
# partition, ordered by total_points from highest to lowest.
driverRankSpec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"))
)

# Add a "rank" column computed by rank() over that window and display the result.
ranked_drivers_df = demo_groupby_df.withColumn("rank", rank().over(driverRankSpec))
display(ranked_drivers_df)