In [None]:
from pyspark.sql.functions import (
    asc,
    col,
    count,
    current_timestamp,
    desc,
    rank,
    sum,
    when,
)
from pyspark.sql.window import Window

In [None]:
%run "includes/configuration"

In [None]:
%sql
-- Create the presentation-layer database if it does not already exist and
-- bind it to its storage location. LOCATION is a clause of CREATE DATABASE,
-- so it has to come before the terminating semicolon; otherwise the database
-- is created without it and the LOCATION line is a separate, invalid statement.
CREATE DATABASE IF NOT EXISTS f1_presentation
LOCATION "s3://<bucket>/presentation";

### 1. Race results

In [None]:
# Load the processed datasets. Generic column names ("name", "number",
# "location", "time", ...) are renamed up front so that each column is
# self-describing and distinct once the frames are joined together.

# Races: name -> race_name, race_timestamp -> race_date.
race_df = (
    spark.read.parquet(f"{processed_folder_path}/races.parquet")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

# Drivers: prefix name / number / nationality with "driver_".
driver_df = (
    spark.read.parquet(f"{processed_folder_path}/drivers.parquet")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("nationality", "driver_nationality")
)

# Constructors: the constructor name is exposed as "team".
constructor_df = (
    spark.read.parquet(f"{processed_folder_path}/constructors.parquet")
    .withColumnRenamed("name", "team")
)

# Circuits: location -> circuit_location.
circuit_df = (
    spark.read.parquet(f"{processed_folder_path}/circuits.parquet")
    .withColumnRenamed("location", "circuit_location")
)

# Results: time -> race_time.
result_df = (
    spark.read.parquet(f"{processed_folder_path}/results.parquet")
    .withColumnRenamed("time", "race_time")
)

In [None]:
# Build the denormalised race-results frame: one row per result, enriched with
# race, driver, constructor and circuit attributes. Every join is an inner
# join, so a result without a matching row in any of the lookups is dropped.
# Circuits are reached through the race (race_df.circuit_id), not the result.
race_result_df = (
    result_df
    .join(race_df, result_df.race_id == race_df.race_id, "inner")
    .join(driver_df, result_df.driver_id == driver_df.driver_id, "inner")
    .join(
        constructor_df,
        result_df.constructor_id == constructor_df.constructor_id,
        "inner",
    )
    .join(circuit_df, race_df.circuit_id == circuit_df.circuit_id, "inner")
    # Keep only the presentation columns; join keys are not carried forward.
    .select(
        "race_year",
        "race_name",
        "race_date",
        "circuit_location",
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
    )
    # Stamp every row with the time this frame was computed.
    .withColumn("created_date", current_timestamp())
)

In [None]:
# Sanity check: show the 2020 results, highest points first.
display(
    race_result_df
    .filter("race_year == 2020")
    .orderBy(race_result_df.points.desc())
)

In [None]:
# race_result_df.write.mode("overwrite") \
#     .parquet(f"{presentation_folder_path}/race_results.parquet")

In [None]:
# Save the race results as the Parquet table f1_presentation.race_results,
# overwriting whatever the table currently holds.
(
    race_result_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("f1_presentation.race_results")
)

### 2. Driver standings

In [None]:
# Per-season driver totals: summed points plus the number of wins.
# `when(cond, True)` without `otherwise()` yields NULL for non-matching rows
# and `count()` ignores NULLs, so "wins" counts only rows with position == 1.
# `sum`, `count`, `when` and `col` here are the pyspark.sql.functions versions
# imported at the top of the notebook (`sum` shadows the Python builtin).
# NOTE(review): "team" is part of the grouping key, so a driver who raced for
# two teams in one season gets one row per team - confirm that is intended.
driver_standing_df = (
    race_result_df
    .groupBy("race_year", "driver_name", "driver_nationality", "team")
    .agg(
        sum("points").alias("total_points"),
        count(when(col("position") == 1, True)).alias("wins"),
    )
)

In [None]:
# Rank drivers within each season: most points first, wins as the tie-breaker.
# The window spec is wrapped in parentheses instead of using backslash
# continuations; a stray trailing backslash previously fused the assignment
# below onto this expression and made the cell a SyntaxError.
driver_rank_spec = (
    Window.partitionBy("race_year")
    .orderBy(desc("total_points"), desc("wins"))
)

# rank() gives rows that tie on both ordering columns the same rank.
driver_standing_df = driver_standing_df.withColumn(
    "rank", rank().over(driver_rank_spec)
)

In [None]:
# Sanity check: show the 2020 driver standings.
display(
    driver_standing_df.filter("race_year = 2020")
)

In [None]:
# driver_standing_df.write.mode("overwrite") \
#     .parquet(f"{presentation_folder_path}/driver_standings.parquet")

In [None]:
# Save the driver standings as the Parquet table
# f1_presentation.driver_standings, overwriting its current contents.
(
    driver_standing_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("f1_presentation.driver_standings")
)

### 3. Constructor standings

In [None]:
# Per-season constructor (team) totals: summed points plus the number of wins.
# `when(cond, True)` without `otherwise()` yields NULL for non-matching rows
# and `count()` ignores NULLs, so "wins" counts only rows with position == 1.
# `sum`, `count`, `when` and `col` here are the pyspark.sql.functions versions
# imported at the top of the notebook (`sum` shadows the Python builtin).
constructor_standing_df = (
    race_result_df
    .groupBy("race_year", "team")
    .agg(
        sum("points").alias("total_points"),
        count(when(col("position") == 1, True)).alias("wins"),
    )
)

In [None]:
# Rank constructors within each season: most points first, wins as the
# tie-breaker. The window spec is wrapped in parentheses instead of using
# backslash continuations; a stray trailing backslash previously fused the
# assignment below onto this expression and made the cell a SyntaxError.
constructor_rank_spec = (
    Window.partitionBy("race_year")
    .orderBy(desc("total_points"), desc("wins"))
)

# rank() gives rows that tie on both ordering columns the same rank.
constructor_standing_df = constructor_standing_df.withColumn(
    "rank", rank().over(constructor_rank_spec)
)

In [None]:
# Sanity check: show the 2020 constructor standings.
display(
    constructor_standing_df.filter("race_year = 2020")
)

In [None]:
# constructor_standing_df.write.mode("overwrite") \
#     .parquet(f"{presentation_folder_path}/constructor_standings.parquet")

In [None]:
# Save the constructor standings as the Parquet table
# f1_presentation.constructor_standings, overwriting its current contents.
(
    constructor_standing_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("f1_presentation.constructor_standings")
)