## Race results from 5 tables
- drivers
- circuits
- constructors
- races
- results

In [0]:
%run "../includes/configurations"

In [0]:
# Load the processed drivers data and give its generic column names
# driver-specific ones before it is combined with the other tables.
drivers_df = (
    spark.read.parquet(f"{processed_folder_path}/drivers")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("nationality", "driver_nationality")
)

In [0]:
# Load the processed constructors data; the constructor's "name" column is
# exposed as "team".
constructors_df = (
    spark.read.parquet(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

In [0]:
# Load the processed circuits data; "location" is exposed as "circuit_location".
circuits_df = (
    spark.read.parquet(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

In [0]:
# Load the processed races data; "name" becomes "race_name" and
# "race_timestamp" becomes "race_date".
races_df = (
    spark.read.parquet(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

In [0]:
# Load the processed results data; "time" is exposed as "race_time".
results_df = (
    spark.read.parquet(f"{processed_folder_path}/results")
    .withColumnRenamed("time", "race_time")
)

In [0]:
# Attach each race's circuit location: inner-join races to circuits on
# circuit_id and keep only the columns needed downstream.
races_circuits_df = (
    races_df
    .join(
        circuits_df,
        on=races_df["circuit_id"] == circuits_df["circuit_id"],
        how="inner",
    )
    .select(
        races_df["race_id"],
        races_df["race_year"],
        races_df["race_name"],
        races_df["race_date"],
        circuits_df["circuit_location"],
    )
)


In [0]:
# Enrich every result row with its race/circuit, driver and constructor
# details. All three joins are inner joins (the default join type, spelled
# out here for clarity).
race_results_df = (
    results_df
    .join(
        races_circuits_df,
        on=results_df["race_id"] == races_circuits_df["race_id"],
        how="inner",
    )
    .join(
        drivers_df,
        on=results_df["driver_id"] == drivers_df["driver_id"],
        how="inner",
    )
    .join(
        constructors_df,
        on=results_df["constructor_id"] == constructors_df["constructor_id"],
        how="inner",
    )
)

In [0]:
from pyspark.sql.functions import current_timestamp

# Keep only the presentation columns and stamp each row with the time this
# dataset was created.
final_df = (
    race_results_df
    .select(
        'race_year',
        'race_name',
        'race_date',
        'circuit_location',
        'driver_name',
        'driver_nationality',
        'team',
        'grid',
        'fastest_lap',
        'race_time',
        'points',
    )
    .withColumn('created_date', current_timestamp())
)

In [0]:
# Render the assembled race-results DataFrame for a visual inspection.
display(final_df)

In [0]:
# Spot check: show only the 2020 Abu Dhabi Grand Prix rows, highest
# points first.
display(
    final_df
    .where("race_year = 2020 and race_name == 'Abu Dhabi Grand Prix'")
    .orderBy(final_df["points"].desc())
)

In [0]:
# Persist the result as parquet in the presentation folder, replacing any
# output left by a previous run.
final_df.write.parquet(
    f"{presentation_folder_path}/race_results",
    mode="overwrite",
)