##### Step 1 - Run the "configuration" and "common_functions" notebooks and read the file-date parameter

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare the "p_file_date" text widget (second argument "2021-04-18" is its default) and
# read its current value. v_file_date is used further down to pick the matching rows of
# the results table.
dbutils.widgets.text("p_file_date", "2021-04-18")
v_file_date = dbutils.widgets.get("p_file_date")

##### Step 2 - Read the required tables into DataFrames and rename columns as needed

In [0]:
# Races: load the processed Delta data and give the generic columns race-specific names
# ("name" is also a column of the drivers and constructors data read below).
races_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

In [0]:
# display(races_df)

In [0]:
# Circuits: load the processed Delta data; "location" becomes "circuit_location".
circuits_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

In [0]:
# display(circuits_df)

In [0]:
# Drivers: load the processed Delta data and prefix the descriptive columns with
# "driver_" so they stay distinguishable after the joins.
drivers_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/drivers")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("nationality", "driver_nationality")
)

In [0]:
# display(drivers_df)

In [0]:
# Constructors: load the processed Delta data; the constructor "name" is exposed as "team".
constructors_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

In [0]:
# display(constructors_df)

In [0]:
# Results: load the processed Delta data, keep only the rows whose file_date equals the
# notebook parameter, and rename the columns that are re-used under other names later
# ("race_id" -> "result_race_id" is the join key in Step 4).
#
# The filter compares the column against v_file_date as a literal value rather than
# splicing the parameter into a SQL string, so a quote character in the widget value
# can neither break nor alter the filter expression.
results_all_df = spark.read.format("delta").load(f"{processed_folder_path}/results")

results_df = (
    results_all_df
    .filter(results_all_df["file_date"] == v_file_date)
    .withColumnRenamed("time", "race_time")
    .withColumnRenamed("race_id", "result_race_id")
    .withColumnRenamed("file_date", "result_file_date")
)

In [0]:
# display(results_df)

##### Step 3 - Join the races and circuits DataFrames

In [0]:
# Match every race to its circuit on circuit_id and carry forward only the race
# attributes plus the circuit location.
race_circuits_df = (
    races_df
    .join(circuits_df, races_df.circuit_id == circuits_df.circuit_id)
    .select(
        races_df.race_id,
        races_df.race_year,
        races_df.race_name,
        races_df.race_date,
        circuits_df.circuit_location,
    )
)
    

##### Step 4 - Join results, drivers, and constructors to race_circuits_df

In [0]:
# Chain three joins: results are matched to races on the race id, then drivers and
# constructors are matched to those results on their respective ids.
race_results_df = (
    race_circuits_df
    .join(results_df, race_circuits_df.race_id == results_df.result_race_id)
    .join(drivers_df, drivers_df.driver_id == results_df.driver_id)
    .join(constructors_df, constructors_df.constructor_id == results_df.constructor_id)
)

In [0]:
from pyspark.sql.functions import current_timestamp

##### Step 5 - Select required fields

In [0]:
# Keep the presentation columns, stamp each row with the current timestamp as
# created_date, and expose the results' file date under its plain name "file_date".
final_df = (
    race_results_df
    .select(
        # race / circuit attributes
        "race_id", "race_year", "race_name", "race_date", "circuit_location",
        # driver and team attributes
        "driver_name", "driver_number", "driver_nationality", "team",
        # result attributes
        "grid", "fastest_lap", "race_time", "points", "position",
        "result_file_date",
    )
    .withColumn("created_date", current_timestamp())
    .withColumnRenamed("result_file_date", "file_date")
)

In [0]:
# display(final_df)

##### Step 6 - Write final data to the presentation container

In [0]:
# Condition handed to the merge helper: a target (tgt) row and a source (src) row are
# treated as the same record when both driver_name and race_id are equal.
# NOTE(review): matching on driver_name instead of an id assumes driver names are unique
# within a race — TODO confirm.
# NOTE(review): merge_delta_deta is not defined in this notebook (presumably it comes from
# ../includes/common_functions). The name looks like a misspelling of "merge_delta_data";
# confirm against the helper before renaming. The last argument ("race_id") is presumably
# the partition column — verify against the helper's signature.
merge_condition = "tgt.driver_name = src.driver_name AND tgt.race_id = src.race_id"
merge_delta_deta(final_df, "f1_presentation", "race_results", presentation_folder_path, merge_condition, "race_id")

In [0]:
# %sql
# SELECT race_id, COUNT(1)
# FROM f1_presentation.race_results
# GROUP BY race_id
# ORDER BY race_id DESC;

In [0]:
%sql
-- Inspect f1_presentation.race_results (the database and table names passed to the
-- merge helper in Step 6). This selects every column and row with no filter.
SELECT * FROM f1_presentation.race_results;

In [0]:
# End the notebook run, passing the string "Success" as its exit value (available to a
# caller that started this notebook, e.g. via dbutils.notebook.run).
dbutils.notebook.exit("Success")