### Ingest lap_times folder 

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter: register a text widget named "p_data_source" (default "Ergast API")
# and read its current value. v_data_source is later written to the "data_source" column.
dbutils.widgets.text("p_data_source", "Ergast API")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter: register a text widget named "p_file_date" (default "2021-03-21")
# and read its current value. v_file_date selects the raw sub-folder that is read below
# and is also written to the "file_date" column of the output.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

##### Step 1 - Read the CSV files using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the lap_times CSV files, so Spark does not have to infer types.
# Columns are listed in the order they are applied to the file.
# NOTE(review): the printSchema output recorded in this notebook reports raceId as
# nullable = true, so the nullable=False flag does not appear to survive the CSV read.
# Confirm before relying on it for non-null enforcement.
lap_times_schema = StructType([
    StructField("raceId", IntegerType(), nullable=False),
    StructField("driverId", IntegerType(), nullable=True),
    StructField("lap", IntegerType(), nullable=True),
    StructField("position", IntegerType(), nullable=True),
    StructField("time", StringType(), nullable=True),
    StructField("milliseconds", IntegerType(), nullable=True),
])

In [0]:
# Read the lap_times folder for the requested file date using the explicit schema.
# No reader options are set, so Spark's CSV defaults apply.
lap_times_df = (
    spark.read
    .schema(lap_times_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/lap_times")
)

In [0]:
# lap_times_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- lap: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- milliseconds: integer (nullable = true)



In [0]:
# display(lap_times_df.limit(5))

raceId,driverId,lap,position,time,milliseconds
1053,830,1,1,1:38.603,98603
1053,830,2,1,2:29.163,149163
1053,830,3,1,2:23.247,143247
1053,830,4,1,2:20.332,140332
1053,830,5,1,2:25.691,145691


##### Step 2 - Rename columns and add new columns

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Build the frame that gets written out:
#   1. add_ingestion_date(...) is applied first (helper defined outside this notebook;
#      presumably it appends the ingestion_date column seen in the table preview).
#   2. The camelCase id columns are renamed to snake_case.
#   3. The notebook parameters are attached as constant columns.
# The call order is kept as-is because it determines the output column order.
lap_times_final_df = (
    add_ingestion_date(lap_times_df)
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# lap_times_final_df.count()

1124

##### Step 3 - Write the output to the processed container

In [0]:
# A row is matched on (driver_id, lap, race_id) between the target table (tgt) and the
# incoming rows (src). Adjacent literals are concatenated into a single condition string.
merge_condition = (
    "tgt.driver_id = src.driver_id "
    "AND tgt.lap = src.lap "
    "AND tgt.race_id = src.race_id"
)

# merge_delta_deta is defined outside this notebook (not visible here).
# NOTE(review): the name looks like a misspelling of "merge_delta_data" - verify against
# the definition before renaming. The arguments are presumably: input DataFrame, database
# name, table name, base folder path, merge condition, partition column - TODO confirm.
merge_delta_deta(
    lap_times_final_df,
    "f1_processed",
    "lap_times",
    processed_folder_path,
    merge_condition,
    "race_id",
)

In [0]:
%sql
-- Quick sanity check of the merged table: show any five rows.
-- There is no ORDER BY, so which rows come back is not guaranteed.
SELECT * FROM f1_processed.lap_times 
LIMIT 5;

race_id,driver_id,lap,position,time,milliseconds,ingestion_date,data_source,file_date
65,4,1,6,1:51.259,111259,2023-12-22T10:51:16.587Z,Ergast,2021-03-21
65,4,2,5,1:38.560,98560,2023-12-22T10:51:16.587Z,Ergast,2021-03-21
65,4,3,5,1:37.943,97943,2023-12-22T10:51:16.587Z,Ergast,2021-03-21
65,4,4,4,1:35.339,95339,2023-12-22T10:51:16.587Z,Ergast,2021-03-21
65,4,5,3,1:34.407,94407,2023-12-22T10:51:16.587Z,Ergast,2021-03-21


In [0]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a calling notebook or job can check the outcome.
dbutils.notebook.exit("Success")