## Ingest lap_times from the bronze table into the silver table

In [0]:
# Declare the 'p_data_source' text widget (empty default) and read its current
# value; v_data_source is later written into the 'data_source' column.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Declare the 'p_file_date' text widget (default '2021-03-28') and read its
# current value.
# NOTE(review): v_file_date is not referenced by any other cell visible in this
# notebook — confirm whether the parameter is still needed.
dbutils.widgets.text('p_file_date', '2021-03-28')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#### Step 1: Define the lap_times schema and read the bronze lap_times table

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Column layout of a lap_times record: one row per (raceId, driverId, lap).
# Only raceId is declared non-nullable.
# NOTE(review): this schema is not passed to the table read in this notebook —
# confirm whether it is still required.
lap_times_schema = StructType([
    StructField('raceId', IntegerType(), nullable=False),
    StructField('driverId', IntegerType(), nullable=True),
    StructField('lap', IntegerType(), nullable=True),
    StructField('position', IntegerType(), nullable=True),
    StructField('time', StringType(), nullable=True),
    StructField('milliseconds', IntegerType(), nullable=True),
])


In [0]:
# Load the bronze-layer lap_times table as a DataFrame.
lap_times_df = spark.table('formula1.bronze.lap_times')

#### Step 2: Rename columns

In [0]:

# Rename the camelCase id columns to snake_case; all other columns pass through.
lap_times_renamed_df = (
    lap_times_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
)


#### Step 3: Add data source and ingestion date

In [0]:
from pyspark.sql.functions import lit

# Stamp every row with the 'p_data_source' widget value, then hand the result
# to add_ingestion_date (defined outside this notebook — presumably by the
# %run includes above; verify).
lap_times_final_df = add_ingestion_date(
    lap_times_renamed_df.withColumn('data_source', lit(v_data_source))
)

#### Step 4: Merge the data into the silver lap_times table

In [0]:
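# Previous load path, kept for reference only (superseded by the
# merge_delta_data call in the next cell):
# incremental_load(lap_times_final_df, 'f1_processed', 'lap_times', 'race_id')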

In [0]:
# A target row (tgt) matches a source row (src) when lap, driver and race all
# agree. The pieces below concatenate into a single condition string.
merge_condition = (
    "tgt.lap = src.lap "
    "AND tgt.driver_id = src.driver_id "
    "AND tgt.race_id = src.race_id"
)
partition_column = 'race_id'

# merge_delta_data is defined outside this notebook (presumably by the %run
# includes above; verify its contract there). It receives the final DataFrame,
# the names 'formula1' / 'silver' / 'lap_times', the match condition and the
# partition column.
merge_delta_data(
    lap_times_final_df,
    'formula1',
    'silver',
    'lap_times',
    merge_condition,
    partition_column,
)

In [0]:
# End the notebook run, handing the string "Success" back as its exit value.
dbutils.notebook.exit("Success")