## Ingest Pitstops.json File

In [0]:
# Notebook parameter: declare the text widget 'p_data_source' (default: empty
# string) and read its current value. The value is written to the
# 'data_source' column further down.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Notebook parameter: declare the text widget 'p_file_date' (default:
# '2021-03-28') and read its current value.
# NOTE(review): v_file_date is not referenced by any other cell visible in this
# notebook - confirm whether the included notebooks rely on it.
dbutils.widgets.text('p_file_date', '2021-03-28')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#### Step 1: Define the pit stops schema and read the bronze pit_stops table

In [0]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Explicit schema for the pit stops records. Only 'raceId' is declared
# non-nullable; every other field allows nulls.
# NOTE(review): this schema is not passed to the table read that follows in
# this notebook - confirm whether it is still needed.
pit_stops_schema = StructType(
    [
        StructField('raceId', IntegerType(), nullable=False),
        StructField('driverId', IntegerType(), nullable=True),
        StructField('stop', StringType(), nullable=True),
        StructField('lap', IntegerType(), nullable=True),
        StructField('time', StringType(), nullable=True),
        StructField('duration', StringType(), nullable=True),
        StructField('milliseconds', IntegerType(), nullable=True),
    ]
)


In [0]:
# Load the pit stops rows from the bronze-layer table as a DataFrame.
pit_stops_df = spark.table('formula1.bronze.pit_stops')

#### Step 2: Rename columns

In [0]:
# Convert the camelCase identifier columns to snake_case.
pit_stops_renamed_df = (
    pit_stops_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
)

#### Step 3: Add Data Source and Ingestion Date

In [0]:
from pyspark.sql.functions import lit

# Tag every row with the 'p_data_source' widget value as a literal column, then
# hand the result to add_ingestion_date (a helper that is not defined in this
# notebook; presumably it comes from the included notebooks and appends an
# ingestion timestamp - verify there).
pit_stops_final_df = add_ingestion_date(
    pit_stops_renamed_df.withColumn('data_source', lit(v_data_source))
)

#### Step 4: Merge data into the silver pit_stops table

In [0]:
# Earlier load call, kept for reference only (disabled):
#incremental_load(pit_stops_final_df, 'f1_processed', 'pit_stops', 'race_id')

In [0]:
# Source (src) rows are matched to target (tgt) rows on stop, driver_id and
# race_id; 'race_id' is passed as the partition column.
# merge_delta_data is not defined in this notebook (presumably provided by the
# included notebooks; presumably performs an upsert - verify there).
partition_column = 'race_id'
merge_condition = "tgt.stop = src.stop AND tgt.driver_id = src.driver_id AND tgt.race_id = src.race_id"
merge_delta_data(
    pit_stops_final_df,
    'formula1',
    'silver',
    'pit_stops',
    merge_condition,
    partition_column,
)

In [0]:
# End the notebook run, handing the string "Success" back as its exit value.
dbutils.notebook.exit("Success")