## Ingest pit_stops.json file

In [0]:
# Notebook parameter: a text widget (default "") whose value is later written
# to the data_source column of the ingested rows.
data_source_widget = "p_data_source"
dbutils.widgets.text(data_source_widget, "")
v_data_source = dbutils.widgets.get(data_source_widget)

In [0]:
%run "../includes/configurations"

In [0]:
%run "../includes/common_functions"

In [0]:
# pit_stops.json is a multi-line JSON file, not a single-line or nested JSON file.
# List the contents of the raw folder.
raw_folder_listing = dbutils.fs.ls(f'{raw_folder_path}')
display(raw_folder_listing)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Explicit schema for pit_stops.json so the reader does not infer column types.
# Third StructField argument is the nullable flag.
pit_stops_schema = StructType([
    StructField('raceId', IntegerType(), False),
    StructField('driverId', IntegerType(), True),
    StructField('stop', StringType(), True),
    StructField('lap', IntegerType(), True),
    StructField('time', StringType(), True),
    StructField('duration', StringType(), True),
    StructField('milliseconds', IntegerType(), True),
])

# multiLine is enabled because the source file is a multi-line JSON document.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option('multiLine', True)
    .json(f'{raw_folder_path}/pit_stops.json')
)

# Preview the first few rows.
pit_stops_df.limit(5).display()

In [0]:
from pyspark.sql.functions import col, current_timestamp, lit

# Rename the camelCase id columns to snake_case and tag every row with the
# data source supplied through the p_data_source widget.
pit_stops_renamed_df = (
    pit_stops_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumn('data_source', lit(v_data_source))
)

# ingestion_date is not defined in this notebook; presumably it comes from
# ../includes/common_functions and adds an ingestion timestamp column -- verify there.
pit_stops_final_df = ingestion_date(pit_stops_renamed_df)

# Preview the first few rows of the final frame.
pit_stops_final_df.limit(5).display()

In [0]:
# Persist the final frame as the parquet-backed table f1_processed.pit_stops,
# replacing any existing contents.
(
    pit_stops_final_df.write
    .format("parquet")
    .mode('overwrite')
    .saveAsTable("f1_processed.pit_stops")
)

In [0]:
# Verify the load by reading the f1_processed.pit_stops table by name rather
# than by storage path: a path read only works when the f1_processed database
# location happens to equal processed_folder_path, and could otherwise fail or
# show stale files left at that path.
display(spark.read.table("f1_processed.pit_stops"))

In [0]:
# End the notebook run, handing back "Success" as its exit value.
exit_status = "Success"
dbutils.notebook.exit(exit_status)