In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter p_data_source (empty by default); its value is later stored
# in the data_source column of the ingested rows.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter p_file_date (defaults to "2021-03-21"); it selects the raw
# sub-folder to read and is stored in the file_date column of the ingested rows.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

#### Ingest pit_stops.json file

###### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Explicit schema applied when reading pit_stops.json.
# stop, time and duration are read as strings; the remaining fields are integers.
# Only raceId is declared non-nullable.
pit_stops_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("driverId", IntegerType(), True),
        StructField("stop", StringType(), True),
        StructField("lap", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)


In [0]:
# Read the pit stops file for the requested file_date folder as multi-line JSON,
# applying the explicit schema instead of letting Spark infer one.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiline", True)
    .json(f"{raw_folder_path}/{v_file_date}/pit_stops.json")
)

In [0]:
# display(spark.read.option("multiline", True).json(f"{raw_folder_path}/pit_stops.json"))
# display(pit_stops_df)

###### Step 2 - Rename columns and add new columns
1. Rename driverId and raceId to driver_id and race_id
1. Add data_source and file_date from the notebook parameters
1. Add ingestion_date with the current timestamp

In [0]:
from pyspark.sql.functions import current_date,lit

In [0]:
# Switch the camelCase id columns to snake_case and tag every row with the
# data source and file date passed in through the notebook widgets.
with_column_renamed_df = (
    pit_stops_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it comes from the
# %run includes and appends an ingestion timestamp column - confirm against the helper.
final_df = add_ingestion_date(with_column_renamed_df)

In [0]:
# Print the column names and types of the frame that is about to be written.
final_df.printSchema()

###### Step 3 - Write the output to the f1_processed.pit_stops table in Delta format

In [0]:
# final_df.createOrReplaceTempView("pit_stops_staging")

In [0]:
# Check whether pit_stops is among the tables listed for f1_processed and show the result.
table_exists = (
    spark.sql("SHOW TABLES IN f1_processed")
    .filter("tableName = 'pit_stops'")
    .count()
    > 0
)
print(table_exists)

In [0]:
# Load the rows for one file_date into the Delta table f1_processed.pit_stops.
# Re-running for the same file_date replaces that date's rows instead of duplicating them.
from datetime import date

# v_file_date comes from a notebook widget and is interpolated into the DELETE
# statement below, so only an exact YYYY-MM-DD value is accepted.
# date.fromisoformat raises ValueError on unparsable input; the round-trip
# comparison rejects alternative ISO spellings that would not match stored values.
if date.fromisoformat(v_file_date).isoformat() != v_file_date:
    raise ValueError(f"p_file_date must be formatted as YYYY-MM-DD, got {v_file_date!r}")

table_exists = (
    spark.sql("SHOW TABLES IN f1_processed")
    .filter("tableName = 'pit_stops'")
    .count()
    > 0
)

if table_exists:
    # Delete old data for the current file_date before appending the fresh rows.
    spark.sql(f"DELETE FROM f1_processed.pit_stops WHERE file_date = '{v_file_date}'")

# First load creates the table ("overwrite"); later loads append after the delete above.
(
    final_df.write
    .mode("append" if table_exists else "overwrite")
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable("f1_processed.pit_stops")
)


In [0]:
# final_df.write.mode("append").format("delta").saveAsTable("f1_processed.pit_stops")

In [0]:
# %sql
# drop table f1_processed.pit_stops;

In [0]:
# display(spark.read.parquet(f"{processed_folder_path}/pit_stops"))

In [0]:
%sql
-- Row counts per race for a single file_date in the processed pit_stops table.
-- The predicate is on a plain column, so it belongs in WHERE (rows are filtered
-- before grouping) rather than in HAVING.
-- NOTE(review): the date is hard-coded and independent of the p_file_date widget.
select race_id, file_date, count(1)
from f1_processed.pit_stops
where file_date = '2021-04-18'
group by race_id, file_date;

In [0]:
# End the notebook run and hand the string "success" back as its exit value.
dbutils.notebook.exit("success")