In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter: name of the upstream data source (empty string by default).
# The value is read back immediately so later cells can use it as a plain string.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter: date of the raw drop to ingest (defaults to "2021-03-21").
# It selects the dated raw folder to read and is stored on every output row.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

#### Ingest qualifying JSON file

###### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Explicit schema for the qualifying JSON records, so Spark does not have to
# infer types. Each entry is (source column name, Spark type, nullable flag);
# only the qualifyId key is declared non-nullable.
qualifying_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in [
            ("qualifyId", IntegerType(), False),
            ("raceId", IntegerType(), True),
            ("driverId", IntegerType(), True),
            ("constructorId", IntegerType(), True),
            ("number", IntegerType(), True),
            ("position", IntegerType(), True),
            ("q1", StringType(), True),
            ("q2", StringType(), True),
            ("q3", StringType(), True),
        ]
    ]
)


In [0]:
# Read everything under the dated qualifying folder with the explicit schema.
# The "multiline" option allows a single JSON record to span several lines.
# raw_folder_path is not defined in this notebook — presumably it comes from
# the ../includes/configuration include; confirm there.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiline", True)
    .json(f"{raw_folder_path}/{v_file_date}/qualifying/")
)

###### Step 2 - Rename columns and add new columns
1. Rename qualifyId, driverId, raceId, constructorId
1. Add data_source and file_date from the notebook parameters
1. Add ingestion_date with current timestamp

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Rename the camelCase source columns to snake_case and tag every row with the
# data source and file date supplied through the notebook widgets.
#
# withColumnRenamed is a silent no-op when the source column does not exist,
# so the first name must match the schema field exactly: "qualifyId". The
# earlier spelling "qualitfyId" left the key column unrenamed.
#
# NOTE(review): if f1_processed.qualifying was already created with the
# unrenamed "qualifyId" column, rebuild that table so old and new rows share
# one key column.
with_column_renamed_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Produce the frame that gets written out by passing the renamed data through
# add_ingestion_date. That helper is not defined in this notebook — presumably
# it comes from ../includes/common_functions and adds the ingestion_date
# column; confirm there.
final_df = add_ingestion_date(with_column_renamed_df)

###### Step 3 - Write the output to the processed container in Delta format

In [0]:
# def re_arrange_partition_column(input_df,partition_column):
#     column_list= []
#     for column in input_df.schema.names:
#         if column != partition_column:
#             column_list.append(column)
#     column_list.append(partition_column)
#     output_df=input_df.select(column_list)
#     return output_df

In [0]:
# final_df=re_arrange_partition_column(final_df,'race_id')

In [0]:
# spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")
# if (spark._jsparkSession.catalog().tableExists("f1_processed.results")):
#     final_df.write.mode("overwrite").insertInto("f1_processed.results")
# else:
    # final_df.write.mode("overwrite").partitionBy('race_id').format("delta").saveAsTable("f1_processed.results")

In [0]:
# final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/qualifying")

In [0]:
# Intentionally no write in this cell. The following cell performs the load
# into f1_processed.qualifying: it creates the table when it is missing and
# otherwise deletes the rows for this file_date before appending. An extra
# unconditional append here wrote the same rows a second time, only for them
# to be deleted and rewritten, and it could fail on a schema change because
# it did not set mergeSchema.

In [0]:
# Idempotent load of the current file_date into the Delta table
# f1_processed.qualifying: create the table on the first run, otherwise
# replace only the rows that belong to this file_date so re-runs do not
# duplicate data.
from datetime import datetime

# v_file_date comes from a widget and is interpolated into the DELETE
# statement below. Reject anything that is not a plain YYYY-MM-DD date before
# building SQL with it (strptime raises ValueError on any other input).
datetime.strptime(v_file_date, "%Y-%m-%d")

table_exists = spark.sql("SHOW TABLES IN f1_processed").filter("tableName = 'qualifying'").count() > 0

if not table_exists:
    # First load: create the table from this batch.
    final_df.write \
        .mode("overwrite") \
        .format("delta") \
        .option("mergeSchema", "true") \
        .saveAsTable("f1_processed.qualifying")
else:
    # Re-load: remove any rows previously written for this file_date ...
    spark.sql(f"DELETE FROM f1_processed.qualifying WHERE file_date = '{v_file_date}' ")
    # ... then append the fresh batch, letting Delta merge in new columns.
    final_df.write \
        .mode("append") \
        .format("delta") \
        .option("mergeSchema", "true") \
        .saveAsTable("f1_processed.qualifying")


In [0]:
# display(spark.read.parquet(f"{processed_folder_path}/qualifying"))

In [0]:
# %sql
# drop table f1_processed.qualifying ;

In [0]:
%sql
-- Preview the loaded table to confirm the write above.
SELECT * FROM f1_processed.qualifying;

In [0]:
# End the notebook run, passing the string "success" to dbutils.notebook.exit
# (presumably returned to a calling notebook or job — confirm against the caller).
dbutils.notebook.exit("success")