In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Text widget `p_data_source` (empty by default); its value is stamped onto every
# row as the `data_source` column further down in this notebook.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Text widget `p_file_date` (default "2021-03-21"); the value picks the raw
# sub-folder that is read and the slice of the target table that is replaced.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

####Ingest lap_times file

######Step 1 - Read the CSV files using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Explicit schema applied when reading the lap_times CSV files.
# Only race_id is declared non-nullable; every other column may be null.
lap_times_schema = StructType([
    StructField("race_id", IntegerType(), nullable=False),
    StructField("driver_id", IntegerType(), nullable=True),
    StructField("lap", IntegerType(), nullable=True),
    StructField("position", IntegerType(), nullable=True),
    StructField("time", StringType(), nullable=True),
    StructField("milliseconds", IntegerType(), nullable=True),
])


In [0]:
# Load every lap_times_split*.csv file found under the folder for this file date,
# applying the explicit schema instead of inferring one.
lap_times_df = (
    spark.read
    .schema(lap_times_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/lap_times/lap_times_split*.csv")
)

In [0]:
# display(lap_times_df)

######Step 2 - Rename columns and add new columns
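1. Rename driverId and raceId to driver_id and race_id (the schema above already uses these names)
1. Add data_source and v_file_date columns from the notebook parameters
1. Add ingestion_date with current timestamp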

In [0]:
from pyspark.sql.functions import lit

In [0]:
# No renames are needed here: lap_times_schema already names the key columns
# race_id / driver_id, so renaming "raceId" / "driverId" would match nothing
# (withColumnRenamed is a silent no-op for a missing column).
# Only the two audit columns taken from the notebook parameters are added.
with_column_rename_df = (
    lap_times_df
    .withColumn("data_source", lit(v_data_source))
    .withColumn("v_file_date", lit(v_file_date))
)

In [0]:
# add_ingestion_date is not defined in this notebook (presumably it comes from the
# %run includes at the top); its result is the frame that gets written out.
final_df = add_ingestion_date(with_column_rename_df)

In [0]:
# display(final_df)

######Step 3 - Write the output to the f1_processed.lap_times Delta table

In [0]:
# final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/lap_times")

In [0]:
# final_df.write.mode("overwrite").format("delta").saveAsTable("f1_processed.lap_times")

In [0]:
from datetime import datetime

# v_file_date comes from a notebook widget and is interpolated into SQL below,
# so reject anything that is not a plain YYYY-MM-DD date before using it.
try:
    datetime.strptime(v_file_date, "%Y-%m-%d")
except ValueError as exc:
    raise ValueError(
        f"p_file_date must be a YYYY-MM-DD date, got {v_file_date!r}"
    ) from exc

target_table = "f1_processed.lap_times"

# True when the target table is already registered in the f1_processed database.
table_exists = (
    spark.sql("SHOW TABLES IN f1_processed")
    .filter("tableName = 'lap_times'")
    .count() > 0
)

if table_exists:
    # Re-running the same file date must not duplicate rows: drop that slice first.
    # NOTE(review): this DELETE and the append below are two separate commits; if the
    # append fails the slice stays deleted. Consider Delta's `replaceWhere` overwrite
    # for an atomic swap - confirm it is supported on the runtime in use.
    spark.sql(f"DELETE FROM {target_table} WHERE v_file_date = '{v_file_date}'")

# The first load creates the table; later loads append the freshly read slice.
write_mode = "append" if table_exists else "overwrite"
(
    final_df.write
    .mode(write_mode)
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable(target_table)
)


In [0]:
# display(spark.read.parquet(f"{processed_folder_path}/lap_times"))

In [0]:
# %sql
# drop table f1_processed.lap_times;

In [0]:
%sql
-- Spot-check the load: show a single row, taken from the lowest v_file_date.
SELECT *
FROM f1_processed.lap_times
ORDER BY v_file_date
LIMIT 1;

In [0]:
# End the notebook run, handing back the exit value "success".
dbutils.notebook.exit("success")