In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#### Ingest races.csv file

In [0]:
# Notebook parameter: label for where the data came from (empty string by default).
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter: date-named sub-folder of the raw layer to ingest
# (defaults to "2021-03-21" when the caller passes nothing).
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

##### Step 1 - Read the CSV file using the Spark DataFrame reader


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, FloatType,DateType

In [0]:
# Explicit schema for races.csv so Spark does not have to infer column types.
# raceId is declared non-nullable; every other column is nullable.
# "date" is read as a DateType while "time" is kept as a plain string.
races_schema = StructType([
    StructField("raceId", IntegerType(), nullable=False),
    StructField("year", IntegerType(), nullable=True),
    StructField("round", IntegerType(), nullable=True),
    StructField("circuitId", IntegerType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("date", DateType(), nullable=True),
    StructField("time", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
])


In [0]:
# Read races.csv from the folder for the requested file date, treating the
# first row as a header and applying the explicit schema.
# NOTE(review): raw_folder_path is not defined in this notebook - presumably it
# comes from the "../includes/configuration" notebook; confirm.
races_csv_path = f"{raw_folder_path}/{v_file_date}/races.csv"
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv(races_csv_path)
)

In [0]:
# races_df.printSchema()
# display(races_df)

##### Step 2 - Add ingestion date and race_timestamp


In [0]:
from pyspark.sql.functions import current_timestamp, lit, col,to_timestamp, concat

In [0]:
# NOTE(review): add_ingestion_date is not defined in this notebook - presumably
# it comes from "../includes/common_functions" and adds the "ingestion_date"
# column that is selected further down; confirm against that notebook.
races_with_ingestion_df = add_ingestion_date(races_df)

In [0]:
# Join the "date" and "time" columns with a single space and parse the result
# into one timestamp column, then tag every row with the data source and file
# date that were supplied through the notebook widgets.
race_datetime_text = concat(col("date"), lit(" "), col("time"))
races_with_timestamp_df = (
    races_with_ingestion_df
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"))
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# display(races_with_timestamp_df)

##### Step 3 - Select only the required columns


In [0]:
from pyspark.sql.functions import col

In [0]:
# Keep only the columns needed downstream, renaming the camelCase source
# columns to snake_case. The original "date", "time" and "url" columns are
# dropped; the date and time survive only through "race_timestamp".
races_selected_df = races_with_timestamp_df.select(
    col("raceId").alias("race_id"),
    col("year").alias("race_year"),
    col("round"),
    col("circuitId").alias("circuit_id"),
    col("name"),
    col("ingestion_date"),
    col("race_timestamp"),
    col("data_source"),
    col("file_date"),
)

In [0]:
# display(races_selected_df)

##### Step 4 - Write data to the data lake as a Delta table


In [0]:
# display(dbutils.fs.ls("/mnt/formuladl15/"))
#/mnt/formuladl15/process/

In [0]:
# races_selected_df.write.mode("overwrite").partitionBy('race_year').parquet(f"{processed_folder_path}/races")

In [0]:
# df = spark.read.parquet(f"{processed_folder_path}/races")

# display(df)

In [0]:
# Replace the contents of f1_processed.races with this run's data, stored in
# Delta format and partitioned by race_year.
(
    races_selected_df.write
    .mode("overwrite")
    .partitionBy("race_year")
    .format("delta")
    .saveAsTable("f1_processed.races")
)

In [0]:
%sql
-- Quick visual check: show the rows currently stored in f1_processed.races.
select * from f1_processed.races;

In [0]:
# End the notebook run and hand the string "success" back as its exit value,
# so a caller that launched this notebook can check the outcome.
dbutils.notebook.exit("success")