## Ingesting races.csv into the data lake

In [0]:
# Declare the text widget "p_data_source" (empty default) and read its current
# value; v_data_source is later written into the data_source column.
data_source_widget = "p_data_source"
dbutils.widgets.text(data_source_widget, "")
v_data_source = dbutils.widgets.get(data_source_widget)

In [0]:
# Declare the text widget "p_file_date" (default "2021-03-21") and read its
# current value; v_file_date selects the raw sub-folder to read and is later
# written into the file_date column.
file_date_widget = "p_file_date"
dbutils.widgets.text(file_date_widget, "2021-03-21")
v_file_date = dbutils.widgets.get(file_date_widget)

In [0]:
# races_df = spark.read.option("header", True).csv("dbfs:/mnt/formula1stg/raw/races.csv")

# display(races_df)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema applied when reading races.csv. Only raceId is declared
# non-nullable; date and time are kept as strings and combined into a
# timestamp in a later cell.
races_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), nullable=False),
        StructField("year", IntegerType(), nullable=True),
        StructField("round", IntegerType(), nullable=True),
        StructField("circuitId", IntegerType(), nullable=True),
        StructField("name", StringType(), nullable=True),
        StructField("date", StringType(), nullable=True),
        StructField("time", StringType(), nullable=True),
        StructField("url", StringType(), nullable=True),
    ]
)

# races_df = spark.read \
#                 .option("header", True) \
#                 .schema(races_schema) \
#                 .csv("dbfs:/mnt/formula1stg/raw/races.csv")


In [0]:
# display(races_df)

In [0]:
# Read races.csv from the raw folder for the requested file date, treating the
# first line as a header and applying races_schema.
# raw_folder_path is not defined in this notebook — presumably it comes from
# the ../includes/configuration notebook run earlier; confirm.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/races.csv")
)

#### Step 2 - Select only the required columns

In [0]:
from pyspark.sql.functions import col

In [0]:
# Project the source columns by name (all eight columns declared in
# races_schema are kept at this stage).
races_selected_df = races_df.select(
    "raceId",
    "year",
    "round",
    "circuitId",
    "name",
    "date",
    "time",
    "url",
)

#display(races_selected_df)


In [0]:
from pyspark.sql.functions import concat, lit, to_timestamp, current_timestamp, col, when

In [0]:

# Derive race_timestamp from the string columns date and time.
# A missing time (NULL or the literal \N placeholder) means only the date is
# parsed; otherwise date and time are joined with a space and parsed together.
time_is_missing = col("time").isNull() | (col("time") == "\\N")
date_only_ts = to_timestamp(col("date"), "yyyy-MM-dd")
date_and_time_ts = to_timestamp(
    concat(col("date"), lit(" "), col("time")),
    "yyyy-MM-dd HH:mm:ss",
)

# add_ingestion_date is not defined in this notebook — presumably it comes from
# ../includes/common_functions and adds the ingestion_date column selected
# later; confirm.
races_with_ingestion_df = add_ingestion_date(races_selected_df)

races_with_timestamp_df = races_with_ingestion_df.withColumn(
    "race_timestamp",
    when(time_is_missing, date_only_ts).otherwise(date_and_time_ts),
)

In [0]:
# Preview the DataFrame that now carries the derived race_timestamp column.
# NOTE(review): the other preview calls in this notebook are commented out;
# consider doing the same here if the notebook runs unattended.
display(races_with_timestamp_df)

In [0]:

# Rename the id/year columns to snake_case and stamp every row with the two
# widget values (data source and file date) as literal columns.
races_renamed_df = (
    races_with_timestamp_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Preview the DataFrame after the renames and the added data_source/file_date columns.
# NOTE(review): the other preview calls in this notebook are commented out;
# consider doing the same here if the notebook runs unattended.
display(races_renamed_df)

In [0]:
# Final projection for the output table: the raw date, time and url columns
# are dropped here because race_timestamp replaces date/time.
races_col_selected_df = races_renamed_df.select(
    "race_id",
    "circuit_id",
    "race_year",
    "round",
    "name",
    "ingestion_date",
    "race_timestamp",
    "data_source",
    "file_date",
)

# display(races_col_selected_df)

## Write the output to the processed container in Parquet format

In [0]:
# races_col_selected_df.write.mode("overwrite").parquet(f"{processed_folder_path}/races")

In [0]:
# races_col_selected_df.write.mode("overwrite").partitionBy("race_year").parquet(f"{processed_folder_path}/races")

In [0]:
# Save the result as the table f1_processed.races in parquet format, replacing
# any existing contents. The f1_processed database was created using Spark SQL
# (Section 18).
(
    races_col_selected_df.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("f1_processed.races")
)

In [0]:
%sql
-- Query the table written in the previous cell to inspect its contents.
SELECT * FROM f1_processed.races;

In [0]:
# Read the races parquet files back from the processed folder and render them.
# NOTE(review): the data was written via saveAsTable("f1_processed.races"), so
# this path only matches if the f1_processed database is located at
# processed_folder_path — confirm against the database definition.
display(spark.read.parquet(f"{processed_folder_path}/races"))

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")