In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DateType, TimestampType
from pyspark.sql.functions import col, current_timestamp, concat, lit
from pyspark.sql.functions import to_timestamp

In [0]:
# Notebook parameter: register a text widget named "p_file_date" (default
# "2021-03-21") and read its current value into v_file_date. Later cells use
# v_file_date to build the input path and to stamp the file_date column.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
# Explicit schema applied when reading races.csv.
# raceId and circuitId are declared non-nullable; every other column is nullable.
# "date" and "time" are kept as plain strings here.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), nullable=False)
    .add("year", IntegerType(), nullable=True)
    .add("round", IntegerType(), nullable=True)
    .add("circuitId", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=True)
    .add("date", StringType(), nullable=True)
    .add("time", StringType(), nullable=True)
    .add("url", StringType(), nullable=True)
)

In [0]:
# Load races.csv for the requested file date, treating the first line as a
# header and applying races_schema.
# NOTE(review): raw_folder_path is not defined in this notebook; presumably it
# comes from the included configuration notebook - confirm.
df_races = spark.read.csv(
    f"{raw_folder_path}/{v_file_date}/races.csv",
    schema=races_schema,
    header=True,
)

In [0]:
# Add two derived columns:
#   ingestion_date - timestamp taken when the load runs
#   race_timestamp - "date" and "time" joined with a single space, then parsed
#                    with the pattern "yyyy-MM-dd HH:mm:ss"
df_races_transformed = (
    df_races
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col("date"), lit(" "), col("time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
)

In [0]:
# Keep only the columns that go into the output table; the raw "date", "time"
# and "url" columns are dropped at this point.
df_races_selected = df_races_transformed.select(
    "raceId",
    "year",
    "round",
    "circuitId",
    "name",
    "ingestion_date",
    "race_timestamp",
)

In [0]:
# Switch the source column names to snake_case and tag every row with the
# file date this run was parameterised with.
df_races_renamed = (
    df_races_selected
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Persist the result as the parquet-backed table f1_processed.races, replacing
# any data already stored there.
df_races_renamed.write.saveAsTable(
    "f1_processed.races",
    format="parquet",
    mode="overwrite",
)