In [0]:
from pyspark.sql.functions import lit, col, current_timestamp, to_timestamp, concat
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

In [0]:
# Column layout for the races CSV, one .add() per column: name, Spark type, nullable flag.
# Only raceId is declared non-nullable; "time" is kept as a plain string.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load the raw races CSV with the explicit races_schema; the first line of the file is a header row.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv("/mnt/sanformula1dl/raw/races.csv")
)

In [0]:
# Sanity check: print the column names, types and nullability of the loaded DataFrame.
races_df.printSchema()

In [0]:
# Rename the camelCase source columns to snake_case and make "year" explicit as "race_year".
# The three renames are independent of each other; all other columns pass through unchanged.
races_renamed_df = (
    races_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)

In [0]:
# Render the renamed DataFrame to check the new column names.
display(races_renamed_df)

In [0]:
# Build the final races table:
#   * race_timestamp - "date" (cast to string) and "time" joined by a single space,
#                      then parsed with the pattern yyyy-MM-dd HH:mm:ss.
#                      concat() yields NULL when either input is NULL, so such rows
#                      end up with a NULL race_timestamp.
#   * ingestion_date - current timestamp at query execution.
#   * data_source    - constant "manual".
# The closing select keeps only the listed columns, so date, time and url are dropped.
races_final_df = (
    races_renamed_df
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col("date").cast(StringType()), lit(" "), col("time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit("manual"))
    .select(
        "race_id",
        "race_year",
        "round",
        "circuit_id",
        "name",
        "race_timestamp",
        "ingestion_date",
        "data_source",
    )
)

In [0]:
# Render the final DataFrame to inspect the derived race_timestamp and the audit columns.
display(races_final_df)

In [0]:
# Persist the final table as Parquet, partitioned by race_year.
# "overwrite" mode replaces whatever already exists at the target path.
(
    races_final_df.write
    .partitionBy("race_year")
    .mode("overwrite")
    .parquet("/mnt/sanformula1dl/processed/races")
)

In [0]:
# List the contents of the output directory (equivalent of the `%fs ls` notebook magic).
display(dbutils.fs.ls("/mnt/sanformula1dl/processed/races"))

In [0]:
# Read the Parquet output back from storage and display it to verify the write.
display(spark.read.parquet("/mnt/sanformula1dl/processed/races"))