## Ingest drivers.json file

In [0]:
# List the contents of the raw mount directory so the available input files can be inspected.
display(dbutils.fs.ls("/mnt/formulaf1adls/raw/"))

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DoubleType

# Nested struct used for the "name" field of a driver record:
# two nullable string parts, forename and surname.
name_schema = (
    StructType()
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
)

# Explicit schema applied when reading drivers.json.
# "driverId" is the only field declared non-nullable; "name" is the nested
# struct defined above and keeps the default nullability.
drivers_schema = (
    StructType()
    .add("driverId", IntegerType(), False)
    .add("driverRef", StringType(), True)
    .add("number", IntegerType(), True)
    .add("code", StringType(), True)
    .add("name", name_schema)
    .add("dob", DateType(), True)
    .add("nationality", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load drivers.json from the raw mount, applying the explicit drivers_schema
# instead of letting Spark infer column types.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json("/mnt/formulaf1adls/raw/drivers.json")
)

# Preview only the first 10 rows rather than the whole DataFrame.
drivers_df.limit(10).display()

In [0]:
# printSchema() prints the schema tree itself and returns None, so it is called
# directly instead of being wrapped in display(), which would only receive None.
drivers_df.printSchema()

In [0]:
from pyspark.sql.functions import col, lit, concat, current_timestamp

# Reshape the raw drivers DataFrame:
#   * rename the camelCase id/ref columns to snake_case,
#   * add an ingestion_date column holding the current timestamp,
#   * replace the nested "name" struct with a single "forename surname" string.
# NOTE(review): concat is expected to yield NULL when either name part is NULL —
# confirm that is acceptable for this data.
drivers_with_columns_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
)

In [0]:
# Preview the first 10 rows of the transformed DataFrame.
drivers_with_columns_df.limit(10).display()

In [0]:
# Remove the "url" column, which is not kept in the processed output.
drivers_final_df = drivers_with_columns_df.drop("url")

# Persist the result as Parquet in the processed area; "overwrite" mode
# replaces whatever a previous run wrote to this path.
(
    drivers_final_df.write
    .mode("overwrite")
    .parquet("/mnt/formulaf1adls/processed/drivers")
)

In [0]:
# Read the processed drivers Parquet data back from storage and display it.
display(spark.read.parquet('/mnt/formulaf1adls/processed/drivers'))