## Ingest the 2 JSON files in the qualifying folder

In [0]:
# List the contents of the raw qualifying folder and render the listing.
raw_qualifying_listing = dbutils.fs.ls('/mnt/formulaf1adls/raw/qualifying')
display(raw_qualifying_listing)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Column specification for the qualifying JSON files: (name, type, nullable).
# Only qualifyId is declared non-nullable; q1-q3 are read as plain strings.
qualifying_column_specs = [
    ('qualifyId', IntegerType(), False),
    ('raceId', IntegerType(), True),
    ('driverId', IntegerType(), True),
    ('constructorId', IntegerType(), True),
    ('number', IntegerType(), True),
    ('position', IntegerType(), True),
    ('q1', StringType(), True),
    ('q2', StringType(), True),
    ('q3', StringType(), True),
]

qualifying_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in qualifying_column_specs
    ]
)

# Read every file matching qualifying_split*.json with the explicit schema.
# The multiLine option lets Spark parse JSON records that span several lines.
qualifying_reader = spark.read.schema(qualifying_schema).option('multiLine', True)
qualifying_df = qualifying_reader.json(
    '/mnt/formulaf1adls/raw/qualifying/qualifying_split*.json'
)

# Preview the first five rows of the loaded data.
qualifying_preview_df = qualifying_df.limit(5)
qualifying_preview_df.display()

In [0]:
# Count the rows read from the raw files and show the total.
qualifying_row_count = qualifying_df.count()
display(qualifying_row_count)

In [0]:
from pyspark.sql.functions import col, current_timestamp

# camelCase source columns and the snake_case names they are renamed to.
qualifying_column_renames = (
    ('qualifyId', 'qualify_id'),
    ('raceId', 'race_id'),
    ('driverId', 'driver_id'),
    ('constructorId', 'constructor_id'),
)

# Apply the renames one by one, in the order listed above.
qualifying_final_df = qualifying_df
for source_name, target_name in qualifying_column_renames:
    qualifying_final_df = qualifying_final_df.withColumnRenamed(source_name, target_name)

# Add a column holding the current timestamp as the ingestion date.
qualifying_final_df = qualifying_final_df.withColumn('ingestion_date', current_timestamp())

# Write the result as parquet, overwriting whatever is already at the target path.
qualifying_writer = qualifying_final_df.write.mode('overwrite')
qualifying_writer.parquet('/mnt/formulaf1adls/processed/qualifying')

In [0]:
# Read the processed parquet output back and render it to check the write.
processed_qualifying_df = spark.read.parquet('/mnt/formulaf1adls/processed/qualifying')
display(processed_qualifying_df)