# Ingest qualifying folder
- 1. Define Schema
- 2. Read the multiple JSON files from the folder
- 3. Rename Columns and add ingestion date
- 4. Write the output to ADLS


In [0]:
# Notebook parameter `p_data_source` (default: empty string). Its value is
# stamped onto every output row as the `data_source` column further down.
data_source_widget = "p_data_source"
dbutils.widgets.text(data_source_widget, "")
v_data_source = dbutils.widgets.get(data_source_widget)

In [0]:
# Notebook parameter `p_file_date` (default: "2021-03-21"). It selects the
# dated sub-folder of the raw folder that this run reads from.
file_date_widget = 'p_file_date'
dbutils.widgets.text(file_date_widget, '2021-03-21')
v_file_date = dbutils.widgets.get(file_date_widget)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Exploration: show the storage mount points visible to this workspace.
display(dbutils.fs.mounts())

In [0]:
# Exploration: list the top level of the raw folder.
raw_root_entries = dbutils.fs.ls(f'{raw_folder_path}')
display(raw_root_entries)

In [0]:
# Exploration: list the qualifying files under the selected file date.
qualifying_source_dir = f'{raw_folder_path}/{v_file_date}/qualifying/'
display(dbutils.fs.ls(qualifying_source_dir))

In [0]:
# First pass: read the multi-line JSON files with no explicit schema, so Spark
# infers one. This frame is only used for the inspection cells that follow.
raw_json_reader = spark.read.option('multiLine', True)
qualifying_raw_df = raw_json_reader.json(f'{raw_folder_path}/{v_file_date}/qualifying/')

In [0]:
# Exploration: preview the schema-inferred qualifying data.
display(qualifying_raw_df)

In [0]:
# Exploration: summary statistics for the schema-inferred data.
raw_summary_df = qualifying_raw_df.describe()
display(raw_summary_df)

In [0]:
# Exploration: how many columns Spark found in the raw files.
raw_column_count = len(qualifying_raw_df.columns)
display(raw_column_count)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Exploration: print the inferred schema as a reference for the explicit
# schema defined in the next cell.
qualifying_raw_df.printSchema()

In [0]:
# Explicit schema for the qualifying JSON files, declared as
# (source column, Spark type, nullable) triples. Only `qualifyId` is declared
# non-nullable; the q1/q2/q3 values are kept as strings.
qualifying_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ('qualifyId', IntegerType(), False),
        ('raceId', IntegerType(), True),
        ('driverId', IntegerType(), True),
        ('constructorId', IntegerType(), True),
        ('number', IntegerType(), True),
        ('position', IntegerType(), True),
        ('q1', StringType(), True),
        ('q2', StringType(), True),
        ('q3', StringType(), True),
    )
])

In [0]:
# Second pass: read the same multi-line JSON files, this time applying the
# explicit schema instead of relying on inference.
qualifying_updated_df = spark.read.json(
    f'{raw_folder_path}/{v_file_date}/qualifying/',
    schema=qualifying_schema,
    multiLine=True,
)

In [0]:
# Preview the data as read with the explicit schema.
display(qualifying_updated_df)

In [0]:
# Print the schema of the frame read with the explicit schema.
qualifying_updated_df.printSchema()

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# add_ingestion_date is not defined in this notebook; it is expected to come
# from one of the %run includes above. Presumably it appends an ingestion
# timestamp column -- TODO confirm against the helper's definition.
qualifying_ingestion_date_df = add_ingestion_date(qualifying_updated_df)

In [0]:
# Rename the camelCase source columns to snake_case, then stamp every row with
# the two notebook parameters (data source and file date) as literal columns.
snake_case_names = {
    'qualifyId': 'qualify_id',
    'raceId': 'race_id',
    'driverId': 'driver_id',
    'constructorId': 'constructor_id',
}

qualifying_df = qualifying_ingestion_date_df
for source_name, target_name in snake_case_names.items():
    qualifying_df = qualifying_df.withColumnRenamed(source_name, target_name)

qualifying_df = qualifying_df.withColumn('data_source', lit(v_data_source))
qualifying_df = qualifying_df.withColumn('file_date', lit(v_file_date))

In [0]:
# Print the schema of the renamed frame to check the new column names and the
# added data_source / file_date columns.
qualifying_df.printSchema()

In [0]:
# Preview the final frame before it is written.
display(qualifying_df)

In [0]:
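# Inactive: plain full-overwrite write, kept for reference only. The write that
# actually runs is the overwrite_partition(...) call in a later cell.
# qualifying_df.write.mode('overwrite').format('parquet').saveAsTable('f1_processed.qualifying')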

In [0]:
# Write step. overwrite_partition is not defined in this notebook; it is
# expected to come from one of the %run includes above. It is called with the
# frame plus 'f1_processed', 'qualifying' and 'race_id' -- presumably the
# database, table and partition column; TODO confirm against the helper.
overwrite_partition(qualifying_df, 'f1_processed', 'qualifying', 'race_id')

In [0]:
# Read back the parquet files under the processed folder's qualifying
# directory and show them.
# NOTE(review): assumes the table written above is stored at this path -- confirm.
processed_qualifying_path = f'{processed_folder_path}/qualifying'
display(spark.read.parquet(processed_qualifying_path))

In [0]:
# End the notebook run and hand 'Success' back as its exit value, which a
# calling notebook or job can read.
dbutils.notebook.exit('Success')