### Ingest drivers.json file


In [0]:
# Declare a text widget named `p_data_source` whose default value is an empty string.
dbutils.widgets.text('p_data_source', '')
# Read the widget's current value; it is written into the `data_source` column further down.
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#### Step 1: Read drivers data from raw container with manually provided schema

In [0]:
from pyspark.sql.types import StringType, StructField, StringType, IntegerType, StructType, DateType

In [0]:
# Schema of the nested `name` object: two string fields, both declared non-nullable.
name_struct_schema = StructType(
    [
        StructField('forename', StringType(), False),
        StructField('surname', StringType(), False),
    ]
)

# Schema of one driver record. `number` is the only field declared nullable;
# `name` reuses the nested struct defined above.
drivers_schema = StructType(
    [
        StructField('driverId', IntegerType(), False),
        StructField('driverRef', StringType(), False),
        StructField('number', IntegerType(), True),
        StructField('code', StringType(), False),
        StructField('name', name_struct_schema, False),
        StructField('dob', DateType(), False),
        StructField('nationality', StringType(), False),
        StructField('url', StringType(), False),
    ]
)

In [0]:
# Read drivers.json with the explicit schema rather than letting Spark infer one.
# `raw_folder_path` is not defined in this notebook — presumably it comes from
# the ../includes/configuration notebook run above (confirm).
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json(f'{raw_folder_path}/drivers.json')
)

#### Step 2: Drop URL column

In [0]:
# Discard the `url` column; every other column is carried forward unchanged.
drivers_dropped_df = drivers_df.drop('url')

#### Step 3: Combine forename and surname to form full name and add data source column

In [0]:
from pyspark.sql.functions import lit, concat

# Replace the nested `name` struct with a single "<forename> <surname>" string
# and tag every row with the data source passed in through the notebook widget.
#
# The ingestion date is deliberately NOT added in this cell: the dedicated
# "Add ingestion date" cell later in the notebook already calls
# add_ingestion_date() on the renamed frame, so wrapping this expression in the
# same helper ran it twice.
# NOTE(review): assumes add_ingestion_date() only adds/overwrites a single
# column, so one call gives the same final frame — confirm against
# ../includes/common_functions.
drivers_combined_name_df = (
    drivers_dropped_df
    .withColumn(
        'name',
        concat(
            drivers_dropped_df['name']['forename'],
            lit(' '),
            drivers_dropped_df['name']['surname'],
        ),
    )
    .withColumn('data_source', lit(v_data_source))
)

#### Step 3 (continued): Rename columns

In [0]:
# Switch the camelCase identifier columns from the source file to snake_case.
drivers_renamed_df = (
    drivers_combined_name_df
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('driverRef', 'driver_ref')
)

#### Step 4: Add ingestion date

In [0]:
# Pass the renamed frame through the shared add_ingestion_date() helper.
# NOTE(review): the helper is not defined in this notebook — presumably it comes
# from ../includes/common_functions and adds an ingestion-date column; confirm.
drivers_final_df= add_ingestion_date(drivers_renamed_df)
                    

#### Step 5: Save data to the f1_processed.drivers table in parquet format

In [0]:
# Persist the final frame as the table f1_processed.drivers in parquet format.
# 'overwrite' mode replaces whatever the table held from a previous run.
(
    drivers_final_df.write
    .mode('overwrite')
    .format('parquet')
    .saveAsTable('f1_processed.drivers')
)

In [0]:
%fs
ls /mnt/formula1dl244/processed/drivers

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")