# Ingest Drivers File
1. Define the schema
2. Read the JSON file
3. Rename columns and replace the nested name column with a single concatenated name
4. Drop the url column
5. Write the DataFrame in parquet format to the data lake

In [0]:
# Show the built-in help for dbutils.widgets (exploratory; lists the widget commands).
dbutils.widgets.help()

In [0]:
# Declare the data-source text widget (empty default) and capture its current value.
data_source_widget = 'p_data_source'
dbutils.widgets.text(data_source_widget, '')
v_data_source = dbutils.widgets.get(data_source_widget)

In [0]:
# Declare the file-date text widget (default "2021-03-21") and capture its current value;
# the value selects which dated sub-folder of the raw data is read further down.
file_date_widget = "p_file_date"
dbutils.widgets.text(file_date_widget, "2021-03-21")
v_file_date = dbutils.widgets.get(file_date_widget)

In [0]:
%run "../includes/common_functions"


In [0]:
%run "../includes/configuration"


### Explore the raw data and define the schema

In [0]:
# List the current mount points (exploratory check before reading the raw data).
mount_points = dbutils.fs.mounts()
display(mount_points)

In [0]:
# Show the contents of the raw folder. raw_folder_path is not defined in this notebook
# (presumably set by ../includes/configuration -- verify).
raw_folder_listing = dbutils.fs.ls(f'{raw_folder_path}')
display(raw_folder_listing)

In [0]:
# Exploratory read with no explicit schema, so Spark infers the column types from the file.
drivers_json_path = f'{raw_folder_path}/{v_file_date}/drivers.json'
driver_raw_df = spark.read.json(drivers_json_path)

In [0]:
# Preview the schema-inferred drivers data.
display(driver_raw_df)

In [0]:
# Print the inferred schema. printSchema() writes the schema to stdout and returns None,
# so it is called directly instead of being wrapped in display().
driver_raw_df.printSchema()

In [0]:
# Summary statistics of the schema-inferred data (exploratory).
raw_summary_df = driver_raw_df.describe()
display(raw_summary_df)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Schema of the nested "name" object: two nullable string fields.
name_schema = StructType(
    [StructField(part, StringType(), True) for part in ('forename', 'surname')]
)

In [0]:
# Explicit schema for drivers.json, built field by field with StructType.add().
# Only driverId is declared non-nullable; "name" uses the nested name_schema struct.
drivers_schema = (
    StructType()
    .add('driverId', IntegerType(), False)
    .add('driverRef', StringType(), True)
    .add('number', IntegerType(), True)
    .add('code', StringType(), True)
    .add('name', name_schema, True)
    .add('dob', DateType(), True)
    .add('nationality', StringType(), True)
    .add('url', StringType(), True)
)

### Read the JSON file with the explicit schema


In [0]:
# Re-read the same file, this time applying drivers_schema instead of inferring types.
drivers_json_path = f'{raw_folder_path}/{v_file_date}/drivers.json'
drivers_reader = spark.read.schema(drivers_schema)
drivers_read_df = drivers_reader.json(drivers_json_path)

In [0]:
# Preview the data read with the explicit schema.
display(drivers_read_df)

In [0]:
# Print the schema of the DataFrame read with drivers_schema.
drivers_read_df.printSchema()

### Rename columns and replace the nested name column

In [0]:
from pyspark.sql.functions import col, lit, concat, current_timestamp

In [0]:
# Snake-case the id/ref columns, collapse the nested name struct into a single
# "forename surname" string, and tag every row with the widget-supplied data source
# and file date.
full_name = concat(col('name.forename'), lit(' '), col('name.surname'))
drivers_renamed_df = (
    drivers_read_df
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('driverRef', 'driver_ref')
    .withColumn('name', full_name)
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
)

In [0]:
# Preview the renamed / enriched DataFrame.
display(drivers_renamed_df)

In [0]:
# Print the schema after the renames and added columns.
drivers_renamed_df.printSchema()

In [0]:
# Pass the DataFrame through add_ingestion_date(). That helper is not defined in this
# notebook (presumably provided by ../includes/common_functions -- verify what it adds).
drivers_ingestion_added_df = add_ingestion_date(drivers_renamed_df)

### Drop the url column

In [0]:
# Remove the url column to produce the final DataFrame.
drivers_df = drivers_ingestion_added_df.drop('url')

In [0]:
# Print the schema of the final DataFrame (after dropping url).
drivers_df.printSchema()

In [0]:
# Preview the final DataFrame before it is written.
display(drivers_df)

### Write the output in parquet format to ADLS (as table f1_processed.drivers)

In [0]:
# Persist the result as the f1_processed.drivers table in parquet format,
# overwriting whatever the table held before.
drivers_writer = drivers_df.write.format('parquet').mode('overwrite')
drivers_writer.saveAsTable('f1_processed.drivers')

In [0]:
%sql
-- Read back the table written by the previous cell to check the result.
SELECT * FROM f1_processed.drivers

In [0]:
# End the notebook run, returning 'Success' as its exit value.
dbutils.notebook.exit('Success')