### Ingest drivers.json file

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter: name of the upstream data source (empty by default).
# The value is written into the data_source column further down.
dbutils.widgets.text(name="p_data_source", defaultValue="")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter: date of the file drop to ingest (defaults to 2021-03-21).
# It selects the input sub-folder and is written into the file_date column.
dbutils.widgets.text(name="p_file_date", defaultValue="2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

##### Step 1 - Read the JSON file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Schema of the nested "name" object: two nullable string fields.
name_schema = (
    StructType()
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
)

In [0]:
# Explicit schema for the drivers file, passed to the reader instead of relying on inference.
# Only driverId is declared non-nullable; "name" is a nested struct described by name_schema.
drivers_schema = StructType([
    StructField("driverId", IntegerType(), nullable=False),
    StructField("driverRef", StringType(), nullable=True),
    StructField("number", IntegerType(), nullable=True),
    StructField("code", StringType(), nullable=True),
    StructField("name", name_schema),
    StructField("dob", DateType(), nullable=True),
    StructField("nationality", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
])

In [0]:
# Input path: <raw_folder_path>/<file date>/drivers.json.
# raw_folder_path is not defined in this notebook — presumably it comes from the
# included configuration notebook; confirm there.
drivers_json_path = f"{raw_folder_path}/{v_file_date}/drivers.json"

# Read the JSON file with the explicit schema declared above.
drivers_df = spark.read.json(drivers_json_path, schema=drivers_schema)

In [0]:
# drivers_df.printSchema()

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [0]:
# display(drivers_df.limit(5))

driverId,driverRef,number,code,name,dob,nationality,url
1,hamilton,44.0,HAM,"List(Lewis, Hamilton)",1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
2,heidfeld,,HEI,"List(Nick, Heidfeld)",1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
3,rosberg,6.0,ROS,"List(Nico, Rosberg)",1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
4,alonso,14.0,ALO,"List(Fernando, Alonso)",1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
5,kovalainen,,KOV,"List(Heikki, Kovalainen)",1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


##### Step 2 - Rename columns and add new columns
1. driverId renamed to driver_id
1. driverRef renamed to driver_ref
1. ingestion date added
1. name replaced with the concatenation of forename and surname
1. data_source and file_date added from the notebook parameters

In [0]:
from pyspark.sql.functions import col, concat, lit

In [0]:
# Collapse the nested name struct into a single "forename surname" string.
full_name = concat(col("name.forename"), lit(" "), col("name.surname"))

# add_ingestion_date is not defined in this notebook — presumably it comes from the
# included common_functions notebook and adds the ingestion_date column; confirm there.
# The steps run in the same order as before so the resulting column order is unchanged.
drivers_with_columns_df = (
    add_ingestion_date(drivers_df)
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("name", full_name)
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)


##### Step 3 - Drop the unwanted columns

In [0]:
# url is the only column treated as unwanted; every other column is kept.
drivers_final_df = drivers_with_columns_df.drop("url")

##### Step 4 - Write the output to processed container

In [0]:
# Overwrite the f1_processed.drivers Delta table with the contents of drivers_final_df.
(
    drivers_final_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("f1_processed.drivers")
)



In [0]:
%sql
-- Quick check of the table just written: show any five rows.
SELECT *
  FROM f1_processed.drivers
 LIMIT 5;

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date,data_source,file_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2023-12-22T11:00:51.217Z,Ergast,2021-04-18
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2023-12-22T11:00:51.217Z,Ergast,2021-04-18
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2023-12-22T11:00:51.217Z,Ergast,2021-04-18
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,2023-12-22T11:00:51.217Z,Ergast,2021-04-18
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,2023-12-22T11:00:51.217Z,Ergast,2021-04-18


In [0]:
# End the notebook run and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")