## Ingest drivers.json file

In [0]:
# Declare a text widget named "p_data_source" whose default value is an empty
# string, then read its current value into v_data_source. The value is later
# written into the "data_source" column of the drivers DataFrame.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configurations"

In [0]:
%run "../includes/common_functions"

In [0]:
# List the contents of the raw folder so the input files can be checked.
# raw_folder_path is not defined in this notebook; presumably it comes from
# the "../includes/configurations" notebook run above (verify there).
raw_folder_listing = dbutils.fs.ls(f"{raw_folder_path}")
display(raw_folder_listing)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DoubleType

# Schema of the nested "name" struct: two nullable string fields.
name_schema = (
    StructType()
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
)



# Explicit schema for drivers.json. "driverId" is declared non-nullable and
# "name" is the nested struct described by name_schema; every other field is
# a nullable scalar.
drivers_schema = (
    StructType()
    .add("driverId", IntegerType(), False)
    .add("driverRef", StringType(), True)
    .add("number", IntegerType(), True)
    .add("code", StringType(), True)
    .add("name", name_schema)
    .add("dob", DateType(), True)
    .add("nationality", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Read drivers.json from the raw folder using the explicit drivers_schema
# defined above instead of letting Spark infer one.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json(f"{raw_folder_path}/drivers.json")
)

# Preview the first five rows.
display(drivers_df.limit(5))

In [0]:
# printSchema() writes the schema tree to the cell output itself and returns
# None, so it is called directly; wrapping it in display() only passed None
# to display().
drivers_df.printSchema()

In [0]:
from pyspark.sql.functions import col, lit, concat, current_timestamp

# Flatten the nested name struct into a single "forename surname" string.
full_name_col = concat(col('name.forename'), lit(' '), col('name.surname'))

# Rename the camelCase id/ref columns to snake_case, replace the "name" struct
# with the flattened string, and tag every row with the widget-supplied source.
drivers_with_columns_df = (
    drivers_df
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('driverRef', 'driver_ref')
    .withColumn('name', full_name_col)
    .withColumn('data_source', lit(v_data_source))
)

# ingestion_date is not defined in this notebook; presumably it is provided by
# the "../includes/common_functions" notebook run above (verify there). Its
# result is used below as a DataFrame.
drivers_with_ingestion_date_df = ingestion_date(drivers_with_columns_df)


In [0]:
# Preview the first five transformed rows.
display(drivers_with_ingestion_date_df.limit(5))

In [0]:
# Remove the "url" column, which is not kept in the processed output.
drivers_final_df = drivers_with_ingestion_date_df.drop(col('url'))

# Persist the result as the Parquet-backed table f1_processed.drivers,
# replacing any existing contents.
(
    drivers_final_df.write
    .format("parquet")
    .mode('overwrite')
    .saveAsTable("f1_processed.drivers")
)

In [0]:
# Verify the write by reading back the same table that saveAsTable() produced.
# Reading by table name avoids assuming that the f1_processed database is
# located at processed_folder_path, which a path-based read depends on.
display(spark.table("f1_processed.drivers"))

In [0]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a calling notebook or job can check the outcome.
dbutils.notebook.exit("Success")