## Ingest results.json file

In [0]:
# Declare the "p_data_source" text widget (empty default) and read its current
# value; the value is later written into the "data_source" column.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configurations"

In [0]:
%run "../includes/common_functions"

In [0]:
# List the contents of the raw folder as a sanity check before reading.
# raw_folder_path is not defined in this notebook — presumably it comes from
# the included configurations notebook (confirm).
display(dbutils.fs.ls(f'{raw_folder_path}'))

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

# Explicit schema for results.json, one (column name, Spark type, nullable)
# entry per field. Only resultId is declared non-nullable.
results_schema = StructType(fields=[
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ("resultId", IntegerType(), False),
        ("raceId", IntegerType(), True),
        ("driverId", IntegerType(), True),
        ("constructorId", IntegerType(), True),
        ("number", IntegerType(), True),
        ("grid", IntegerType(), True),
        ("position", IntegerType(), True),
        ("positionText", StringType(), True),
        ("positionOrder", IntegerType(), True),
        ("points", DoubleType(), True),
        ("laps", IntegerType(), True),
        ("time", StringType(), True),
        ("milliseconds", IntegerType(), True),
        ("fastestLap", IntegerType(), True),
        ("rank", IntegerType(), True),
        ("fastestLapTime", StringType(), True),
        ("fastestLapSpeed", DoubleType(), True),
        ("statusId", StringType(), True),
    )
])

# Read results.json from the raw folder using the schema declared above.
results_df = (
    spark.read
    .schema(results_schema)
    .json(f'{raw_folder_path}/results.json')
)

# Preview the first five rows.
results_df.limit(5).display()

In [0]:
from pyspark.sql.functions import col, lit, current_timestamp

# Rename the camelCase source columns to snake_case, then tag every row with
# the data source value read from the p_data_source widget.
results_with_renamed_df = (
    results_df
    .withColumnRenamed('resultId', 'result_id')
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('constructorId', 'constructor_id')
    .withColumnRenamed('positionText', 'position_text')
    .withColumnRenamed('positionOrder', 'position_order')
    .withColumnRenamed('fastestLap', 'fastest_lap')
    .withColumnRenamed('fastestLapTime', 'fastest_lap_time')
    .withColumnRenamed('fastestLapSpeed', 'fastest_lap_speed')
    .withColumn("data_source", lit(v_data_source))
)

# ingestion_date is not defined in this notebook — presumably it comes from the
# included common_functions notebook and adds an ingestion timestamp (confirm).
results_with_ingestion_date_df = ingestion_date(results_with_renamed_df)

# Preview the first five rows of the transformed frame.
results_with_ingestion_date_df.limit(5).display()

In [0]:
# statusId is dropped before the data is written out.
results_final_df = results_with_ingestion_date_df.drop(col('statusId'))

# Write to the processed folder as parquet, replacing any existing output and
# partitioning the files by race_id.
(
    results_final_df.write
    .mode('overwrite')
    .partitionBy('race_id')
    .parquet(f'{processed_folder_path}/results')
)

In [0]:
# Read the parquet output back from the processed folder and display it to
# verify the write.
display(spark.read.parquet(f'{processed_folder_path}/results'))

In [0]:
# End the notebook run, passing "Success" as the exit value (available to a
# caller that invoked this notebook, e.g. via dbutils.notebook.run).
dbutils.notebook.exit("Success")