In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare the "p_data_source" text widget (empty default) and read its current
# value; the value is stamped onto every row as the data_source column below.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

#### Ingest the results.json file


###### Step 1 - Read the JSON file using the Spark DataFrame API

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType ,FloatType

In [0]:
# Explicit schema for results.json, applied on read instead of schema inference.
# raceId is declared as an integer, like the other *Id fields, so the renamed
# race_id column is written as a numeric key rather than a string.
results_schema = StructType(fields=[
    StructField("resultId", IntegerType(), False),
    StructField("raceId", IntegerType(), True),
    StructField("driverId", IntegerType(), True),
    StructField("constructorId", IntegerType(), True),
    StructField("number", IntegerType(), True),
    StructField("grid", IntegerType(), True),
    StructField("position", IntegerType(), True),
    StructField("positionText", StringType(), True),
    StructField("positionOrder", IntegerType(), True),
    StructField("points", FloatType(), True),
    StructField("laps", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
    StructField("fastestLap", IntegerType(), True),
    StructField("rank", IntegerType(), True),
    StructField("fastestLapTime", StringType(), True),
    StructField("fastestLapSpeed", FloatType(), True),
    StructField("statusId", IntegerType(), True),
])

In [0]:
# Read results.json from the raw folder using the explicit schema defined above.
results_df = (
    spark.read
    .schema(results_schema)
    .json(f"{raw_folder_path}/results.json")
)

###### Step 2 - Rename columns and add new columns

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# Convert the camelCase source column names to snake_case and tag every row
# with the data source passed in through the notebook widget.
results_with_columns_df = (
    results_df
    .withColumnRenamed("resultId", "result_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("positionText", "position_text")
    .withColumnRenamed("positionOrder", "position_order")
    .withColumnRenamed("fastestLap", "fastest_lap")
    .withColumnRenamed("fastestLapTime", "fastest_lap_time")
    .withColumnRenamed("fastestLapSpeed", "fastest_lap_speed")
    .withColumn("data_source", lit(v_data_source))
)

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it comes from
# the common_functions include and appends an ingestion timestamp column
# (TODO confirm against that notebook).
results_with_ingestion_df = add_ingestion_date(results_with_columns_df)

###### Step 3 - Drop the unwanted column

In [0]:
from pyspark.sql.functions import col

In [0]:
# statusId is not part of the processed output, so remove it by column name.
results_final_df = results_with_ingestion_df.drop("statusId")

###### Step 4 - Write the output to the processed container in Parquet format

In [0]:
# Replace any existing output under <processed_folder_path>/results with the
# freshly built DataFrame, stored as parquet.
(
    results_final_df.write
    .mode("overwrite")
    .parquet(f"{processed_folder_path}/results")
)

In [0]:
# Read the written parquet output back and render it to verify the write.
display(
    spark.read.parquet(f"{processed_folder_path}/results")
)

In [0]:
# End the notebook run and return "success" as its exit value.
dbutils.notebook.exit("success")