### Ingesting the single-line results.json file

In [0]:
# results_schema = resultId INTEGER, raceId INTEGER, driverId INTEGER, constructorId INTEGER, number INTEGER, grid INTEGER, position INTEGER, positionText STRING, positionOrder INTEGER, points FLOAT, laps INTEGER, time STRING, milliseconds INTEGER, fastestLap INTEGER, rank INTEGER, fastestLapTime STRING, fastestLapSpeed FLOAT, statusId INTEGER


In [0]:
# Declare the notebook parameter p_data_source as a text widget with an empty-string default.
dbutils.widgets.text("p_data_source", "")
# Read the widget's current value; it is stamped onto every row as the data_source column further down.
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, FloatType, DateType, TimestampType

In [0]:
# Explicit read schema for results.json, listed as (source field name, Spark type) pairs
# in the order the fields should appear. Every field is declared nullable.
_results_fields = [
    ("resultId", IntegerType()),
    ("raceId", IntegerType()),
    ("driverId", IntegerType()),
    ("constructorId", IntegerType()),
    ("number", IntegerType()),
    ("grid", IntegerType()),
    ("position", IntegerType()),
    ("positionText", StringType()),
    ("positionOrder", IntegerType()),
    ("points", FloatType()),
    ("laps", IntegerType()),
    ("time", StringType()),
    ("milliseconds", IntegerType()),
    ("fastestLap", IntegerType()),
    ("rank", IntegerType()),
    ("fastestLapTime", StringType()),
    ("fastestLapSpeed", FloatType()),
    ("statusId", IntegerType()),
]

results_schema = StructType(
    fields=[
        StructField(field_name, field_type, True)
        for field_name, field_type in _results_fields
    ]
)

In [0]:
# results_df = spark.read.format("json") \
# .schema(results_schema) \
# .json("/mnt/formula1stg/raw/results.json")

In [0]:
# Load the raw results file from the configured raw folder, applying the explicit
# results_schema so column types are fixed up front.
results_df = (
    spark.read
    .format("json")
    .schema(results_schema)
    .json(f"{raw_folder_path}/results.json")
)

In [0]:
# display(results_df)

In [0]:
from pyspark.sql.functions import current_timestamp, col, lit

In [0]:
# Convert the camelCase source column names to snake_case and tag each row with the
# data source passed in through the p_data_source widget.
results_renamed_df = (
    results_df
    # identifier columns
    .withColumnRenamed("resultId", "result_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    # finishing position columns
    .withColumnRenamed("positionText", "position_text")
    .withColumnRenamed("positionOrder", "position_order")
    # fastest-lap columns
    .withColumnRenamed("fastestLap", "fastest_lap")
    .withColumnRenamed("fastestLapTime", "fastest_lap_time")
    .withColumnRenamed("fastestLapSpeed", "fastest_lap_speed")
    # constant column recording where this load came from
    .withColumn("data_source", lit(v_data_source))
)



In [0]:
# add_ingestion_date is not defined in this notebook; it is expected to come from the
# "../includes/common_functions" notebook pulled in via %run.
# NOTE(review): presumably it returns the input DataFrame with an ingestion timestamp
# column appended; confirm against the helper's definition.
results_with_ingestion_date_df = add_ingestion_date(results_renamed_df)
# display(results_with_ingestion_date_df)

In [0]:
# statusId is not carried into the processed table; it was never renamed, so drop it
# by its original source name.
results_df_drop = results_with_ingestion_date_df.drop("statusId")

In [0]:
# display(results_df_drop)

In [0]:
# results_df_drop.write.mode("overwrite").partitionBy("race_id").parquet(f"{processed_folder_path}/results")

In [0]:
# Persist the cleaned results as a parquet-backed table in the f1_processed database
# (created separately with Spark SQL, Section 18). "overwrite" replaces any existing
# contents of the table on each run.
(
    results_df_drop.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("f1_processed.results")
)

In [0]:
%sql
-- SELECT * FROM f1_processed.results;

In [0]:
# Sanity check: show what the write above actually produced by reading the table itself.
# Reading f"{processed_folder_path}/results" instead only works when the f1_processed
# database happens to be located at that folder, and it could surface stale files left
# behind by an earlier path-based parquet write.
display(spark.read.table("f1_processed.results"))

In [0]:
# End the notebook run and hand the string "Success" back as its exit value, so a caller
# that launched this notebook can check the outcome.
dbutils.notebook.exit("Success")