## Ingest the multiple (5) CSV files in the lap_times folder

In [0]:
# Declare a text widget so the data source can be passed in as a notebook
# parameter (empty string when not supplied), then read its current value.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configurations"

In [0]:
%run "../includes/common_functions"

In [0]:
# List the contents of the raw lap_times folder to see which files are present.
# raw_folder_path is not defined in this notebook; presumably it comes from
# the included configurations notebook.
display(dbutils.fs.ls(f'{raw_folder_path}/lap_times'))

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DoubleType

# Explicit schema for the lap_times CSV files, applied on read instead of
# letting Spark infer column types.
lap_times_schema = StructType([
    StructField("raceId", IntegerType(), nullable=False),
    StructField("driverId", IntegerType(), nullable=True),
    StructField("lap", IntegerType(), nullable=True),
    StructField("position", IntegerType(), nullable=True),
    StructField("time", StringType(), nullable=True),
    StructField("milliseconds", IntegerType(), nullable=True),
])

# The wildcard picks up every split file in the folder in a single read.
lap_times_df = (
    spark.read
    .schema(lap_times_schema)
    .csv(f"{raw_folder_path}/lap_times/lap_times_split*.csv")
)

# Preview the first ten rows.
lap_times_df.limit(10).display()

In [0]:
# Show the total number of rows read across all matched files.
display(lap_times_df.count())

In [0]:
from pyspark.sql.functions import current_timestamp, lit

# Rename the camelCase id columns to snake_case and tag every row with the
# data source supplied through the notebook widget.
lap_times_renamed_df = (
    lap_times_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
)

# ingestion_date is not defined in this notebook; presumably it comes from the
# included common_functions notebook and adds an ingestion timestamp column
# (confirm there).
lap_times_final_df = ingestion_date(lap_times_renamed_df)

# Preview the first five rows of the final frame.
lap_times_final_df.limit(5).display()

In [0]:
# Persist the ingested lap times as parquet in the processed layer, replacing
# any previous output. Ingested raw data belongs under processed_folder_path
# (not the presentation layer), which is also where this notebook reads the
# lap_times data back from.
lap_times_final_df.write.mode('overwrite').parquet(f'{processed_folder_path}/lap_times')

In [0]:
# Display the lap_times parquet data stored under the processed folder.
display(spark.read.parquet(f"{processed_folder_path}/lap_times"))

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")