# Ingest lap times CSV files
- 1. Design schema
- 2. Read files
- 3. Rename columns and add the ingestion date, data source and file date columns
- 4. Write the output in Parquet format to ADLS

In [0]:
# Notebook parameter: declare a text widget named 'p_data_source' (empty default)
# and read its current value. The value is later stamped onto every row as the
# 'data_source' column.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Notebook parameter: the file date selects which dated sub-folder of the raw
# area is read, and is also stamped onto every row as the 'file_date' column.
# Defaults to '2021-03-21' when the caller does not supply a value.
dbutils.widgets.text('p_file_date', '2021-03-21')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Sanity check: show the mount points currently known to dbutils.
mount_points = dbutils.fs.mounts()
display(mount_points)

In [0]:
# Sanity check: list the contents of the raw folder.
# raw_folder_path is presumably defined by ../includes/configuration -- confirm.
raw_folder_listing = dbutils.fs.ls(f'{raw_folder_path}')
display(raw_folder_listing)

In [0]:
# Exploratory read of the lap_times folder for the requested file date.
# No schema or reader options are supplied here, so Spark's CSV defaults apply;
# this frame is only used for inspection in the following cells.
raw_lap_times_path = f'{raw_folder_path}/{v_file_date}/lap_times'
ingestion_raw_df = spark.read.csv(raw_lap_times_path)

In [0]:
# Eyeball the rows of the exploratory (schema-less) read.
display(ingestion_raw_df)

In [0]:
# Print the column names and types Spark assigned to the exploratory read.
ingestion_raw_df.printSchema()

In [0]:
# Summary statistics of the exploratory read, to help decide on column types.
raw_summary_df = ingestion_raw_df.describe()
display(raw_summary_df)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the lap_times CSV files, applied positionally by the
# reader. Each entry is (column name, Spark type, nullable); only raceId is
# declared non-nullable.
lap_time_columns = (
    ('raceId', IntegerType(), False),
    ('driverId', IntegerType(), True),
    ('lap', IntegerType(), True),
    ('position', IntegerType(), True),
    ('time', StringType(), True),
    ('milliseconds', IntegerType(), True),
)
ingest_schema = StructType(
    [StructField(col_name, col_type, is_nullable)
     for col_name, col_type, is_nullable in lap_time_columns]
)

In [0]:
# Read every file in the lap_times folder of the requested file date, enforcing
# the explicit schema instead of relying on Spark's defaults.
# (An earlier version read '{raw_folder_path}/lap_times/lap_times_split*.csv'
# directly, without the file-date sub-folder.)
lap_times_source_path = f'{raw_folder_path}/{v_file_date}/lap_times'
lap_times_reader = spark.read.schema(ingest_schema)
ingestion_read_df = lap_times_reader.csv(lap_times_source_path)

In [0]:
# Inspect the rows produced by the schema-enforced read.
display(ingestion_read_df)

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it comes from
# ../includes/common_functions and returns the frame with an ingestion
# timestamp column appended -- confirm against that notebook.
ingestion_date_added_df = add_ingestion_date(ingestion_read_df)

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Step 1: switch the camelCase source identifiers to snake_case.
renamed_df = (
    ingestion_date_added_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
)

# Step 2: tag every row with the notebook parameters so the origin and the
# load date of each record can be traced later.
ingestion_df = (
    renamed_df
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
)

In [0]:
# Inspect the final frame (renamed columns plus data_source and file_date)
# before it is written out.
display(ingestion_df)

In [0]:
# ingestion_df.write.mode('overwrite').format('parquet').saveAsTable('f1_processed.lap_times')

In [0]:
# Write the frame to the f1_processed.lap_times table, keyed on race_id.
# overwrite_partition is not defined in this notebook; presumably it comes from
# ../includes/common_functions and replaces only the race_id partitions present
# in the incoming frame -- confirm against that notebook.
overwrite_partition(ingestion_df, 'f1_processed', 'lap_times', 'race_id')

In [0]:
# Verification: read the written data back from the processed folder and show it.
# NOTE(review): this assumes the f1_processed.lap_times table is stored as
# Parquet under '{processed_folder_path}/lap_times' -- confirm.
processed_lap_times_path = f'{processed_folder_path}/lap_times'
processed_lap_times_df = spark.read.parquet(processed_lap_times_path)
display(processed_lap_times_df)

In [0]:
# End the notebook run and hand the value 'Success' back to whatever invoked it.
dbutils.notebook.exit('Success')