# Ingest pit_stops file
1. Design the schema
2. Read the file
3. Rename the columns and add the ingestion date
4. Write the output in Parquet format to ADLS


In [0]:
# Notebook parameter `p_data_source` (default: empty string). Its value is
# stamped onto every output row as the `data_source` column further down.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Notebook parameter `p_file_date` (default: "2021-03-21"). It selects the
# sub-folder of raw_folder_path that pit_stops.json is read from, and is also
# stamped onto every output row as the `file_date` column.
dbutils.widgets.text("p_file_date","2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Exploratory: show the storage mount points currently visible to dbutils.
display(dbutils.fs.mounts())

In [0]:
# Exploratory: list the top level of the raw folder.
# raw_folder_path is not defined in this notebook -- presumably it comes from
# the configuration notebook pulled in via %run above (confirm there).
display(dbutils.fs.ls(raw_folder_path))

In [0]:
# Exploratory read: no schema is supplied, so Spark infers one from the file.
# multiLine is enabled so that a single JSON record may span several lines.
pitstops_raw_df = spark.read.json(
    f'{raw_folder_path}/{v_file_date}/pit_stops.json',
    multiLine=True,
)

In [0]:
# Exploratory: preview the rows that were read with the inferred schema.
display(pitstops_raw_df)

In [0]:
# Exploratory: summary statistics of the raw data, as produced by
# DataFrame.describe().
display(pitstops_raw_df.describe())

In [0]:
# Print the schema Spark inferred for the raw file; useful as a reference when
# writing the explicit schema.
pitstops_raw_df.printSchema()

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for pit_stops.json, built field by field.
# Each .add() call takes (name, data type, nullable); raceId is the only field
# declared non-nullable. stop, time and duration are kept as strings.
pitstops_schema = (
    StructType()
    .add('raceId', IntegerType(), False)
    .add('driverId', IntegerType(), True)
    .add('stop', StringType(), True)
    .add('lap', IntegerType(), True)
    .add('time', StringType(), True)
    .add('duration', StringType(), True)
    .add('milliseconds', IntegerType(), True)
)

In [0]:
# Read the same file again, this time applying the explicit schema instead of
# relying on inference. multiLine stays enabled so that a single JSON record
# may span several lines.
pitstops_updated_schema_df = spark.read.json(
    f'{raw_folder_path}/{v_file_date}/pit_stops.json',
    schema=pitstops_schema,
    multiLine=True,
)

In [0]:
# Print the schema of the DataFrame that was read with the explicit schema.
pitstops_updated_schema_df.printSchema()

### Rename the columns and add the ingestion date

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# add_ingestion_date is not defined in this notebook.
# NOTE(review): presumably it comes from the common_functions notebook run
# above and returns the input DataFrame with an ingestion timestamp column
# appended -- confirm against that notebook.
pitstops_ingestion_date_df = add_ingestion_date(pitstops_updated_schema_df)

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Final shape of the data:
#   * rename the camelCase key columns to snake_case
#   * tag every row with the data source and file date passed to the notebook
pitstops_df = (
    pitstops_ingestion_date_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
)

In [0]:
# Print the final schema to check the renamed and added columns before writing.
pitstops_df.printSchema()

In [0]:
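# Disabled: a plain overwrite would replace the whole f1_processed.pit_stops
# table on every run. Kept for reference only; the write is done by the
# overwrite_partition call in the next cell.
# pitstops_df.write.mode('overwrite').format('parquet').saveAsTable('f1_processed.pit_stops')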

In [0]:
# Write the result through the overwrite_partition helper, which is not defined
# in this notebook.
# NOTE(review): presumably the arguments are (DataFrame, database name,
# table name, partition column), i.e. the table f1_processed.pit_stops
# partitioned by race_id -- confirm against the helper's definition.
overwrite_partition(pitstops_df, 'f1_processed', 'pit_stops', 'race_id')

In [0]:
# Post-write check: list the pit_stops folder under processed_folder_path.
# NOTE(review): assumes the table written above is stored at this location --
# confirm against the f1_processed database location.
display(dbutils.fs.ls(f'{processed_folder_path}/pit_stops'))

In [0]:
# Post-write check: read the pit_stops folder back as Parquet and display it.
display(spark.read.parquet(f'{processed_folder_path}/pit_stops'))

In [0]:
# End the notebook run, handing 'Success' back as its exit value.
dbutils.notebook.exit('Success')