### Ingest Races File

In [0]:
# Declare the `p_data_source` text widget (default: empty string) and read its
# current value; it is later written to the `data_source` column.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Declare the `p_file_date` text widget (default: '2021-03-21') and read its
# current value; it is later written to the `file_date` column.
dbutils.widgets.text('p_file_date', '2021-03-21')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#### Step 1: Define the races schema and read the bronze races table

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DoubleType, DateType

# Column definitions for the races data; every field is declared non-nullable.
# NOTE(review): this schema is not passed to the table read below — confirm
# whether it is still needed.
races_schema = StructType(
    [
        StructField('raceId', IntegerType(), False),
        StructField('year', IntegerType(), False),
        StructField('round', IntegerType(), False),
        StructField('circuitId', IntegerType(), False),
        StructField('name', StringType(), False),
        StructField('date', DateType(), False),
        StructField('time', StringType(), False),
        StructField('url', StringType(), False),
    ]
)

# Load the `formula1.bronze.races` table as the source DataFrame.
races_df = spark.read.table('formula1.bronze.races')
                            

#### Step 2: Combine date and time columns, and add current timestamp

In [0]:
from pyspark.sql.functions import lit, col, to_timestamp, concat
# Build `race_timestamp` from the `date` and `time` columns, tag each row with
# the data source, pass the result through add_ingestion_date, and finally
# record the file date.
races_df_date_time = (
    add_ingestion_date(
        races_df
        .withColumn(
            'race_timestamp',
            to_timestamp(concat(col('date'), lit(' '), col('time')), 'yyyy-MM-dd HH:mm:ss'),
        )
        .withColumn('data_source', lit(v_data_source))
    )
    .withColumn('file_date', lit(v_file_date))
)

#### Step 3: Rename columns and remove url, date and time columns

In [0]:
# Keep only the output columns, renaming the camelCase source columns to
# snake_case; `url`, `date` and `time` are not selected and so are dropped.
races_final_df = races_df_date_time.select(
    races_df_date_time['raceId'].alias('race_id'),
    races_df_date_time['year'].alias('race_year'),
    races_df_date_time['round'],
    races_df_date_time['circuitId'].alias('circuit_id'),
    races_df_date_time['name'],
    races_df_date_time['race_timestamp'],
    races_df_date_time['ingestion_date'],
    races_df_date_time['data_source'],
    races_df_date_time['file_date'],
)

#### Step 4: Write data to datalake in delta

In [0]:
# Overwrite the `formula1.silver.races` table with the final DataFrame,
# partitioned by `race_year`.
(
    races_final_df.write
    .mode('overwrite')
    .partitionBy('race_year')
    .saveAsTable('formula1.silver.races')
)

In [0]:
# End the notebook run and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")

###