# Ingest races file into file system
- 1. Read the file
- 2. Select only the required columns
- 3. Rename columns (raceId, year, circuitId)
- 4. Transform date and time into race_timestamp
- 5. Add ingestion date
- 6. Write data to the file system

In [0]:
# Notebook parameter: text widget for the data-source label (empty by default).
# Its value is later written into the data_source column.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Notebook parameter: text widget for the file date (default "2021-03-21").
# The value is used as the sub-folder name under raw_folder_path when
# locating races.csv, and is also written into the file_date column.
dbutils.widgets.text("p_file_date","2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Show the current storage mounts before reading from the raw folder.
display(dbutils.fs.mounts())

In [0]:
# List the contents of the raw folder (raw_folder_path presumably comes from
# the included configuration notebook — verify).
display(dbutils.fs.ls(f'{raw_folder_path}'))

In [0]:
# Check that races.csv is present under the folder for the selected file date.
display(dbutils.fs.ls(f'{raw_folder_path}/{v_file_date}/races.csv'))

### Read the file with a header row

In [0]:
# Exploratory read: the first line of the CSV is treated as the header and no
# schema is supplied, so this frame is only used for inspection below.
races_read_df = (
    spark.read
    .format('csv')
    .option('header', True)
    .load(f'{raw_folder_path}/{v_file_date}/races.csv')
)

In [0]:
# Preview the rows from the header-only read (no explicit schema applied yet).
display(races_read_df)

### Fix Schema

In [0]:
# Summary statistics of the header-only read, used to decide the column types.
display(races_read_df.describe())

In [0]:
# Print the column types Spark assigned when no schema was supplied.
races_read_df.printSchema()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType


In [0]:
# Explicit schema for races.csv, one entry per CSV column in file order.
# The third argument of each add() is the nullable flag.
# NOTE(review): Spark may relax nullable=False to nullable when loading from
# files, so raceId's non-null flag may not be enforced on read — confirm.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Re-read the same file, this time with the explicit races_schema so the
# columns are typed instead of being left at the reader's defaults.
races_updated_schema_df = (
    spark.read
    .format('csv')
    .schema(races_schema)
    .option('header', True)
    .load(f'{raw_folder_path}/{v_file_date}/races.csv')
)

In [0]:
# Print the schema of the frame read with races_schema applied.
races_updated_schema_df.printSchema()

### Add extra columns

In [0]:
from pyspark.sql.functions import current_timestamp,to_timestamp,col,lit,concat

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it comes from
# the included common_functions notebook and appends the ingestion_date column
# that the final select relies on — verify against that notebook.
races_ingestion_date_df = add_ingestion_date(races_updated_schema_df)

In [0]:
# Build race_timestamp from the separate date and time columns, then tag every
# row with the data source and file date supplied through the notebook widgets.
# NOTE(review): concat() presumably yields NULL when either input is NULL, so
# rows without a time would get a NULL race_timestamp — confirm this is wanted.
races_select_column_df = (
    races_ingestion_date_df
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col('date'), lit(' '), col('time')),
            'yyyy-MM-dd HH:mm:ss',
        ),
    )
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
)
    
    

In [0]:
# Preview the frame after adding race_timestamp, data_source and file_date.
display(races_select_column_df)

In [0]:
# Print the schema to check the types of the newly added columns.
races_select_column_df.printSchema()

### Update column names

In [0]:
# Show how many columns exist before the final projection narrows them down.
display(len(races_select_column_df.columns))

In [0]:
from pyspark.sql.functions import col

In [0]:
# Project the columns that get persisted, renaming the camelCase source
# columns to snake_case; date, time and url are intentionally left out.
# file_date is kept next to data_source so each stored row records which file
# drop it was loaded from — the column is computed upstream and would
# otherwise be discarded by this select. It is appended last so the positions
# of the existing columns do not change.
races_final_df = races_select_column_df.select(
    col('raceId').alias('race_id'),
    col('year').alias('race_year'),
    col('round'),
    col('circuitId').alias('circuit_id'),
    col('name'),
    col('ingestion_date'),
    col('race_timestamp'),
    col('data_source'),
    col('file_date'),
)

In [0]:
# Print the schema of the final projection to confirm the renamed columns.
races_final_df.printSchema()

In [0]:
# Preview the rows that are about to be written.
display(races_final_df)

### Write data as Parquet to the database

In [0]:
# Show the current storage mounts before writing the output.
display(dbutils.fs.mounts())

In [0]:
# List the processed folder before the write, for comparison with the listing
# taken after it.
display(dbutils.fs.ls(f'{processed_folder_path}'))

In [0]:
# Persist the final frame as the Parquet-backed table f1_processed.races,
# partitioned by race_year. 'overwrite' mode replaces any existing table
# contents on each run.
(
    races_final_df.write
    .format('parquet')
    .partitionBy('race_year')
    .mode('overwrite')
    .saveAsTable('f1_processed.races')
)

In [0]:
# List the processed folder again after the write.
display(dbutils.fs.ls(f'{processed_folder_path}'))

In [0]:
# Read the written Parquet files back from the processed folder as a check.
# NOTE(review): this assumes the f1_processed database stores its tables under
# processed_folder_path — confirm against the database definition.
display(spark.read.format('parquet').load(f'{processed_folder_path}/races'))

In [0]:
# End the notebook run and hand 'Success' back as the exit value.
dbutils.notebook.exit('Success')