# Ingest the circuits file into the file system
- 1. Read the file
- 2. Select only the required columns
- 3. Rename columns
- 4. Add ingestion date
- 5. Write data to the file system

In [0]:
# Exploratory: print the built-in help for the widgets utility.
dbutils.widgets.help()

In [0]:
# Naming convention: the `p_` prefix marks a notebook parameter (widget) and
# the `v_` prefix marks the variable holding its value.
# Declare a text widget named p_data_source whose default is the empty string.
dbutils.widgets.text('p_data_source','')

# Read the widget's current value; it is stamped onto every row further down
# as the `data_source` column.
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Declare a text widget named p_file_date with default "2021-03-21", then read
# its value. v_file_date is used below both as a sub-folder of the raw path and
# as the literal value of the `file_date` column.
dbutils.widgets.text("p_file_date","2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/common_functions"

In [0]:
%run "../includes/configuration"

### check the mount

In [0]:
# Exploratory check: list the entries at the file-system root.
display(dbutils.fs.ls('/'))

In [0]:
# Exploratory check: list the entries under dbfs:/mnt/.
display(dbutils.fs.ls('dbfs:/mnt/'))

In [0]:
# Exploratory check: show the mount points reported by dbutils.fs.
display(dbutils.fs.mounts())

In [0]:
# List the contents of the raw folder. raw_folder_path is not defined in this
# notebook; presumably it comes from ../includes/configuration (verify).
display(dbutils.fs.ls(f"{raw_folder_path}"))

In [0]:
# First exploratory read: no reader options are set, so Spark's CSV defaults
# apply (the header row is not used for column names). circuit_read_df is
# rebound by the later reads that add a header option and an explicit schema.
circuit_read_df = spark.read.csv(f'{raw_folder_path}/{v_file_date}/circuits.csv')

In [0]:
# Render the current circuit_read_df for visual inspection.
display(circuit_read_df)

In [0]:
# Print rows as text without truncating long cell values.
circuit_read_df.show(truncate = False)

In [0]:
# Second exploratory read: treat the first CSV row as column names.
circuit_read_df = spark.read.csv(
    f'{raw_folder_path}/{v_file_date}/circuits.csv',
    header=True,
)

### Fix the schema and select the required columns

In [0]:
# Print the column names and types of the current circuit_read_df.
circuit_read_df.printSchema()

In [0]:
# Show the summary statistics produced by describe() for inspection.
display(circuit_read_df.describe())

In [0]:
# Show how many columns the current circuit_read_df has.
display(len(circuit_read_df.columns))

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [0]:
# Explicit schema for circuits.csv, declared as (name, type, nullable) rows.
# Only circuitId is declared non-nullable.
circuit_schema = StructType(
    [
        StructField(field_name, field_type, is_nullable)
        for field_name, field_type, is_nullable in (
            ('circuitId', IntegerType(), False),
            ('circuitRef', StringType(), True),
            ('name', StringType(), True),
            ('location', StringType(), True),
            ('country', StringType(), True),
            ('lat', DoubleType(), True),
            ('lng', DoubleType(), True),
            ('alt', IntegerType(), True),
            ('url', StringType(), True),
        )
    ]
)

In [0]:
# Final read: skip the header row and apply the explicit circuit_schema
# instead of relying on the reader's default column types.
circuit_read_df = spark.read.csv(
    f'{raw_folder_path}/{v_file_date}/circuits.csv',
    schema=circuit_schema,
    header=True,
)

In [0]:
# Render the DataFrame read with the explicit schema for inspection.
display(circuit_read_df)

In [0]:
# Print the schema again to confirm the explicit column types were applied.
circuit_read_df.printSchema()

In [0]:
from pyspark.sql.functions import col

In [0]:
# Project the eight columns to keep; the schema's `url` column is left out.
circuit_select_df = circuit_read_df.select(
    *[
        col(column_name)
        for column_name in (
            'circuitId',
            'circuitRef',
            'name',
            'location',
            'country',
            'lat',
            'lng',
            'alt',
        )
    ]
)
    

In [0]:
# Render the projected DataFrame for inspection.
display(circuit_select_df)

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Rename the camelCase / abbreviated source columns to snake_case names, then
# stamp each row with the widget-supplied data source and file date.
circuit_update_name_df = (
    circuit_select_df
    .withColumnRenamed('circuitId', 'circuit_id')
    .withColumnRenamed('circuitRef', 'circuit_ref')
    .withColumnRenamed('lat', 'latitude')
    .withColumnRenamed('lng', 'longitude')
    .withColumnRenamed('alt', 'altitude')
    # lit() turns the Python string into a constant column expression.
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
)

In [0]:
# Render the renamed and stamped DataFrame for inspection.
display(circuit_update_name_df)

### Add a timestamp to the DataFrame

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it is provided
# by ../includes/common_functions and appends an ingestion timestamp column
# (assumed from the name; verify against that notebook).
circuit_df = add_ingestion_date(circuit_update_name_df)

In [0]:
# Render the final DataFrame before it is written out.
display(circuit_df)

### Write data to the data lake as Parquet

In [0]:
# Persist the result as the Parquet-backed table f1_processed.circuits,
# replacing any existing contents of that table.
circuit_df.write.saveAsTable(
    'f1_processed.circuits',
    format='parquet',
    mode='overwrite',
)

In [0]:
# List the files behind the circuits output. The location is built from
# processed_folder_path, the same variable the next cell reads from, instead
# of a hard-coded mount path, so the check follows the configured location.
# NOTE(review): assumes processed_folder_path resolves to the folder that was
# previously hard-coded here (/mnt/formula1adls/processed) - confirm in
# ../includes/configuration.
display(dbutils.fs.ls(f'{processed_folder_path}/circuits'))

In [0]:
# Read the Parquet files under the processed circuits folder and render them,
# as a check on what was written. processed_folder_path is not defined in this
# notebook; presumably it comes from ../includes/configuration (verify).
display(spark.read.parquet(f'{processed_folder_path}/circuits'))

In [0]:
# End the notebook run and hand the string 'Success' back as its exit value.
dbutils.notebook.exit('Success')