# Auto Loader

##### Setup

In [0]:
# Root folder of the Auto Loader demo in the ADLS Gen2 "landingzone" container.
# Adjacent literals are joined by Python at compile time into one path string.
base_dir = (
    "abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net"
    "/Autoloader/PR_Demo"
)

##### 1. Verify you can access the invoices directory

In [0]:
%fs ls abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/Invoices/

path,name,size,modificationTime
abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/Invoices/invoices_01-06-2022.csv,invoices_01-06-2022.csv,43357,1726743279000
abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/Invoices/invoices_02-06-2022.csv,invoices_02-06-2022.csv,8644,1726743279000
abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/Invoices/invoices_03-06-2022.csv,invoices_03-06-2022.csv,735,1726743279000


Let's have a look at Auto Loader in action.<br>
Do not leave the cell below running; terminate the query as soon as you no longer need it.

In [0]:
# Preview stream: read the invoice CSV files with Auto Loader ("cloudFiles").
# All reader settings are passed as one mapping instead of chained .option() calls.
source_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        "header": "true",
        # Pattern used to parse timestamp values in the CSV files.
        "timestampFormat": "d-M-y H.m",
        # Location where Auto Loader keeps the inferred schema.
        "cloudFiles.schemaLocation": f"{base_dir}/chekpoint/invoices_schema",
        "cloudFiles.inferColumnTypes": "true",
        # Force these two columns to string instead of the inferred type.
        "cloudFiles.schemaHints": "InvoiceNo string, CustomerID string",
    })
    .load(f"{base_dir}/Invoices")
)

# Rendering a streaming DataFrame starts a streaming query; stop it when done.
display(source_df)

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,_rescued_data
565231,47504K,ENGLISH ROSE GARDEN SECATEURS,1,2022-06-01T09:26:00Z,3.29,,
565231,47566,PARTY BUNTING,2,2022-06-01T09:26:00Z,10.79,,
565231,51014A,"FEATHER PEN,HOT PINK",2,2022-06-01T09:26:00Z,0.83,,
565231,72760B,VINTAGE CREAM 3 BASKET CAKE STAND,1,2022-06-01T09:26:00Z,20.79,,
565231,72807A,SET/3 ROSE CANDLE IN JEWELLED BOX,1,2022-06-01T09:26:00Z,8.29,,
565231,72807C,SET/3 VANILLA SCENTED CANDLE IN BOX,1,2022-06-01T09:26:00Z,8.29,,
565231,82551,LAUNDRY 15C METAL SIGN,1,2022-06-01T09:26:00Z,2.46,,
565231,82567,"AIRLINE LOUNGE,METAL SIGN",1,2022-06-01T09:26:00Z,1.63,,
565231,82578,KITCHEN METAL SIGN,1,2022-06-01T09:26:00Z,1.25,,
565231,82580,BATHROOM METAL SIGN,1,2022-06-01T09:26:00Z,1.25,,


##### 2. Ingest data into the invoices_raw table using the Spark Structured Streaming API

In [0]:
# Reset the demo target so this notebook can be re-run from the top.
# Dropping the table alone is not enough: the streaming checkpoint used by the
# ingestion (f"{base_dir}/chekpoint/Invoices") records which files were already
# loaded. If it survives the drop, Auto Loader skips every previously seen file
# and the recreated table stays empty.
spark.sql("drop table if exists  psl_salesdev.bronze.invoices_raw")

# Remove only the write checkpoint (recursively). The schema location under
# chekpoint/invoices_schema is left in place.
dbutils.fs.rm(f"{base_dir}/chekpoint/Invoices", True)

# Create a managed table in the bronze schema for Auto Loader ingestion.
# Kept as the last statement so the cell output is unchanged.
spark.sql("""
          CREATE TABLE IF NOT EXISTS psl_salesdev.bronze.invoices_raw(
        InvoiceNo string COMMENT 'Invoice number',
        StockCode string,
        Description string,
        Quantity int,
        InvoiceDate timestamp,
        UnitPrice double,
        CustomerID string)""")

DataFrame[]

In [0]:
def ingest(max_attempts=2):
  """Load the invoice CSV files that have not been ingested yet into the bronze table.

  Reads ``{base_dir}/Invoices`` with Auto Loader (``cloudFiles``) and appends
  the rows to ``psl_salesdev.bronze.invoices_raw``. The query runs with the
  ``availableNow`` trigger and this function waits for it to finish, so the
  table is up to date when the call returns.

  Args:
    max_attempts: Total number of times the stream may be started. Auto Loader
      stops the stream with an ``UnknownFieldException`` when it meets a new
      column, after recording the evolved schema in the schema location;
      starting the stream again then succeeds. The default of 2 allows one
      such restart.

  Raises:
    ValueError: If ``max_attempts`` is less than 1.
    Exception: Whatever the streaming query raised, when the failure is not a
      schema change or the attempts are used up.
  """
  if max_attempts < 1:
    raise ValueError("max_attempts must be at least 1")

  for attempt in range(1, max_attempts + 1):
    # Rebuild the reader on every attempt so it uses the schema currently
    # stored in the schema location.
    # NOTE: the "chekpoint" spelling is kept on purpose; the other cells use
    # the same directory and renaming it would orphan existing state.
    source_df = (spark.readStream
                        .format("cloudFiles")
                        .option("cloudFiles.format", "csv")
                        .option("header", "true")
                        .option("timestampFormat","d-M-y H.m")
                        .option("cloudFiles.schemaLocation", f"{base_dir}/chekpoint/invoices_schema")
                        .option("cloudFiles.inferColumnTypes", "true")
                        .option("cloudFiles.schemaHints", "InvoiceNo string, CustomerID string")
                        .load(f"{base_dir}/Invoices")
    )

    # toTable() starts the query in the background and returns immediately.
    write_query = (source_df.writeStream
                            .format("delta")
                            .option("checkpointLocation", f"{base_dir}/chekpoint/Invoices")
                            .option("mergeSchema", "true")
                            .outputMode("append")
                            .trigger(availableNow = True)
                            .toTable("psl_salesdev.bronze.invoices_raw")
    )

    try:
      # Block until all currently available files are processed; this also
      # surfaces any failure of the background query to the caller.
      write_query.awaitTermination()
      return
    except Exception as exc:
      message = str(exc)
      schema_changed = ("UnknownFieldException" in message
                        or "UNKNOWN_FIELD_EXCEPTION" in message)
      # Only the schema-evolution stop is worth restarting for; anything else
      # (or running out of attempts) is re-raised unchanged.
      if not schema_changed or attempt == max_attempts:
        raise

# First run: load the files currently in the Invoices directory into invoices_raw.
ingest() 

##### 3. Check the records after ingestion

In [0]:
%sql
-- Inspect the rows written to the bronze table by the first ingestion run.
SELECT
  *
FROM
  psl_salesdev.bronze.invoices_raw

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,_rescued_data
565231,47504K,ENGLISH ROSE GARDEN SECATEURS,1,2022-06-01T09:26:00Z,3.29,,
565231,47566,PARTY BUNTING,2,2022-06-01T09:26:00Z,10.79,,
565231,51014A,"FEATHER PEN,HOT PINK",2,2022-06-01T09:26:00Z,0.83,,
565231,72760B,VINTAGE CREAM 3 BASKET CAKE STAND,1,2022-06-01T09:26:00Z,20.79,,
565231,72807A,SET/3 ROSE CANDLE IN JEWELLED BOX,1,2022-06-01T09:26:00Z,8.29,,
565231,72807C,SET/3 VANILLA SCENTED CANDLE IN BOX,1,2022-06-01T09:26:00Z,8.29,,
565231,82551,LAUNDRY 15C METAL SIGN,1,2022-06-01T09:26:00Z,2.46,,
565231,82567,"AIRLINE LOUNGE,METAL SIGN",1,2022-06-01T09:26:00Z,1.63,,
565231,82578,KITCHEN METAL SIGN,1,2022-06-01T09:26:00Z,1.25,,
565231,82580,BATHROOM METAL SIGN,1,2022-06-01T09:26:00Z,1.25,,


In [0]:
%sql
-- List the table's columns and data types after the first load.
DESCRIBE TABLE psl_salesdev.bronze.invoices_raw

col_name,data_type,comment
InvoiceNo,string,Invoice number
StockCode,string,
Description,string,
Quantity,int,
InvoiceDate,timestamp,
UnitPrice,double,
CustomerID,string,
_rescued_data,string,


##### 4. Add more data to the invoices directory; the new file has an additional column

In [0]:
%fs cp abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/invoices_2021.csv abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/Invoices/

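##### 5. Ingest with a retry

Auto Loader stops the stream the first time it encounters a new column and records the evolved schema, so the ingestion may need to be run again before the new file is loaded.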

In [0]:
# Run the ingestion again so the newly copied invoices_2021.csv is picked up.
ingest()


##### 6. Check the data

In [0]:
%sql
-- Re-check the table contents after ingesting the additional file.
SELECT
  *
FROM
  psl_salesdev.bronze.invoices_raw

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,_rescued_data,Country
565231,47504K,ENGLISH ROSE GARDEN SECATEURS,1,2022-06-01T09:26:00Z,3.29,,,
565231,47566,PARTY BUNTING,2,2022-06-01T09:26:00Z,10.79,,,
565231,51014A,"FEATHER PEN,HOT PINK",2,2022-06-01T09:26:00Z,0.83,,,
565231,72760B,VINTAGE CREAM 3 BASKET CAKE STAND,1,2022-06-01T09:26:00Z,20.79,,,
565231,72807A,SET/3 ROSE CANDLE IN JEWELLED BOX,1,2022-06-01T09:26:00Z,8.29,,,
565231,72807C,SET/3 VANILLA SCENTED CANDLE IN BOX,1,2022-06-01T09:26:00Z,8.29,,,
565231,82551,LAUNDRY 15C METAL SIGN,1,2022-06-01T09:26:00Z,2.46,,,
565231,82567,"AIRLINE LOUNGE,METAL SIGN",1,2022-06-01T09:26:00Z,1.63,,,
565231,82578,KITCHEN METAL SIGN,1,2022-06-01T09:26:00Z,1.25,,,
565231,82580,BATHROOM METAL SIGN,1,2022-06-01T09:26:00Z,1.25,,,


In [0]:
%sql
-- List the columns again to see whether the table schema has changed.
DESCRIBE TABLE psl_salesdev.bronze.invoices_raw

col_name,data_type,comment
InvoiceNo,string,Invoice number
StockCode,string,
Description,string,
Quantity,int,
InvoiceDate,timestamp,
UnitPrice,double,
CustomerID,string,
_rescued_data,string,
Country,string,


##### 7. Ingest more data that may contain bad records

In [0]:
%fs cp abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/invoices_2022.csv abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/Invoices/

In [0]:
# Run the ingestion again to load the newly copied invoices_2022.csv.
ingest()

##### 8. Check the rescued data

In [0]:
%sql
-- Show only the rows that have something recorded in the _rescued_data column.
SELECT
  *
FROM
  psl_salesdev.bronze.invoices_raw
WHERE
  _rescued_data IS NOT NULL

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,_rescued_data,Country
536639,22822,CREAM WALL PLANTER HEART SHAPED,2,,5.95,15111,"{""InvoiceDate"":""30-30-2022 11.45"",""_file_path"":""abfss://landingzone@stavikaslakefreetrail.dfs.core.windows.net/Autoloader/PR_Demo/Invoices/invoices_2022.csv""}",United Kingdom


&copy; Polestar Solutions and Services India Pvt. Ltd. All rights reserved.<br/>