# Incremental file ingestion from Azure Blob Storage to a Microsoft Fabric Lakehouse using PySpark notebooks

> <br> This project uses the database provided by the government of Brazil called **Cadastro Geral de Empregados e Desempregados (CAGED)** (General Register of Employed and Unemployed).<br><br>
> The dataset consists of monthly .txt files published on a public FTP server for download.  <br>  
> Download available at: <br> 
> <br> 



**Notebook**: BlobToLanding.ipynb  
**Description**: This PySpark notebook performs incremental ingestion of files from Azure Blob Storage to the Landing layer of the Lakehouse. It uses the modification date of each file in the Blob as a reference.

In [22]:
# Imports
import re

from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Turn on case-sensitive name resolution in Spark SQL for this session,
# so table and column names that differ only by letter case are treated as distinct.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object injected by the Fabric runtime - confirm.
spark.conf.set('spark.sql.caseSensitive', True)


StatementMeta(, 0bb8f224-3723-4e8b-a618-ad1d08b0de00, 24, Finished, Available, Finished)

## Parameters  

In [23]:
# --- Source: Azure Blob Storage container, addressed through the WASBS scheme ---
blob_storage_account_name = "pezzott"
blob_container_name = "caged"
blob_root_wasbs = (
    f"wasbs://{blob_container_name}@{blob_storage_account_name}.blob.core.windows.net"
)

# --- Azure Key Vault: vault name and the secret read to obtain the SAS token ---
key_vault_name = "pezzott"
key_vault_secret_name = "blob-pezzott-caged"

# --- Destination: Landing layer of the Lakehouse ---
landing_root_path = "Files/Landing/CAGED"
# Delta table that tracks what was copied from Blob to Landing
landing_meta_table = "CagedLandingMeta"


StatementMeta(, 0bb8f224-3723-4e8b-a618-ad1d08b0de00, 25, Finished, Available, Finished)

## Shared access signature (SAS) for the WASBS driver

In [24]:
# Read the container's SAS token from Azure Key Vault (it is never printed or stored in the notebook)
key_vault_url = f"https://{key_vault_name}.vault.azure.net/"
sas_token = notebookutils.credentials.getSecret(key_vault_url, key_vault_secret_name)

# Hand the SAS token to the WASBS driver under the config key scoped to this container and account
sas_conf_key = f'fs.azure.sas.{blob_container_name}.{blob_storage_account_name}.blob.core.windows.net'
spark.conf.set(sas_conf_key, sas_token)

StatementMeta(, 0bb8f224-3723-4e8b-a618-ad1d08b0de00, 26, Finished, Available, Finished)

## List files from Blob

In [25]:
# Enumerate every file under the container root (sub-folders included) with the
# binaryFile reader, keeping only file metadata: path, modification time and size.
blob_listing = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(f"{blob_root_wasbs}")
)

# File length is in bytes; 1048576 bytes = 1 MiB, rounded to two decimals.
size_in_mb = F.round(F.col("length") / 1048576, 2)

df_source_files = blob_listing.select(
    F.col("path").alias("source_path"),
    F.col("modificationTime").alias("source_modified_at"),
    size_in_mb.alias("source_size_mb"),
).orderBy('path')

# Preview only the first rows
display(df_source_files.limit(10))

StatementMeta(, 0bb8f224-3723-4e8b-a618-ad1d08b0de00, 27, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, ce8d95fa-da14-42cf-b598-dd491dbbc649)

## Build the target path, keeping the same folder tree as the container

In [26]:
# Derive each file's Landing destination by re-rooting its Blob path:
# strip the container root prefix and prepend the Landing root, so the
# folder tree below the container is preserved.
source_prefix = blob_root_wasbs.rstrip("/") + "/"

# Anchor at the start of the string and escape the prefix so that its dots
# are matched literally rather than as regex wildcards.
remove_pattern = "^" + re.escape(source_prefix)

df_source_files = (
    df_source_files
    .withColumn("relative_path", F.regexp_replace("source_path", remove_pattern, ""))
    .withColumn("target_path", F.concat(F.lit(landing_root_path + "/"), F.col("relative_path")))
    # relative_path is only a stepping stone; keep the columns the next steps use
    .select(
        "source_path",
        "source_modified_at",
        "source_size_mb",
        "target_path",
    )
)

display(df_source_files)

StatementMeta(, 0bb8f224-3723-4e8b-a618-ad1d08b0de00, 28, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 175498eb-e355-44f3-850d-11243149ac22)

## Find the candidate files to copy

In [27]:
# Schema of the tracking table: one row per file copied from Blob to Landing
landing_meta_schema = StructType([
  StructField("source_path",        StringType(),    False),
  StructField("source_modified_at", TimestampType(), False),
  StructField("source_size_mb",     FloatType(),   False),
  StructField("target_path",        StringType(),    True ),
  StructField("copied_at",          TimestampType(), True ),
])

# Create the tracking table on the first run only:
# mode("ignore") leaves an already existing table untouched.
spark.createDataFrame([], landing_meta_schema) \
    .write.format("delta") \
    .mode("ignore") \
    .saveAsTable(landing_meta_table)

# Load as a named table
df_landing_meta = spark.table(landing_meta_table)

# Latest source modification time already copied, per source_path
df_landing_meta_latest = df_landing_meta.groupBy("source_path").agg(
    F.max("source_modified_at").alias("last_copied_source_mtime")
)

# A file is a candidate when it was never copied (no match in the left join,
# so last_copied_source_mtime is NULL) or when it changed in the Blob after
# the last recorded copy. Comparing timestamp to timestamp avoids relying on
# implicit string/timestamp coercion and on a magic "1970-01-01" sentinel.
never_copied = F.col("last_copied_source_mtime").isNull()
modified_since_copy = F.col("source_modified_at") > F.col("last_copied_source_mtime")

df_candidates = (
    df_source_files
    .join(df_landing_meta_latest, on="source_path", how="left")
    .filter(never_copied | modified_since_copy)
    .select(
        "source_path",
        "source_modified_at",
        "source_size_mb",
        "target_path"
    )
)

print(f"Files to copy now (new or updated): {df_candidates.count()}")
display(df_candidates.orderBy("source_path"))

StatementMeta(, 0bb8f224-3723-4e8b-a618-ad1d08b0de00, 29, Finished, Available, Finished)

Files to copy now (new or updated): 1


SynapseWidget(Synapse.DataFrame, 3990d375-a403-4c37-a975-f7ebc864aca6)

## Copy files from Blob to Landing Zone

In [28]:
# Copy every candidate file from Blob to Landing and record each successful
# copy in the tracking table. Copying is best-effort: a failure on one file is
# reported and the loop moves on. Failed files are not recorded, so they remain
# candidates on the next run.
if df_candidates.isEmpty():
    print("Nothing to copy. Skipping...")
else:
    # Candidates from DataFrame to List
    candidates = df_candidates.select(
        "source_path",
        "target_path",
        "source_modified_at",
        "source_size_mb"
    ).collect()

    copied_files = []   # metadata rows for the files that were copied
    failed_files = []   # source paths whose copy failed
    for row in candidates:
        source_path = row["source_path"]
        target_path = row["target_path"]
        source_modified_at = row["source_modified_at"]
        source_size = row["source_size_mb"]

        try:
            # Copy file
            copy_ok = notebookutils.fs.fastcp(source_path, target_path)

            # NOTE(review): fastcp is documented as returning a Boolean - confirm.
            # Only an explicit False is treated as failure, so a None return
            # still counts as success, as it did before.
            if copy_ok is False:
                raise RuntimeError("fastcp returned False")

            # Record metadata
            copied_files.append({
                "source_path": source_path,
                "source_modified_at": source_modified_at,
                "source_size_mb": source_size,
                "target_path": target_path,
                "copied_at": datetime.now()
            })
        except Exception as e:
            failed_files.append(source_path)
            print(f"Error copying {source_path} to {target_path}: {str(e)}")

    # Create DataFrame with copied files (may be empty if every copy failed)
    df_copied = spark.createDataFrame(
        copied_files,
        schema=spark.table(landing_meta_table).schema
    )

    # Write the metadata; skip the append when there is nothing to record
    if copied_files:
        df_copied.write.format("delta").mode("append").saveAsTable(landing_meta_table)

    # Show summary
    print(f"Copied files: {len(copied_files)}")
    if failed_files:
        print(f"Failed files: {len(failed_files)} (not recorded; they will be retried on the next run)")
    display(df_copied.orderBy("source_path"))


StatementMeta(, 0bb8f224-3723-4e8b-a618-ad1d08b0de00, 30, Finished, Available, Finished)

Copied files: 1


SynapseWidget(Synapse.DataFrame, e1bac2d1-983e-44ba-8303-6603227082b5)