# Incremental file ingestion from Azure Blob Storage to a Microsoft Fabric Lakehouse using PySpark notebooks

> <br> This project uses the database provided by the government of Brazil called **Cadastro Geral de Empregados e Desempregados (CAGED)** (General Register of Employed and Unemployed).<br><br>
> The dataset consists of monthly .txt files and is made available via FTP for public download.  <br>  
> Download available at: <br> 
> <br> 



**Notebook**: Layout.ipynb  
**Description**: This PySpark notebook ingests the Layout files from Azure Blob Storage into the Landing layer of the Lakehouse.  

In [None]:
# Imports
import re

from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Make Spark SQL treat table and column names as case sensitive
spark.conf.set("spark.sql.caseSensitive", True)


## Parameters  

In [None]:
# Blob Storage: account and container that hold the source files
blob_storage_account_name = "pezzott"
blob_container_name       = "caged-layout"
# Root URI of the container as addressed through the WASBS driver
blob_root_wasbs           = f"wasbs://{blob_container_name}@{blob_storage_account_name}.blob.core.windows.net"

# Key Vault and secret name used to fetch the SAS token for the container
key_vault_name            = "pezzott"
key_vault_secret_name     = "blob-pezzott-caged-layout"

# Lakehouse targets: landing folder for the copied files, and the table that
# records which source files have already been copied.
# (Plain strings: there is nothing to interpolate, so no f-prefix.)
landing_root_path         = "Files/Landing/CAGED-Layout"
landing_meta_table        = "CagedLayoutMeta"


## Shared access signature (SAS) for the WASBS driver

In [None]:
# Fetch the container's SAS token from Azure Key Vault
key_vault_url = f"https://{key_vault_name}.vault.azure.net/"
sas_token = notebookutils.credentials.getSecret(key_vault_url, key_vault_secret_name)

# Hand the SAS token to the WASBS driver for this container/account pair
sas_conf_key = (
    f"fs.azure.sas.{blob_container_name}"
    f".{blob_storage_account_name}.blob.core.windows.net"
)
spark.conf.set(sas_conf_key, sas_token)

## List files from Blob

In [None]:
# List every file under the container root (recursively) and keep only the
# metadata needed for incremental tracking: path, modification time and size.
df_source_files = (
    spark.read.format("binaryFile")
         .option("recursiveFileLookup", "true")
         .load(blob_root_wasbs)
         .select(
             F.col("path").alias("source_path"),
             F.col("modificationTime").alias("source_modified_at"),
             # length is in bytes; 1048576 = 1024 * 1024, so this is the size in MB
             F.round(F.col("length") / 1048576, 2).alias("source_size_mb"),
         )
         # Sort on the projected column: the pre-alias name `path` is no longer
         # part of the output of the select above.
         .orderBy("source_path")
)

# Preview the first rows of the listing
display(df_source_files.limit(10))

## Build the target path, keeping the same folder tree found under the container

In [None]:
# Regex that matches the container root (with exactly one trailing slash) at the
# start of a path; re.escape keeps the URI's punctuation literal in the pattern.
source_prefix = blob_root_wasbs.rstrip("/") + "/"
remove_pattern = "^" + re.escape(source_prefix)

# Path of each file relative to the container root
relative_path = F.regexp_replace("source_path", remove_pattern, "")

# Target = landing root + the same relative tree the file has in the container.
# The relative path is only an intermediate, so it is not kept as a column.
df_source_files = df_source_files.select(
    "source_path",
    "source_modified_at",
    "source_size_mb",
    F.concat(F.lit(landing_root_path + "/"), relative_path).alias("target_path"),
)

display(df_source_files)

## Find the candidate files to copy

In [None]:
# Schema of the metadata table that records every file copied to Landing
landing_meta_schema = StructType([
    StructField("source_path",        StringType(),    False),
    StructField("source_modified_at", TimestampType(), False),
    StructField("source_size_mb",     FloatType(),     False),
    StructField("target_path",        StringType(),    True),
    StructField("copied_at",          TimestampType(), True),
])

# Create the metadata table on first run; mode("ignore") leaves an existing table untouched
(
    spark.createDataFrame([], landing_meta_schema)
    .write.format("delta")
    .mode("ignore")
    .saveAsTable(landing_meta_table)
)

# Load as a named table
df_landing_meta = spark.table(landing_meta_table)

# Latest source modification time already copied, per source_path
df_landing_meta_latest = df_landing_meta.groupBy("source_path").agg(
    F.max("source_modified_at").alias("last_copied_source_mtime")
)

# Fallback for files with no metadata row yet. Cast to timestamp so the
# coalesce and the comparison below stay timestamp-to-timestamp instead of
# relying on implicit string/timestamp coercion.
never_copied = F.lit("1970-01-01").cast("timestamp")

# Candidates: files never copied, or modified since the last recorded copy
df_candidates = (
    df_source_files
    .join(df_landing_meta_latest, on="source_path", how="left")
    .filter(
        F.col("source_modified_at") > F.coalesce(F.col("last_copied_source_mtime"), never_copied)
    )
    .select(
        "source_path",
        "source_modified_at",
        "source_size_mb",
        "target_path",
    )
)

print(f"Files to copy now (new or updated): {df_candidates.count()}")
display(df_candidates.orderBy("source_path"))

## Copy files from Blob to Landing Zone

In [None]:
# Materialize the candidates once: the same list drives both the emptiness
# check and the copy loop, so the listing/join is not evaluated twice.
candidates = df_candidates.select(
    "source_path",
    "target_path",
    "source_modified_at",
    "source_size_mb"
).collect()

if not candidates:
    print("Nothing to copy. Skipping...")
else:
    # Copy files and collect one metadata record per successful copy
    copied_files = []
    for row in candidates:
        source_path = row["source_path"]
        target_path = row["target_path"]

        try:
            # Only the copy itself is guarded
            notebookutils.fs.fastcp(source_path, target_path)
        except Exception as e:
            # Best effort: report and move on. The file gets no metadata row,
            # so the candidate query will select it again on a later run.
            print(f"Error copying {source_path} to {target_path}: {e}")
            continue

        copied_files.append({
            "source_path": source_path,
            "source_modified_at": row["source_modified_at"],
            "source_size_mb": row["source_size_mb"],
            "target_path": target_path,
            "copied_at": datetime.now()
        })

    # Build the DataFrame with the metadata table's own schema so the append matches it
    df_copied = spark.createDataFrame(
        copied_files,
        schema=spark.table(landing_meta_table).schema
    )

    # Append the new records to the metadata table
    df_copied.write.format("delta").mode("append").saveAsTable(landing_meta_table)

    # Show summary
    print(f"Copied files: {len(copied_files)}")
    display(df_copied.orderBy("source_path"))


## Load Layout Files to Delta table

In [None]:
# Ingest one Layout file from the Landing folder and load it into its table
def load_layout_table(
    table_name: str,
    codigo_is_integer: bool = True,
):
    """Load ``<landing_root_path>/<table_name>.txt`` into the Delta table ``table_name``.

    The file is read as a semicolon-separated CSV with a header row and the
    two columns ``Codigo`` and ``Descricao``. The table is created empty if
    it does not exist yet and is then overwritten with the file's contents.

    Args:
        table_name: Name of the target table; also the base name of the
            ``.txt`` file under ``landing_root_path``.
        codigo_is_integer: When True ``Codigo`` is read as an integer,
            otherwise as a string.
    """
    codigo_type = IntegerType() if codigo_is_integer else StringType()
    layout_schema = StructType([
        StructField("Codigo",    codigo_type,  True),
        StructField("Descricao", StringType(), True),
    ])

    # Create the table on first use; mode("ignore") leaves an existing table untouched
    empty_df = spark.createDataFrame([], layout_schema)
    empty_df.write.format("delta").mode("ignore").saveAsTable(table_name)

    # Read the layout file with the explicit schema (no inference)
    reader = (
        spark.read
        .format("csv")
        .schema(layout_schema)
        .option("header", "true")
        .option("sep", ";")
        .option("encoding", "UTF-8")
    )
    layout_df = reader.load(f"{landing_root_path}/{table_name}.txt")

    # Replace the table contents with what was just read
    layout_df.write.format("delta").mode("overwrite").saveAsTable(table_name)
    print(f"Table {table_name} loaded successfully.")



In [None]:
# One entry per Layout file, in load order: (table name, Codigo is an integer?)
layout_tables = [
    ("FaixaEtaria",   True),
    ("GrauInstrucao", True),
    ("RacaCor",       False),
    ("Regiao",        False),
    ("Secao",         False),
    ("Sexo",          False),
    ("Subclasse",     False),
    ("TamEstabJan",   True),
    ("Uf",            False),
]

for layout_table_name, layout_codigo_is_integer in layout_tables:
    load_layout_table(layout_table_name, layout_codigo_is_integer)


### 