# Incremental file ingestion from Azure Blob Storage to Microsoft Fabric Lakehouse using PySpark Notebooks

> <br> This project uses the database provided by the government of Brazil called **Cadastro Geral de Empregados e Desempregados (CAGED)** (General Register of Employed and Unemployed).<br><br>
> The dataset consists of monthly .txt files, published on a public FTP server for download.  <br>  
> Download available at: <br> 
> <br> 

**Notebook**: BronzeToSilver  
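**Description**: This PySpark notebook incrementally loads rows from the Bronze layer of the Lakehouse into the Silver layer. It uses each source file's modification timestamp (`source_modified_at`) as the watermark: only rows from files that are new, or whose timestamp is later than the one last recorded for Silver, are processed.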

In [1]:
# Imports
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Turn on case-sensitive identifier resolution in Spark SQL for this session,
# so that names differing only by letter case are treated as distinct.
spark.conf.set('spark.sql.caseSensitive', True)

StatementMeta(, 9d49ac33-b5a3-4dc5-a5d4-739ddf5dbc0d, 3, Finished, Available, Finished)

## Parameters

In [2]:
# Lakehouse table names used by this notebook, grouped by layer.

# Bronze layer: data stored as-is, plus the control table that tracks
# Landing (Files) -> Bronze (Delta) loads.
bronze_table = "CagedBronze"
bronze_meta_table = "CagedBronzeMeta"

# Silver layer: typed target table, plus its control table.
silver_table = "CagedSilver"
silver_meta_table = "CagedSilverMeta"

StatementMeta(, 9d49ac33-b5a3-4dc5-a5d4-739ddf5dbc0d, 4, Finished, Available, Finished)

## Prepare, create and load schemas and tables

In [3]:
# Layout of the Silver control table: which file (and which source
# modification time of it) was processed, and when.
silver_meta_schema = (
    StructType()
    .add("file_path", StringType(), False)
    .add("source_modified_at", TimestampType(), False)
    .add("processed_at", TimestampType(), True)
)

# Bootstrap the control table from an empty DataFrame.
# mode("ignore") turns the write into a no-op when the table already exists.
(
    spark.createDataFrame([], silver_meta_schema)
    .write.format("delta")
    .mode("ignore")
    .saveAsTable(silver_meta_table)
)

# Layout of the Silver table: typed columns with PascalCase names, followed by
# the control columns. Each entry is (column name, Spark type, nullable).
_silver_column_specs = [
    ("CompetenciaMov", DateType(), True),
    ("Regiao", StringType(), True),
    ("Uf", StringType(), True),
    ("Municipio", StringType(), True),
    ("Secao", StringType(), True),
    ("Subclasse", StringType(), True),
    ("SaldoMovimentacao", IntegerType(), True),
    ("Categoria", StringType(), True),
    ("Cbo2002Ocupacao", StringType(), True),
    ("GrauInstrucao", IntegerType(), True),
    ("Idade", IntegerType(), True),
    ("HorasContratuais", StringType(), True),
    ("RacaCor", StringType(), True),
    ("Sexo", StringType(), True),
    ("TipoEmpregador", StringType(), True),
    ("TipoEstabelecimento", StringType(), True),
    ("TipoMovimentacao", StringType(), True),
    ("TipoDeficiencia", StringType(), True),
    ("IndTrabIntermitente", StringType(), True),
    ("Salario", DecimalType(15, 2), True),
    # NOTE(review): declared as string here, but the load cell casts the
    # source column to integer before writing - confirm the intended type.
    ("TamEstabJan", StringType(), True),
    ("IndicadorAprendiz", StringType(), True),
    ("OrigemInformacao", StringType(), True),
    ("CompetenciaDec", DateType(), True),
    ("IndicadorForaPrazo", StringType(), True),
    ("UnidadeSalarioCodigo", StringType(), True),
    ("ValorSalarioFixo", DecimalType(15, 2), True),
    # Control columns
    ("FilePath", StringType(), False),
    ("SourceModifiedAt", TimestampType(), False),
    ("IsActive", BooleanType(), False),
]

silver_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in _silver_column_specs]
)

# Bootstrap the Silver table, partitioned by CompetenciaMov.
# mode("ignore") leaves an already existing table untouched.
(
    spark.createDataFrame([], silver_schema)
    .write.format("delta")
    .mode("ignore")
    .partitionBy("CompetenciaMov")
    .saveAsTable(silver_table)
)

# Source rows (Bronze) and the record of what Silver has already consumed
df_bronze = spark.table(bronze_table)
df_silver_meta = spark.table(silver_meta_table)

# Collapse the control table to one row per file, keeping the most recent
# source modification time that was already processed into Silver
_latest_processed_mtime = F.max("source_modified_at").alias("last_processed_source_mtime")
df_silver_meta_latest = df_silver_meta.groupBy("file_path").agg(_latest_processed_mtime)

StatementMeta(, 9d49ac33-b5a3-4dc5-a5d4-739ddf5dbc0d, 5, Finished, Available, Finished)

## 

## Find candidates to load

In [4]:
# Identify candidate rows for ingestion: rows of files never processed into
# Silver, or of files whose source changed after the last processed version.

# Control columns are selected first; every remaining Bronze column follows.
_control_columns = ["file_path", "source_modified_at", "is_active"]
_payload_columns = [name for name in df_bronze.columns if name not in _control_columns]

# Only active Bronze records take part in the load
_bronze_active = df_bronze.filter(F.col("is_active") == True)

# The left join keeps Bronze rows whose file has no Silver control entry yet
_bronze_with_watermark = _bronze_active.join(
    df_silver_meta_latest,
    df_bronze.file_path == df_silver_meta_latest.file_path,
    "left",
)

_never_processed = F.col("last_processed_source_mtime").isNull()
# The coalesce default only applies when the watermark is NULL, a case that
# _never_processed already accepts.
_modified_since_last_load = F.col("source_modified_at") > F.coalesce(
    F.col("last_processed_source_mtime"), F.lit("1970-01-01")
)

df_silver_candidates = _bronze_with_watermark.filter(
    _never_processed | _modified_since_last_load
).select(
    df_bronze.file_path,
    df_bronze.source_modified_at,
    df_bronze.is_active,
    *[F.col(name) for name in _payload_columns],
)

# Count candidates (this triggers a Spark job over the candidate set)
_candidate_count = df_silver_candidates.count()
print(f"Rows to process from Bronze to Silver: {_candidate_count}")


StatementMeta(, 9d49ac33-b5a3-4dc5-a5d4-739ddf5dbc0d, 6, Finished, Available, Finished)

Rows to process from Bronze to Silver: 18543723


## Process the load

In [5]:
# Process candidate rows: replace, file by file, the Silver rows of every file
# present in the candidate set, then record those files in the control table.

# Distinct (file, source modification time) pairs of this batch. The list is
# small (one entry per file version), drives the control-table write below and
# doubles as the emptiness check, avoiding a separate rdd.isEmpty() pass.
processed_files = (
    df_silver_candidates
    .select("file_path", "source_modified_at")
    .distinct()
    .collect()
)

if not processed_files:
    print("Nothing to load. Skipping...")
else:
    # Transform the data: rename to PascalCase, parse yyyyMM periods as dates,
    # and convert decimal-comma amounts to decimal(15,2).
    df_transformed = df_silver_candidates.select(
        F.to_date(F.col("competênciamov"), "yyyyMM").alias("CompetenciaMov"),
        F.col("região").alias("Regiao"),
        F.col("uf").alias("Uf"),
        F.col("município").alias("Municipio"),
        F.col("seção").alias("Secao"),
        F.col("subclasse").alias("Subclasse"),
        F.col("saldomovimentação").cast("integer").alias("SaldoMovimentacao"),
        F.col("categoria").alias("Categoria"),
        F.col("cbo2002ocupação").alias("Cbo2002Ocupacao"),
        F.col("graudeinstrução").cast(IntegerType()).alias("GrauInstrucao"),
        F.col("idade").cast(IntegerType()).alias("Idade"),
        F.col("horascontratuais").alias("HorasContratuais"),
        F.col("raçacor").alias("RacaCor"),
        F.col("sexo").alias("Sexo"),
        F.col("tipoempregador").alias("TipoEmpregador"),
        F.col("tipoestabelecimento").alias("TipoEstabelecimento"),
        F.col("tipomovimentação").alias("TipoMovimentacao"),
        F.col("tipodedeficiência").alias("TipoDeficiencia"),
        F.col("indtrabintermitente").alias("IndTrabIntermitente"),
        F.regexp_replace(F.col("salário"), ",", ".").cast("decimal(15,2)").alias("Salario"),
        # NOTE(review): cast to integer here, while the Silver schema declares
        # TamEstabJan as a string - confirm which type is intended.
        F.col("tamestabjan").cast("integer").alias("TamEstabJan"),
        F.col("indicadoraprendiz").alias("IndicadorAprendiz"),
        F.col("origemdainformação").alias("OrigemInformacao"),
        F.to_date(F.col("competênciadec"), "yyyyMM").alias("CompetenciaDec"),
        F.col("indicadordeforadoprazo").alias("IndicadorForaPrazo"),
        F.col("unidadesaláriocódigo").alias("UnidadeSalarioCodigo"),
        F.regexp_replace(F.col("valorsaláriofixo"), ",", ".").cast("decimal(15,2)").alias("ValorSalarioFixo"),
        F.col("file_path").alias("FilePath"),
        F.col("source_modified_at").alias("SourceModifiedAt"),
        F.col("is_active").alias("IsActive")
    )

    # Step 1 - drop the Silver rows of every file in this batch.
    # (FilePath, CompetenciaMov) is shared by all rows of a file, so it cannot
    # serve as a row-level MERGE key: on a re-ingested file each target row
    # would match many source rows, which Delta rejects as an ambiguous update.
    # Replacing whole files avoids that. The source has one row per file and the
    # MERGE has a single unconditional DELETE clause.
    file_paths = sorted({row["file_path"] for row in processed_files})
    spark.createDataFrame(
        [(path,) for path in file_paths], "FilePath string"
    ).createOrReplaceTempView("source_files")
    spark.sql(f"""
    MERGE INTO {silver_table} AS target
    USING source_files AS source
    ON target.FilePath = source.FilePath
    WHEN MATCHED THEN
        DELETE
    """)

    # Step 2 - append the fresh rows. A DataFrame append does not apply the
    # implicit casts a MERGE would, so every column is cast to the type declared
    # by the target table first.
    # The delete and the append are separate commits; if the append fails, the
    # control table is not updated and the next run reloads the same files.
    silver_fields = spark.table(silver_table).schema.fields
    df_aligned = df_transformed.select(
        *[F.col(field.name).cast(field.dataType).alias(field.name) for field in silver_fields]
    )
    df_aligned.write.format("delta").mode("append").saveAsTable(silver_table)

    # Record metadata: one control row per processed file version, all stamped
    # with the same batch timestamp. Written before OPTIMIZE so a maintenance
    # failure does not cause the batch to be reprocessed.
    processed_at = datetime.now()
    df_processed = spark.createDataFrame(
        [
            {
                "file_path": row["file_path"],
                "source_modified_at": row["source_modified_at"],
                "processed_at": processed_at,
            }
            for row in processed_files
        ],
        schema=silver_meta_schema
    )
    df_processed.write.format("delta").mode("append").saveAsTable(silver_meta_table)

    # Optimize the silver_table to reduce small files
    spark.sql(f"OPTIMIZE {silver_table}")

    print("Silver zone updated successfully!")

StatementMeta(, 9d49ac33-b5a3-4dc5-a5d4-739ddf5dbc0d, 7, Finished, Available, Finished)

Silver zone updated successfully!
