# Incremental file ingestion from Azure Blob Storage to Microsoft Fabric Lakehouse using PySpark Notebooks

> <br> This project uses the database published by the Brazilian government called **Cadastro Geral de Empregados e Desempregados (CAGED)** (General Register of Employed and Unemployed).<br><br>
> The dataset consists of monthly .txt files that are made available for public download via FTP.  <br>  
> Download available at: <br> 
> <br> 

**Notebook**: SilverToGold  
**Description**: This PySpark notebook performs incremental ingestion of rows from the Silver layer of the Lakehouse into the Gold layer. It uses the modification timestamp recorded for each original file (`source_modified_at`) as the reference for deciding which rows are new or updated.

In [None]:
# Imports
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Make Spark SQL resolve table and column identifiers case-sensitively
spark.conf.set("spark.sql.caseSensitive", True)

## Parameters

In [None]:
# Lakehouse table names used by this notebook

# Source side (Silver layer)
silver_meta_table, silver_table = "CagedSilverMeta", "CagedSilver"

# Target side (Gold layer):
#   gold_meta_table - control table to track loads from Silver (Curated) to Gold
#   gold_table      - Gold layer of data (Aggregated)
gold_meta_table, gold_table = "CagedGoldMeta", "CagedGold"

## Prepare, create and load schemas and tables

In [None]:
# Schema of the Gold control table, given as (column name, Spark type, nullable).
# Each row records a file path, the source modification time of the loaded
# version, and when it was processed.
_gold_meta_column_specs = [
    ("file_path", StringType(), False),
    ("source_modified_at", TimestampType(), False),
    ("processed_at", TimestampType(), True),
]
gold_meta_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in _gold_meta_column_specs]
)

# Create the control table as an empty Delta table.
# Save mode "ignore" leaves an already existing table untouched.
_empty_gold_meta = spark.createDataFrame([], gold_meta_schema)
_gold_meta_writer = _empty_gold_meta.write.format("delta").mode("ignore")
_gold_meta_writer.saveAsTable(gold_meta_table)

# Gold table layout, given as (column name, Spark type, nullable).
# Column names are PascalCase.
# NOTE(review): TamEstabJan is declared as a string here, while the load cell
# casts `tamestabjan` to integer before writing - confirm the intended type.
_gold_column_specs = [
    ("CompetenciaMov", DateType(), True),
    ("Regiao", StringType(), True),
    ("Uf", StringType(), True),
    ("Municipio", StringType(), True),
    ("Secao", StringType(), True),
    ("Subclasse", StringType(), True),
    ("SaldoMovimentacao", IntegerType(), True),
    ("GrauInstrucao", IntegerType(), True),
    ("Idade", IntegerType(), True),
    ("RacaCor", StringType(), True),
    ("Sexo", StringType(), True),
    ("TamEstabJan", StringType(), True),
    # Control columns
    ("FilePath", StringType(), False),
    ("SourceModifiedAt", TimestampType(), False),
    ("IsActive", BooleanType(), False),
]
gold_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in _gold_column_specs]
)

# Create the gold table as an empty Delta table partitioned by CompetenciaMov.
# Save mode "ignore" leaves an already existing table untouched.
_empty_gold = spark.createDataFrame([], gold_schema)
_gold_writer = (
    _empty_gold.write
    .format("delta")
    .mode("ignore")
    .partitionBy("CompetenciaMov")
)
_gold_writer.saveAsTable(gold_table)

# Load the Silver data and the Gold control table
df_silver = spark.table(silver_table)
df_gold_meta = spark.table(gold_meta_table)

# Per file, keep only the most recent source modification time recorded in the
# Gold control table (i.e. the latest version already loaded into Gold)
_latest_loaded_mtime = F.max("source_modified_at").alias("last_processed_source_mtime")
df_gold_meta_latest = df_gold_meta.groupBy("file_path").agg(_latest_loaded_mtime)

## 

## Find candidates to load

In [None]:
# Identify candidate rows for ingestion: Silver rows whose file has no entry in
# the Gold control table yet (new), or whose source version is newer than the
# latest one recorded there (updated).
#
# The left join leaves last_processed_source_mtime NULL for files without a
# control entry, so the isNull() test alone covers the "new" case. No sentinel
# date is needed, and the comparison stays timestamp-to-timestamp.
_never_loaded = F.col("last_processed_source_mtime").isNull()
_newer_version = F.col("source_modified_at") > F.col("last_processed_source_mtime")

# Control columns are selected first, followed by every other Silver column
_control_columns = ["file_path", "source_modified_at", "is_active"]

df_gold_candidates = (
    df_silver
    .join(df_gold_meta_latest, df_silver.file_path == df_gold_meta_latest.file_path, "left")
    .filter(_never_loaded | _newer_version)
    .select(
        # Qualified through df_silver because file_path exists on both join sides
        df_silver.file_path,
        df_silver.source_modified_at,
        df_silver.is_active,
        *[F.col(name) for name in df_silver.columns if name not in _control_columns]
    )
)

# Count candidates (this action runs the join)
print(f"Rows to process from Silver to Gold: {df_gold_candidates.count()}")


## Process the load

In [None]:
# Process candidate rows: replace the Gold rows of every candidate file with
# the candidate version, then record the load in the Gold control table.

# head(1) fetches at most one row; it avoids the DataFrame-to-RDD conversion
# that .rdd.isEmpty() needs.
if not df_gold_candidates.head(1):
    print("Nothing to load. Skipping...")
else:
    # Transform the data: rename Silver columns to the PascalCase Gold names
    # and cast the numeric ones.
    # NOTE(review): tamestabjan is cast to integer here although the Gold
    # schema declares TamEstabJan as a string - confirm the intended type.
    df_transformed = df_gold_candidates.select(
        F.to_date(F.col("competênciamov"), "yyyyMM").alias("CompetenciaMov"),
        F.col("região").alias("Regiao"),
        F.col("uf").alias("Uf"),
        F.col("município").alias("Municipio"),
        F.col("seção").alias("Secao"),
        F.col("subclasse").alias("Subclasse"),
        F.col("saldomovimentação").cast("integer").alias("SaldoMovimentacao"),
        F.col("graudeinstrução").cast(IntegerType()).alias("GrauInstrucao"),
        F.col("idade").cast(IntegerType()).alias("Idade"),
        F.col("raçacor").alias("RacaCor"),
        F.col("sexo").alias("Sexo"),
        F.col("tamestabjan").cast("integer").alias("TamEstabJan"),
        F.col("file_path").alias("FilePath"),
        F.col("source_modified_at").alias("SourceModifiedAt"),
        F.col("is_active").alias("IsActive")
    )

    # Align column order and types with the existing gold table. The previous
    # MERGE ... INSERT * relied on an implicit cast for this; a DataFrame
    # append needs the types to match, so the cast is made explicit.
    gold_fields = spark.table(gold_table).schema.fields
    df_aligned = df_transformed.select(
        *[F.col(field.name).cast(field.dataType) for field in gold_fields]
    )
    df_aligned.createOrReplaceTempView("source_data")

    # Step 1 - delete the Gold rows of every file that is being (re)loaded.
    # (FilePath, CompetenciaMov) does not identify a single row, so a row-level
    # MERGE ... UPDATE on that key matches many source rows per target row,
    # which Delta rejects as ambiguous. The merge source here is therefore
    # reduced to ONE row per file, and matched target rows are deleted.
    # "<=" (rather than "<") also clears rows left by a run that wrote Gold but
    # failed before recording the control entry, so a re-run does not duplicate.
    spark.sql(f"""
    MERGE INTO {gold_table} AS target
    USING (
        SELECT FilePath, MAX(SourceModifiedAt) AS SourceModifiedAt
        FROM source_data
        GROUP BY FilePath
    ) AS source
    ON target.FilePath = source.FilePath
    WHEN MATCHED AND target.SourceModifiedAt <= source.SourceModifiedAt THEN
        DELETE
    """)

    # Step 2 - append the candidate rows. If this step fails, no control entry
    # is written below, so the same files remain candidates for the next run.
    df_aligned.write.format("delta").mode("append").saveAsTable(gold_table)

    # Optimize the gold_table to reduce small files
    spark.sql(f"OPTIMIZE {gold_table}")

    # Record metadata: one control row per distinct (file, source version)
    # loaded, all stamped with the same processing time for this run.
    processed_at = datetime.now()
    processed_files = df_gold_candidates.select("file_path", "source_modified_at").distinct().collect()
    df_processed = spark.createDataFrame(
        [
            {
                "file_path": row["file_path"],
                "source_modified_at": row["source_modified_at"],
                "processed_at": processed_at,
            }
            for row in processed_files
        ],
        schema=gold_meta_schema
    )
    df_processed.write.format("delta").mode("append").saveAsTable(gold_meta_table)

    print("Gold zone updated successfully!")