GOLD LAYER – ANALYTICS TABLES

In [0]:
%sql
USE CATALOG main;
USE SCHEMA agriculture_data;

Crop Production Trend (with Weather Impact)

In [0]:
# Use agriculture database
spark.sql("USE agriculture_data")

from pyspark.sql.functions import sum, avg, col

# Load silver tables
crop_df = spark.table("silver_crop_production")
weather_df = spark.table("silver_weather")

# Join crop data with weather data
gold_crop_weather_trend = (
    crop_df
    .join(
        weather_df,
        on=["state", "district", "year"],
        how="left"
    )
    .groupBy("crop", "year")
    .agg(
        sum("production_tonnes").alias("total_production"),
        sum("area_hectares").alias("total_area"),
        avg("avg_rainfall_mm").alias("avg_rainfall")
    )
)



Gold – Logging – crop_weather_trend

In [0]:
from pyspark.sql.types import (
    StructType, StructField,
    StringType, LongType, TimestampType
)
from pyspark.sql.functions import current_timestamp


In [0]:
log_schema = StructType([
    StructField("pipeline_name", StringType(), True),
    StructField("layer", StringType(), True),
    StructField("table_name", StringType(), True),
    StructField("status", StringType(), True),
    StructField("record_count", LongType(), True),
    StructField("start_time", TimestampType(), True),
    StructField("end_time", TimestampType(), True),
    StructField("error_message", StringType(), True)
])


In [0]:
# Logging metadata
pipeline_name = "agriculture_pipeline"
layer = "GOLD"
table_name = "gold_crop_weather_trend"

try:
    # Record count after aggregation
    record_count = gold_crop_weather_trend.count()

    # Write gold table
    gold_crop_weather_trend.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Success log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count, None, None, None)],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Failure log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None, None, None, str(e))],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    raise


In [0]:
# Production Validation

# Validate aggregated production values in gold table
invalid_production_records = gold_crop_weather_trend.filter(
    col("total_production") <= 0
).count()

print(f"Invalid production records: {invalid_production_records}")


Invalid production records: 0


Regional Yield with Soil Health

In [0]:
# Use agriculture database
spark.sql("USE agriculture_data")

# Load silver tables
crop_df = spark.table("silver_crop_production")
soil_df = spark.table("silver_soil_health")

# Join crop production with soil health
gold_region_soil_yield = (
    crop_df
    .join(
        soil_df,
        on=["state", "district"],
        how="left"
    )
    .groupBy("state", "district")
    .agg(
        avg("yield_tonnes_per_hectare").alias("avg_yield"),
        avg("ph_level").alias("avg_soil_ph"),
        avg("nitrogen").alias("avg_nitrogen")
    )
)



Gold – Logging – region_soil_yield

In [0]:
# Logging metadata
pipeline_name = "agriculture_pipeline"
layer = "GOLD"
table_name = "gold_region_soil_yield"

try:
    # Record count after aggregation
    record_count = gold_region_soil_yield.count()

    # Write gold table
    gold_region_soil_yield.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Success log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count, None, None, None)],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Failure log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None, None, None, str(e))],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    raise


Seasonal Yield with Market Prices

In [0]:
# Use agriculture database
spark.sql("USE agriculture_data")

# Load silver tables
crop_df = spark.table("silver_crop_production")
market_df = spark.table("silver_market_prices")

# Join crop production with market prices
gold_season_market_analysis = (
    crop_df
    .join(
        market_df,
        on=["crop", "year"],
        how="left"
    )
    .groupBy("season", "year")
    .agg(
        avg("yield_tonnes_per_hectare").alias("avg_yield"),
        sum("production_tonnes").alias("total_production"),
        avg("market_price_per_quintal").alias("avg_market_price")
    )
)


Gold – Logging – season_market_analysis

In [0]:
# Logging metadata
pipeline_name = "agriculture_pipeline"
layer = "GOLD"
table_name = "gold_season_market_analysis"

try:
    # Record count after aggregation
    record_count = gold_season_market_analysis.count()

    # Write gold table
    gold_season_market_analysis.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Success log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count, None, None, None)],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Failure log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None, None, None, str(e))],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    raise


Gold Agriculture Summary

In [0]:
# Use agriculture database
spark.sql("USE agriculture_data")

# Gold table 1: Crop + Weather
crop_weather_df = (
    spark.table("gold_crop_weather_trend")
    .withColumnRenamed("total_production", "crop_total_production")
    .withColumnRenamed("total_area", "crop_total_area")
)

# Gold table 2: Region + Soil
region_soil_df = (
    spark.table("gold_region_soil_yield")
    .withColumnRenamed("avg_yield", "region_avg_yield")
)

# Gold table 3: Season + Market
season_market_df = (
    spark.table("gold_season_market_analysis")
    .withColumnRenamed("avg_yield", "season_avg_yield")
    .withColumnRenamed("total_production", "season_total_production")
)

# Create final summary table (lineage anchor)
final_gold_summary = (
    crop_weather_df
    .join(season_market_df, on="year", how="left")
    .join(region_soil_df, how="cross")
)


Gold – Logging – agriculture_summary

In [0]:
# Logging metadata
pipeline_name = "agriculture_pipeline"
layer = "GOLD"
table_name = "gold_agriculture_summary"

try:
    # Record count after final aggregation
    record_count = final_gold_summary.count()

    # Write final gold summary table
    final_gold_summary.write.format("delta") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Success log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count, None, None, None)],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Failure log
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None, None, None, str(e))],
        schema=log_schema
    )

    log_df = log_df.withColumn("start_time", current_timestamp()) \
                   .withColumn("end_time", current_timestamp())

    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    raise


View Logs

In [0]:

# PIPELINE LOG VIEW 

logs_df = spark.table("agriculture_data.pipeline_logs")
logs_df.orderBy("start_time", ascending=False).show(truncate=False)


+--------------------+------+---------------------------+------------+-------+-------------+--------------------------+--------------------------+-------------+
|pipeline_name       |layer |table_name                 |record_count|status |run_timestamp|start_time                |end_time                  |error_message|
+--------------------+------+---------------------------+------------+-------+-------------+--------------------------+--------------------------+-------------+
|agriculture_pipeline|GOLD  |gold_agriculture_summary   |126000      |SUCCESS|NULL         |2026-01-05 15:28:20.250734|2026-01-05 15:28:20.250734|NULL         |
|agriculture_pipeline|GOLD  |gold_season_market_analysis|45          |SUCCESS|NULL         |2026-01-05 15:28:08.756492|2026-01-05 15:28:08.756492|NULL         |
|agriculture_pipeline|GOLD  |gold_region_soil_yield     |400         |SUCCESS|NULL         |2026-01-05 15:27:58.698306|2026-01-05 15:27:58.698306|NULL         |
|agriculture_pipeline|GOLD  |gold_