BRONZE LAYER – RAW DATA INGESTION

In [0]:
%sql
-- Create the audit table that the ingestion cells append one row to per run.
-- The name is fully qualified with the `main` catalog because this cell runs
-- before the `USE CATALOG main` cell; with only `agriculture_data.pipeline_logs`
-- the table would be created in whichever catalog the session starts in, while
-- the later appends (issued after USE CATALOG main) target main.agriculture_data.
CREATE TABLE IF NOT EXISTS main.agriculture_data.pipeline_logs (
    pipeline_name STRING,     -- name of the pipeline that produced the row
    layer STRING,             -- layer label written by the run, e.g. 'BRONZE'
    table_name STRING,        -- target table of the run
    status STRING,            -- 'SUCCESS' or 'FAILED'
    record_count LONG,        -- rows counted for the run; NULL on failure
    start_time TIMESTAMP,
    end_time TIMESTAMP,
    error_message STRING      -- exception text on failure; NULL on success
) USING DELTA;


In [0]:
%sql
-- Select the session's current catalog and schema so that unqualified or
-- schema-qualified table names used in later cells (for example
-- agriculture_data.pipeline_logs) resolve inside the `main` catalog.
USE CATALOG main;
USE SCHEMA agriculture_data;


In [0]:
# Make agriculture_data the current schema for unqualified table names
spark.sql("USE agriculture_data")

# Load the raw crop production CSV: the first row supplies the column names
# and Spark infers each column's type from the file contents.
df = spark.read.options(header=True, inferSchema=True).csv(
    "/Volumes/ashritha/default/agriculture/raw/crop_production.csv"
)

Bronze – Logging – crop_production

In [0]:
from datetime import datetime, timezone

from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import (
    StructType, StructField,
    StringType, LongType, TimestampType
)


In [0]:
# Explicit schema for the single-row DataFrames appended to
# agriculture_data.pipeline_logs. Field order matters: the ingestion cells
# build their log rows as positional tuples. Every field is nullable.
log_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("pipeline_name", StringType()),
        ("layer", StringType()),
        ("table_name", StringType()),
        ("status", StringType()),
        ("record_count", LongType()),
        ("start_time", TimestampType()),
        ("end_time", TimestampType()),
        ("error_message", StringType()),
    )
])


In [0]:
# Write the crop production DataFrame (`df`) to its Bronze Delta table and
# append one audit row for the run to agriculture_data.pipeline_logs.
pipeline_name = "agriculture_pipeline"
layer = "BRONZE"
table_name = "bronze_crop_production"

# Capture the start on the driver before any work begins. Filling start_time
# and end_time with current_timestamp() while writing the log row gives both
# columns the same value, so the log never shows when the run actually began.
run_started_at = datetime.now(timezone.utc)

try:
    # Count records processed
    record_count = df.count()

    # Overwrite the target table with the freshly read data
    df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Build the success row; end_time is taken only after the write returned
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count,
          run_started_at, datetime.now(timezone.utc), None)],
        schema=log_schema
    )

    # Append success log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Build the failure row: no record count, the exception text goes into
    # error_message, and end_time marks the moment the failure was handled
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None,
          run_started_at, datetime.now(timezone.utc), str(e))],
        schema=log_schema
    )

    # Append failure log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    # Fail job for Airflow
    raise


In [0]:
# Make agriculture_data the current schema for unqualified table names
spark.sql("USE agriculture_data")

# Load the raw weather CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents.
df = spark.read.options(header=True, inferSchema=True).csv(
    "/Volumes/ashritha/default/agriculture/raw/weather_data.csv"
)



Bronze – Logging – weather

In [0]:
# Write the weather DataFrame (`df`) to its Bronze Delta table and append one
# audit row for the run to agriculture_data.pipeline_logs.
pipeline_name = "agriculture_pipeline"
layer = "BRONZE"
table_name = "bronze_weather"

# Capture the start on the driver before any work begins. Filling start_time
# and end_time with current_timestamp() while writing the log row gives both
# columns the same value, so the log never shows when the run actually began.
run_started_at = datetime.now(timezone.utc)

try:
    # Count records processed
    record_count = df.count()

    # Overwrite the target table with the freshly read data
    df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Build the success row; end_time is taken only after the write returned
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count,
          run_started_at, datetime.now(timezone.utc), None)],
        schema=log_schema
    )

    # Append success log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Build the failure row: no record count, the exception text goes into
    # error_message, and end_time marks the moment the failure was handled
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None,
          run_started_at, datetime.now(timezone.utc), str(e))],
        schema=log_schema
    )

    # Append failure log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    # Fail job for Airflow
    raise


In [0]:
# Make agriculture_data the current schema for unqualified table names
spark.sql("USE agriculture_data")

# Load the raw soil health CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents.
df = spark.read.options(header=True, inferSchema=True).csv(
    "/Volumes/ashritha/default/agriculture/raw/soil_health.csv"
)



Bronze – Logging – soil_health

In [0]:
# Write the soil health DataFrame (`df`) to its Bronze Delta table and append
# one audit row for the run to agriculture_data.pipeline_logs.
pipeline_name = "agriculture_pipeline"
layer = "BRONZE"
table_name = "bronze_soil_health"

# Capture the start on the driver before any work begins. Filling start_time
# and end_time with current_timestamp() while writing the log row gives both
# columns the same value, so the log never shows when the run actually began.
run_started_at = datetime.now(timezone.utc)

try:
    # Count records processed
    record_count = df.count()

    # Overwrite the target table with the freshly read data
    df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Build the success row; end_time is taken only after the write returned
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count,
          run_started_at, datetime.now(timezone.utc), None)],
        schema=log_schema
    )

    # Append success log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Build the failure row: no record count, the exception text goes into
    # error_message, and end_time marks the moment the failure was handled
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None,
          run_started_at, datetime.now(timezone.utc), str(e))],
        schema=log_schema
    )

    # Append failure log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    # Fail job for Airflow
    raise


In [0]:
# Make agriculture_data the current schema for unqualified table names
spark.sql("USE agriculture_data")

# Load the raw market prices CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents.
df = spark.read.options(header=True, inferSchema=True).csv(
    "/Volumes/ashritha/default/agriculture/raw/market_prices.csv"
)



Bronze – Logging – market_prices

In [0]:
# Write the market prices DataFrame (`df`) to its Bronze Delta table and append
# one audit row for the run to agriculture_data.pipeline_logs.
pipeline_name = "agriculture_pipeline"
layer = "BRONZE"
table_name = "bronze_market_prices"

# Capture the start on the driver before any work begins. Filling start_time
# and end_time with current_timestamp() while writing the log row gives both
# columns the same value, so the log never shows when the run actually began.
run_started_at = datetime.now(timezone.utc)

try:
    # Count records processed
    record_count = df.count()

    # Overwrite the target table with the freshly read data
    df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("overwrite") \
        .saveAsTable(table_name)

    # Build the success row; end_time is taken only after the write returned
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "SUCCESS", record_count,
          run_started_at, datetime.now(timezone.utc), None)],
        schema=log_schema
    )

    # Append success log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

except Exception as e:
    # Build the failure row: no record count, the exception text goes into
    # error_message, and end_time marks the moment the failure was handled
    log_df = spark.createDataFrame(
        [(pipeline_name, layer, table_name, "FAILED", None,
          run_started_at, datetime.now(timezone.utc), str(e))],
        schema=log_schema
    )

    # Append failure log
    log_df.write.format("delta") \
        .option("mergeSchema", "true") \
        .mode("append") \
        .saveAsTable("agriculture_data.pipeline_logs")

    # Fail job for Airflow
    raise
