In [1]:
import boto3
import json
import time
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
import os

# Pull the variables declared in the .env file into the process environment
load_dotenv()

# AWS credentials and region as found in the environment
# (each value is None when the corresponding variable is not set)
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN')
AWS_REGION = os.environ.get('AWS_REGION')

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, sum as spark_sum
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType

# Initialize Spark session with Delta and S3 settings.
#
# The static options are applied first; the S3A credential options are added
# afterwards, and only for values that are actually present. The credentials
# come from environment lookups that yield None when a variable is unset
# (e.g. long-lived IAM keys have no session token), and a None value is not a
# usable Spark setting -- depending on the PySpark version it is rejected or
# stored as the literal string "None", which then breaks S3 authentication.
# Options that are skipped leave S3A to its default credential lookup.
_spark_builder = (
    SparkSession.builder
    .appName("DeltaLakeAggregation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.sql.files.maxPartitionBytes", "134217728")  # 128 MiB per input partition
)

# Credential option name -> value read from the environment (may be None or empty)
_s3a_credential_options = {
    "spark.hadoop.fs.s3a.access.key": AWS_ACCESS_KEY_ID,
    "spark.hadoop.fs.s3a.secret.key": AWS_SECRET_ACCESS_KEY,
    "spark.hadoop.fs.s3a.session.token": AWS_SESSION_TOKEN,
}
for _option_name, _option_value in _s3a_credential_options.items():
    if _option_value:
        _spark_builder = _spark_builder.config(_option_name, _option_value)

spark = _spark_builder.getOrCreate()

# S3 locations of the Bronze and Silver layer tables
bronze_layer_path = "s3a://orders-for-dispatch/bronze"
silver_layer_path = "s3a://orders-for-dispatch/silver"

# Schema for the order data, built from (column name, Spark type) pairs;
# every column is declared nullable
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("total_weight", DoubleType()),
        ("total_volume", DoubleType()),
        ("total_price", DoubleType()),
        ("order_timestamp", TimestampType()),
        ("status", StringType()),
        ("lat", DoubleType()),
        ("lon", DoubleType()),
    )
])

# Open the Bronze layer Delta table as a streaming source
delta_df = spark.readStream.load(bronze_layer_path, format="delta")

# Sum weight and volume per 2-minute window on order_timestamp,
# using a 1-minute watermark on the same column
aggregated_df = (
    delta_df
    .withWatermark("order_timestamp", "1 minutes")
    .groupBy(window(col("order_timestamp"), "2 minutes"))
    .agg(*[
        spark_sum(measure).alias(measure)
        for measure in ("total_weight", "total_volume")
    ])
)

# Write the aggregated data to Delta table (Silver Layer).
# The checkpoint lives next to the table, at "<silver path>_checkpoint".
query = (
    aggregated_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", silver_layer_path + "_checkpoint")
    .start(silver_layer_path)
)

# Block until the streaming query ends. With no timeout this call only returns
# when the query terminates, so in a notebook the cell normally ends through a
# kernel interrupt or a query failure. Either way, stop the query on the way
# out so it does not keep running in the background of the Spark session while
# later cells execute. Any exception still propagates to the caller unchanged.
try:
    query.awaitTermination()
finally:
    query.stop()

:: loading settings :: url = jar:file:/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/borja/.ivy2/cache
The jars for the packages stored in: /Users/borja/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6124cf4d-e618-499d-8c54-b639ef9e6596;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 145ms :: artifacts dl 6ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 from central in [default]
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-ru

Py4JError: An error occurred while calling o68.awaitTermination

24/10/03 22:33:26 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

In [5]:
# Open the Silver layer Delta table as a batch DataFrame
silver_df = spark.read.load(silver_layer_path, format="delta")

# Print the column structure of the Silver table
silver_df.printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- total_weight: double (nullable = true)
 |-- total_volume: double (nullable = true)



24/10/03 22:34:21 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/10/03 22:34:21 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/10/03 22:34:21 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/10/03 22:34:22 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/10/03 22:34:22 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapsh

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x0000000105aa51b8, pid=15787, tid=74499
#
# JRE version: OpenJDK Runtime Environment Homebrew (11.0.24) (build 11.0.24+0)
# Java VM: OpenJDK 64-Bit Server VM Homebrew (11.0.24+0, mixed mode, tiered, compressed oops, g1 gc, bsd-aarch64)
# Problematic frame:
# V  [libjvm.dylib+0x6951b8]  ObjectSynchronizer::inflate(Thread*, oopDesc*, ObjectSynchronizer::InflateCause)+0x18c
#
# No core dump will be written. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again
#
# An error report file with more information is saved as:
# /Users/borja/Documents/Somniumrema/projects/de/route_optimizer/notebooks/hs_err_pid15787.log
#
# If you would like to submit a bug report, please visit:
#   https://github.com/Homebrew/homebrew-core/issues
#


In [4]:
# Open the Bronze layer Delta table as a batch DataFrame
bronze_df = spark.read.load(bronze_layer_path, format="delta")

# Display the first rows of the Bronze table
bronze_df.show()

[Stage 19:>                                                         (0 + 2) / 2]

+--------------------+--------------------+------------------+------------------+------------------+-------------------+--------+------------------+-------------------+
|            order_id|         customer_id|      total_weight|      total_volume|       total_price|    order_timestamp|  status|               lat|                lon|
+--------------------+--------------------+------------------+------------------+------------------+-------------------+--------+------------------+-------------------+
|acc44049-1228-4e0...|cus-e06c4570-708b...|  84.8057622351721| 457.2678184361901| 194.0919306526209|2024-10-03 22:28:26|RECEIVED|  40.4026748118209|-3.6332734601525303|
|fa6ffa76-841c-4b3...|cus-58becfec-041e...|16.211079521079657|3.1408651537984213| 723.2348576127821|2024-10-03 22:27:13|RECEIVED|40.581382986628434|-3.7808893707357485|
|be2ccb58-9052-4da...|cus-1de19272-4a43...| 89.11164342799184|161.61516249760615| 550.3912570569299|2024-10-03 22:29:50|RECEIVED| 40.37931765715566|-3.5904

                                                                                