In [3]:
import boto3
import json
import time
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
from delta.tables import DeltaTable
import os

# Pull the variables defined in the local .env file into the process environment
load_dotenv()

# AWS credentials and region, read from the environment.
# Each value is None when the corresponding variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN')
AWS_REGION = os.environ.get('AWS_REGION')

# Location of the Kinesis SDK JAR on the local machine.
# NOTE(review): this absolute, user-specific path is not referenced by any other
# visible cell -- confirm whether it is still needed before relying on it.
local_jars = "/Users/borja/Documents/Somniumrema/projects/de/route_optimizer/jars/aws-java-sdk-kinesis-1.12.364.jar"

# Initialize Spark session with Delta and S3 settings.
# The builder is assembled first and the S3A credentials are attached afterwards,
# so that only credentials which are actually set get forwarded (see below).
# NOTE(review): the hadoop-aws / aws-java-sdk-bundle versions should match the
# Hadoop build shipped with the installed Spark -- confirm these coordinates.
spark_builder = (SparkSession.builder
    .appName("DeltaLakeAggregation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.sql.files.maxPartitionBytes", "134217728")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.debug.maxToStringFields", "100")
    .config("spark.databricks.delta.schema.autoMerge.enabled", "true"))  # Enable schema auto-merge

# Spark config values are stringified, so an unset variable (None) would be
# handed to S3A as the literal text "None". That matters most for the session
# token, which is legitimately absent when long-lived keys are used. Skipping
# unset values lets S3A fall back to its default credential provider chain.
for s3a_option, s3a_value in (
    ("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID),
    ("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY),
    ("spark.hadoop.fs.s3a.session.token", AWS_SESSION_TOKEN),
):
    if s3a_value:
        spark_builder = spark_builder.config(s3a_option, s3a_value)

# getOrCreate() returns an already-active session when one exists in this kernel.
spark = spark_builder.getOrCreate()

# Optional: Adjust logging level
spark.sparkContext.setLogLevel("WARN")

# Destination of the bronze-layer Delta table
bronze_layer_path = "s3a://orders-for-dispatch/bronze/"

# Schema applied to incoming order records; every column is nullable.
# order_timestamp is declared as a string for the initial load and is parsed
# into a timestamp later, when each batch is written.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("total_weight", DoubleType()),
        ("total_volume", DoubleType()),
        ("total_price", DoubleType()),
        ("order_timestamp", StringType()),
        ("status", StringType()),
        ("lat", DoubleType()),
        ("lon", DoubleType()),
    ]
])

# Kinesis client used by the polling helpers below. It is built with the
# credentials and region read from the environment at the top of this cell.
kinesis_client = boto3.client(
    'kinesis',
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
)

def get_shard_iterator(stream_name, shard_id, iterator_type='LATEST'):
    """Request a shard iterator for one shard of a Kinesis stream.

    Args:
        stream_name: Name of the Kinesis stream.
        shard_id: ID of the shard to read from.
        iterator_type: Value passed as ``ShardIteratorType``. Defaults to
            ``'LATEST'`` (only records added after the iterator is created);
            ``'TRIM_HORIZON'`` starts from the oldest record still retained.
            Types that need extra arguments (sequence number or timestamp)
            are not supported by this helper.

    Returns:
        The ``ShardIterator`` string from the service response.
    """
    response = kinesis_client.get_shard_iterator(
        StreamName=stream_name,
        ShardId=shard_id,
        ShardIteratorType=iterator_type,
    )
    return response['ShardIterator']

def _append_orders_to_delta(orders, compact=True):
    """Append parsed order dicts to the bronze Delta table, optionally compacting it."""
    df = spark.createDataFrame(orders, schema=schema)
    # order_timestamp arrives as a string; parse it into a real timestamp column.
    df = df.withColumn("order_timestamp", to_timestamp(col("order_timestamp"), "yyyy-MM-dd HH:mm:ss"))

    # Coalesce the DataFrame to reduce the number of partitions
    df = df.coalesce(10)  # Adjust the number of partitions as needed

    df.write.format("delta").mode("append").save(bronze_layer_path)
    print(f"Saved {len(orders)} orders to Delta table.")

    if compact:
        # Optimize the Delta table to compact small files
        delta_table = DeltaTable.forPath(spark, bronze_layer_path)
        delta_table.optimize().executeCompaction()
        print("Optimized the Delta table.")


# Function to read Kinesis records and write them to Delta table
def read_and_write_kinesis_to_delta(stream_name, shard_id, batch_size=25,
                                    poll_interval=2, optimize_every=1):
    """Poll one Kinesis shard and append its records to the bronze Delta table.

    Records are JSON-decoded and accumulated; once at least ``batch_size``
    orders are pending they are written as a single append. The loop runs until
    the service stops returning a next shard iterator (closed shard), at which
    point any remaining partial batch is written so no polled record is dropped.
    On an open shard the function does not return on its own.

    Args:
        stream_name: Name of the Kinesis stream.
        shard_id: ID of the shard to read.
        batch_size: Minimum number of accumulated orders that triggers a write.
        poll_interval: Seconds to sleep between polls (to avoid rate limits).
        optimize_every: Run Delta compaction after every N-th written batch.
            The default of 1 compacts after each write; 0 or None disables
            compaction entirely. Compacting after every small batch is
            expensive -- raise this for long-running ingestion.

    Raises:
        Whatever boto3, ``json.loads`` or Spark raise; nothing is caught here.
    """
    shard_iterator = get_shard_iterator(stream_name, shard_id)
    orders_batch = []
    batches_written = 0

    # A missing/None iterator means the shard is closed and has no more data.
    while shard_iterator:
        # Fetch records from Kinesis using shard_iterator
        response = kinesis_client.get_records(ShardIterator=shard_iterator)
        records = response['Records']

        # Parse and accumulate orders
        if records:
            orders = [json.loads(record['Data']) for record in records]
            orders_batch.extend(orders)

            # Write to Delta table when batch size is reached
            if len(orders_batch) >= batch_size:
                batches_written += 1
                compact_now = bool(optimize_every) and batches_written % optimize_every == 0
                _append_orders_to_delta(orders_batch, compact=compact_now)

                # Start a fresh batch (rebind rather than clear in place)
                orders_batch = []

        # Update shard iterator for the next batch
        shard_iterator = response.get('NextShardIterator')
        if shard_iterator:
            time.sleep(poll_interval)  # Sleep to avoid rate limits

    # The shard is exhausted: flush whatever did not fill a whole batch.
    if orders_batch:
        _append_orders_to_delta(orders_batch, compact=bool(optimize_every))



ConnectionRefusedError: [Errno 61] Connection refused

In [2]:
# Source stream and the single shard to consume
stream_name = "OrderStreamForDispatching"  # Replace with your stream name
shard_id = "shardId-000000000000"  # Get the shard ID from the stream's details

# Kick off the Kinesis -> Delta ingestion loop for that shard
read_and_write_kinesis_to_delta(stream_name=stream_name, shard_id=shard_id)

24/10/05 00:18:00 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 26 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 26 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 26 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 26 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 26 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 26 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


                                                                                

Saved 25 orders to Delta table.


                                                                                

Optimized the Delta table.


ERROR:root:Exception while sending command.                                     
Traceback (most recent call last):
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkErro

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 53306)
Traceback (most recent call last):
  File "/Users/borja/.pyenv/versions/3.11.8/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/borja/.pyenv/versions/3.11.8/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/Users/borja/.pyenv/versions/3.11.8/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/borja/.pyenv/versions/3.11.8/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/rout

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x00000001082a51b8, pid=27886, tid=122167
#
# JRE version: OpenJDK Runtime Environment Homebrew (11.0.24) (build 11.0.24+0)
# Java VM: OpenJDK 64-Bit Server VM Homebrew (11.0.24+0, mixed mode, tiered, compressed oops, g1 gc, bsd-aarch64)
# Problematic frame:
# V  [libjvm.dylib+0x6951b8]  ObjectSynchronizer::inflate(Thread*, oopDesc*, ObjectSynchronizer::InflateCause)+0x18c
#
# No core dump will be written. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again
#
# An error report file with more information is saved as:
# /Users/borja/Documents/Somniumrema/projects/de/route_optimizer/notebooks/hs_err_pid27886.log
#
# If you would like to submit a bug report, please visit:
#   https://github.com/Homebrew/homebrew-core/issues
#


Py4JError: An error occurred while calling o776.save

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
