In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, LongType, TimestampType
from dotenv import load_dotenv
import os
import json
import time
import boto3
from pyspark.sql import SparkSession
from datetime import datetime 

In [2]:
# Load environment variables from .env file
load_dotenv()

# Initialize AWS credentials from the .env file
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
AWS_REGION = os.getenv('AWS_REGION')

In [3]:
#spark.stop()

In [4]:
from pyspark.sql import SparkSession

# Path to your local JAR files
local_jars = "/Users/borja/Documents/Somniumrema/projects/de/route_optimizer/jars/aws-java-sdk-kinesis-1.12.364.jar"

# Initialize Spark session with Delta and S3 settings
spark = SparkSession.builder \
    .appName("KinesisToDeltaLake") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID) \
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.session.token", AWS_SESSION_TOKEN) \
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
    .config("spark.sql.files.maxPartitionBytes", "134217728") \
    .getOrCreate()

# Optional: Adjust logging level
spark.sparkContext.setLogLevel("WARN")

:: loading settings :: url = jar:file:/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/borja/.ivy2/cache
The jars for the packages stored in: /Users/borja/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7f5eb7e1-b885-4843-9ef8-a45e0f5de892;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 541ms :: artifacts dl 18ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 from central in [default]
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-r

In [5]:
# Initialize boto3 client for Kinesis with your credentials
kinesis_client = boto3.client(
    'kinesis',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION
)

In [6]:
# Define the schema
schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("total_weight", DoubleType(), True),
    StructField("total_volume", DoubleType(), True),
    StructField("total_price", DoubleType(), True),
    StructField("order_timestamp", TimestampType(), True),
    StructField("status", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("lon", DoubleType(), True)
])

In [7]:
# Function to convert the timestamps to proper datetime objects
def convert_timestamps(orders):
    for order in orders:
        if isinstance(order['order_timestamp'], str):
            # Only convert if the timestamp is a string
            order['order_timestamp'] = datetime.strptime(order['order_timestamp'], '%Y-%m-%d %H:%M:%S')
    return orders

# Function to convert the location field from MapType to StructType
def transform_location(order):
    if isinstance(order['location'], dict):
        location_data = order['location']
        order['location'] = {
            "address": location_data['address'],
            "lat": float(location_data['lat']),
            "lon": float(location_data['lon'])
        }
    return order

# Function to get the shard iterator
def get_shard_iterator(stream_name, shard_id):
    response = kinesis_client.get_shard_iterator(
        StreamName=stream_name,
        ShardId=shard_id,
        ShardIteratorType='LATEST'  # or 'LATEST' for new records
    )
    return response['ShardIterator']

# Function to read records from the Kinesis stream
def read_kinesis_records(stream_name, shard_iterator):
    while True:
        response = kinesis_client.get_records(ShardIterator=shard_iterator, Limit=100)
        records = response['Records']
        for record in records:
            # No need to base64 decode, just parse the data directly
            order_data = record['Data']
            order = json.loads(order_data)
            print("Received order:", order)

        # Update the shard iterator for the next batch of records
        shard_iterator = response['NextShardIterator']

        # Sleep to avoid hitting API rate limits
        time.sleep(1)

In [8]:
import time
import json

# Initialize buffer, limits, and thresholds
order_buffer = []
weight_threshold = 15  # Example weight threshold (can be adjusted)
volume_threshold = 500  # Example volume threshold (can be adjusted)
time_threshold = 60 * 30 # Example time threshold (e.g., 30 minutes)
buffer_limit = 1000       # Optional, max number of orders in the buffer
last_flush_time = time.time()

# S3 path for Delta table
delta_table_path = "s3a://orders-for-dispatch/dispatching"

# Dispatcher function to read Kinesis records, accumulate, and process them
def dispatcher(shard_iterator):
    global order_buffer, last_flush_time

    while True:
        # Fetch records from Kinesis using shard_iterator
        response = kinesis_client.get_records(ShardIterator=shard_iterator)
        records = response['Records']

        # Parse and accumulate orders
        for record in records:
            order_data = json.loads(record['Data'])
            order_buffer.append(order_data)
            print(f"Accumulated order: {order_data}")

        # Convert timestamps if necessary
        order_buffer = convert_timestamps(order_buffer)

        # Calculate accumulated weight and volume
        total_weight = sum(order['total_weight'] for order in order_buffer)
        total_volume = sum(order['total_volume'] for order in order_buffer)
        time_elapsed = time.time() - last_flush_time

        # Check if any thresholds are met (weight, volume, or time)
        if total_weight >= weight_threshold or total_volume >= volume_threshold or time_elapsed >= time_threshold:
            print(f"Threshold met: Dispatching {len(order_buffer)} orders.")
            
            # Update the status of all orders to 'DISPATCH_READY'
            for order in order_buffer:
                order['status'] = 'READY_FOR_DISPATCH'

            # Convert buffer to DataFrame and write to Delta table
            df = spark.createDataFrame(order_buffer, schema=schema)
            df.write.format("delta").mode("append").save(delta_table_path)
            print(f"Saved {len(order_buffer)} orders to Delta table.")
            
            # Clear the buffer and reset flush time
            order_buffer.clear()
            last_flush_time = time.time()

        # Update shard iterator for the next batch
        shard_iterator = response['NextShardIterator']
        time.sleep(2)  # Sleep to avoid rate limits

# Get the shard iterator
stream_name = 'OrderStreamForDispatching'  # Replace with your stream name
shard_id = 'shardId-000000000000'  # Get the shard ID from the stream's details
shard_iterator = get_shard_iterator(stream_name, shard_id)

# Start the dispatcher
dispatcher(shard_iterator)


Accumulated order: {'order_id': '140680f5-cbf9-4056-922e-2ff935a078f2', 'customer_id': 'cus-ba8d32e6-6f9c-4ddb-a0a0-bdc936739fbb', 'total_weight': 82.73160870030328, 'total_volume': 318.9174240318443, 'total_price': 412.70773183574084, 'order_timestamp': '2024-10-02 04:01:42', 'status': 'RECEIVED', 'lat': 40.5202115180548, 'lon': -3.8638075430981003}
Threshold met: Dispatching 1 orders.


24/10/02 04:01:47 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/10/02 04:02:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': 'b26faf91-8581-47c5-8a7a-bcab8ef664a9', 'customer_id': 'cus-bbf84676-a48e-4beb-8758-22a8bbcdbd03', 'total_weight': 38.75207275621183, 'total_volume': 270.4332401055019, 'total_price': 36.17281916219022, 'order_timestamp': '2024-10-02 04:02:29', 'status': 'RECEIVED', 'lat': 40.37381234111349, 'lon': -3.6619045311504714}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': 'ae7715ce-7714-4d2e-8beb-6e4e69e3ca72', 'customer_id': 'cus-99e1c03c-9e4d-4a38-92b4-34819d8a1909', 'total_weight': 48.12950760579068, 'total_volume': 460.40128508892, 'total_price': 928.9953000140409, 'order_timestamp': '2024-10-02 04:03:00', 'status': 'RECEIVED', 'lat': 40.55352027399454, 'lon': -3.846850240184429}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': '6ba61fa1-8590-4e43-a3d5-440268d80b52', 'customer_id': 'cus-e54fd919-0cd3-40e3-801b-3aa4f0015c6a', 'total_weight': 43.53990041187081, 'total_volume': 139.95677803342312, 'total_price': 420.68691285213123, 'order_timestamp': '2024-10-02 04:03:43', 'status': 'RECEIVED', 'lat': 40.56735698592657, 'lon': -3.7339000542917575}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': '9b442566-8a8b-4081-b6fd-87ff8cb1be2e', 'customer_id': 'cus-012169ba-74fe-44ad-b7f7-48923011499a', 'total_weight': 48.73753836832661, 'total_volume': 242.98560297383696, 'total_price': 572.0745375784338, 'order_timestamp': '2024-10-02 04:04:22', 'status': 'RECEIVED', 'lat': 40.41326882171988, 'lon': -3.7854324815045137}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': '64bd7522-0a7f-44e0-a79f-450a554fb25b', 'customer_id': 'cus-5c6fc123-cf6a-41f8-9ce3-a9d80ddceaa8', 'total_weight': 90.67317748547849, 'total_volume': 375.40339945429565, 'total_price': 291.02050389950824, 'order_timestamp': '2024-10-02 04:04:45', 'status': 'RECEIVED', 'lat': 40.64060844221572, 'lon': -3.7070802941843075}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': '0562e23b-c8c5-41d8-9312-e1e7afa0898d', 'customer_id': 'cus-4b7057d8-c3dd-4449-b4dc-5f4c15a4bdb6', 'total_weight': 46.74087426850831, 'total_volume': 392.79371443806525, 'total_price': 312.49896608574, 'order_timestamp': '2024-10-02 04:04:56', 'status': 'RECEIVED', 'lat': 40.570276017311635, 'lon': -3.632292221828427}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': '8d6adae1-e68d-4e59-aeef-5eb0c1c6227f', 'customer_id': 'cus-49f84bcc-69f8-4be7-94ef-17a0a2d80f2c', 'total_weight': 92.60351507581612, 'total_volume': 242.4883615375743, 'total_price': 880.1691661195565, 'order_timestamp': '2024-10-02 04:05:29', 'status': 'RECEIVED', 'lat': 40.58116490703484, 'lon': -3.7770963729789564}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': '4b940677-dd12-4e2c-9613-0fe1a7aa1760', 'customer_id': 'cus-1a53dfa5-de42-4e91-aa67-7dfd3e07e22d', 'total_weight': 52.89003629329177, 'total_volume': 46.821241954089196, 'total_price': 879.0783040278777, 'order_timestamp': '2024-10-02 04:06:11', 'status': 'RECEIVED', 'lat': 40.529752244413984, 'lon': -3.7012124227966563}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.
Accumulated order: {'order_id': '20a04e22-5f52-4acd-b582-ac9bf35574a6', 'customer_id': 'cus-ce1c21be-f175-4b09-b23e-416dcd687ae5', 'total_weight': 97.50866167890928, 'total_volume': 91.49363948872059, 'total_price': 943.7143782517655, 'order_timestamp': '2024-10-02 04:06:41', 'status': 'RECEIVED', 'lat': 40.59678718920532, 'lon': -3.777971327149063}
Threshold met: Dispatching 1 orders.


                                                                                

Saved 1 orders to Delta table.


KeyboardInterrupt: 