In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from dotenv import load_dotenv
import os
import json
import time
import boto3
from pyspark.sql import SparkSession

In [2]:
# Pull the variables defined in a local .env file into the process environment
load_dotenv()

# AWS credentials and region, read from the environment.
# Each value is None when the corresponding variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN')
AWS_REGION = os.environ.get('AWS_REGION')

In [3]:
# Build (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("KinesisToDeltaLake")
    # Delta Lake: SQL extension, catalog, and the jars resolved at start-up
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026")
    # S3A filesystem, authenticated with the credentials loaded from the environment
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.session.token", AWS_SESSION_TOKEN)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    # Tuning: 134217728 bytes = 128 MiB per input partition; 200 shuffle partitions
    .config("spark.sql.files.maxPartitionBytes", "134217728")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

24/09/30 20:16:40 WARN Utils: Your hostname, Somnium.local resolves to a loopback address: 127.0.0.1; using 172.28.59.194 instead (on interface en0)
24/09/30 20:16:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/borja/.ivy2/cache
The jars for the packages stored in: /Users/borja/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3bf47d9f-ca99-4dfd-884b-3fa2e0d19e13;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 201ms :: artifacts dl 8ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 from central in [default]
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-ru

In [4]:
# Kinesis client authenticated with the credentials and region loaded from the environment
kinesis_client = boto3.client(
    service_name='kinesis',
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
)

In [5]:
# Destination of the Delta table on S3
delta_table_path = "s3a://orders-for-dispatch/raw_data"

# Orders pulled from Kinesis are staged in this list until they are written out
order_buffer = []
buffer_limit = 1000  # Write once this many orders have accumulated
flush_interval = 3 * 60  # Seconds (3 minutes) after which the buffer is written even if the limit isn't reached

In [6]:
def _flush_order_buffer():
    """Append every buffered order to the Delta table, then empty the buffer.

    Returns:
        int: Number of records written; 0 when the buffer was empty and
        nothing was written.
    """
    if not order_buffer:
        return 0

    record_count = len(order_buffer)
    df = spark.createDataFrame(order_buffer)

    # Coalesce to at most 10 partitions so a batch is not spread over
    # many small files.
    df.coalesce(10).write.format("delta").mode("append").save(delta_table_path)
    print(f"Wrote {record_count} records to Delta table.")

    # Clear in place: the buffer is a module-level list.
    order_buffer.clear()
    return record_count


def fetch_kinesis_records(stream_name, shard_id):
    """Poll one Kinesis shard and append its orders to the Delta table in batches.

    Records are read starting at the shard's LATEST position, JSON-decoded and
    accumulated in the module-level ``order_buffer``. The buffer is written to
    ``delta_table_path`` when it holds at least ``buffer_limit`` orders or when
    ``flush_interval`` seconds have elapsed since the flush check last fired.

    The call blocks for as long as the shard is open. When Kinesis stops
    returning a ``NextShardIterator`` (the shard has been closed and fully
    read), any orders still buffered are written and the function returns.

    Args:
        stream_name: Name of the Kinesis stream to read.
        shard_id: ID of the shard to read within that stream.
    """
    shard_iterator = kinesis_client.get_shard_iterator(
        StreamName=stream_name,
        ShardId=shard_id,
        ShardIteratorType='LATEST'
    )['ShardIterator']

    last_flush_time = time.time()

    while True:
        # Fetch the next batch of records from Kinesis
        response = kinesis_client.get_records(ShardIterator=shard_iterator, Limit=100)
        records = response['Records']

        if records:
            # Decode the whole batch first so a malformed record cannot leave
            # a partially appended batch in the buffer.
            orders = [json.loads(record['Data']) for record in records]
            order_buffer.extend(orders)

        # Flush when the size limit or the time threshold is reached
        interval_elapsed = (time.time() - last_flush_time) >= flush_interval
        if len(order_buffer) >= buffer_limit or interval_elapsed:
            _flush_order_buffer()
            # Restart the timer even when the buffer was empty; otherwise the
            # first record arriving after an idle period would be flushed
            # immediately as a tiny file instead of being batched.
            last_flush_time = time.time()

        # A closed shard has no next iterator; stop instead of polling with None.
        shard_iterator = response.get('NextShardIterator')
        if shard_iterator is None:
            break

        time.sleep(2)  # Sleep to avoid throttling

    # Do not lose orders that were buffered when the shard ended.
    _flush_order_buffer()

In [7]:
# Example usage: read one shard of the order stream, batching orders before each write
fetch_kinesis_records(
    stream_name='OrderStreamForDispatching',
    shard_id='shardId-000000000000',
)

24/09/30 20:19:48 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/09/30 20:20:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Wrote 5 records to Delta table.


24/09/30 20:23:05 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

Wrote 7 records to Delta table.


                                                                                

Wrote 6 records to Delta table.


24/09/30 20:29:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

Wrote 10 records to Delta table.


                                                                                

In [None]:
import time
from delta.tables import DeltaTable

def optimize_delta_table(table_path=None):
    """Compact the small files of a Delta table.

    Args:
        table_path: Path of the Delta table to compact. When omitted, the
            module-level ``delta_table_path`` is used, i.e. the table the
            ingestion cell appends to. (This function previously hard-coded
            "s3a://orders-for-dispatch/delta_table", which is not the path
            the ingestion cell writes to.)
    """
    if table_path is None:
        table_path = delta_table_path

    delta_table = DeltaTable.forPath(spark, table_path)
    delta_table.optimize().executeCompaction()
    print("Delta table optimized.")

# Compact the table once per hour, indefinitely
while True:
    optimize_delta_table()
    time.sleep(60 * 60)  # 3600 seconds = 1 hour between runs
