In [3]:
import boto3
import json
import time
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
import os

# Populate the process environment from the .env file
load_dotenv()

# AWS settings used by later cells; each one is None when its variable is not set
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, AWS_REGION = (
    os.environ.get(env_name)
    for env_name in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
        "AWS_REGION",
    )
)

In [4]:
from pyspark.sql import SparkSession

# Path to the Silver Delta table
silver_table_path = "s3a://orders-for-dispatch/silver/"

# Path to your local JAR files (optional, only necessary if running locally with custom JARs)
# NOTE(review): this is a machine-specific absolute path and nothing in this cell
# hands it to Spark (e.g. via spark.jars) — confirm whether a later cell uses it.
local_jars = "/Users/borja/Documents/Somniumrema/projects/de/route_optimizer/jars/aws-java-sdk-kinesis-1.12.364.jar"

# Static S3A credentials taken from the environment. Each value is None when the
# corresponding variable is missing (the session token is absent for long-lived keys).
s3a_credentials = {
    "spark.hadoop.fs.s3a.access.key": AWS_ACCESS_KEY_ID,
    "spark.hadoop.fs.s3a.secret.key": AWS_SECRET_ACCESS_KEY,
    "spark.hadoop.fs.s3a.session.token": AWS_SESSION_TOKEN,
}

# Initialize Spark session with Delta and S3 settings
spark_builder = (SparkSession.builder
    .appName("DeltaLakeAggregation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.sql.files.maxPartitionBytes", "134217728")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "6g")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.debug.maxToStringFields", "100")
    .config("spark.databricks.delta.schema.autoMerge.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true"))

# Only forward credentials that are actually set. Passing None through .config()
# appears to end up as the literal string "None" in the Spark conf, which then
# surfaces as an opaque S3 authentication failure; leaving the key unset lets
# S3A fall back to its own credential lookup instead.
for conf_key, conf_value in s3a_credentials.items():
    if conf_value:
        spark_builder = spark_builder.config(conf_key, conf_value)

spark = spark_builder.getOrCreate()

# Optional: Adjust logging level
spark.sparkContext.setLogLevel("WARN")

# Read the Delta table from the Silver layer.
# Use the Delta reader rather than plain Parquet: the Delta transaction log decides
# which data files belong to the current table version, whereas a raw Parquet scan
# of the directory can also pick up files that were overwritten or deleted but not
# yet vacuumed.
# NOTE(review): assumes the path really holds a Delta table, as the comment on
# silver_table_path states; if it is plain Parquet this load will raise and
# "parquet" should be restored.
df_silver = (
    spark.read
    .format("delta")
    .load(silver_table_path)
)

# Show the schema and first few records to verify
df_silver.printSchema()
df_silver.show(10)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- total_weight: double (nullable = true)
 |-- total_volume: double (nullable = true)
 |-- total_price: double (nullable = true)
 |-- order_timestamp: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- order_hour: string (nullable = true)



[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+-------------+
|            order_id|         customer_id|      total_weight|      total_volume|       total_price|    order_timestamp|            status|               lat|                lon|   order_hour|
+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+-------------+
|ddf388df-0ba7-466...|cus-7936923a-9497...|53.704967191021616|  158.020036789502|  391.369766741626|2024-10-05 00:05:27|READY_FOR_DELIVERY| 40.57537902504314|-3.8109392727805593|2024-10-05-00|
|00097dc9-c9d9-470...|cus-eb2b10f9-0386...| 46.85114562865591|31.315665829003166|372.29208926344467|2024-10-05 00:05:28|READY_FOR_DELIVERY| 40.61009126019276| -3.651427946689784|2024-10-05-00|
|33d73aa7-2983-4be...|cus-f67fa0f8-

                                                                                