In [11]:
import boto3
import json
import time
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
import os

# Load environment variables from .env file
load_dotenv()

# Initialize AWS credentials from the .env file
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
AWS_REGION = os.getenv('AWS_REGION')

In [12]:
from pyspark.sql import SparkSession

# Path to the Silver Delta table
silver_table_path = "s3a://dispatched-orders/gold/"

# Path to your local JAR files (optional, only necessary if running locally with custom JARs)
local_jars = "/Users/borja/Documents/Somniumrema/projects/de/route_optimizer/jars/aws-java-sdk-kinesis-1.12.364.jar"

# Initialize Spark session with Delta and S3 settings
spark = (SparkSession.builder
    .appName("DeltaLakeAggregation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.session.token", AWS_SESSION_TOKEN)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.sql.files.maxPartitionBytes", "134217728")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "6g")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.debug.maxToStringFields", "100")
    .config("spark.databricks.delta.schema.autoMerge.enabled", "true") 
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate())

# Optional: Adjust logging level
spark.sparkContext.setLogLevel("WARN")

# Read the Delta table from the Silver layer
df_silver = spark.read \
    .format("parquet") \
    .load(silver_table_path)

num_rows = df_silver.count()
print(f"Number of rows: {num_rows}")
# Show the schema and first few records to verify
df_silver.printSchema()
df_silver.show(100)


                                                                                

Number of rows: 0
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- total_weight: double (nullable = true)
 |-- total_volume: double (nullable = true)
 |-- total_price: double (nullable = true)
 |-- order_timestamp: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- depot: string (nullable = true)
 |-- hour: integer (nullable = true)
 |-- batch_id: integer (nullable = true)
 |-- dispatched_timestamp: timestamp (nullable = true)



[Stage 19:>                                                         (0 + 4) / 4]

+--------+-----------+------------+------------+-----------+---------------+------+---+---+-----+----+--------+--------------------+
|order_id|customer_id|total_weight|total_volume|total_price|order_timestamp|status|lat|lon|depot|hour|batch_id|dispatched_timestamp|
+--------+-----------+------------+------------+-----------+---------------+------+---+---+-----+----+--------+--------------------+
+--------+-----------+------------+------------+-----------+---------------+------+---+---+-----+----+--------+--------------------+



                                                                                

In [13]:
# Specify the S3 path for the JSON file
s3_json_path = "s3a://dispatched-orders/to-optimizer/solution_batch_1_2024-10-16-10-56-52.json"

import boto3
import json

# Initialize the boto3 S3 client
s3_client = boto3.client('s3')

# Specify the bucket and the file key (path to the file in the bucket)
bucket_name = 'dispatched-orders'
file_key = 'to-optimizer/solution_batch_1_2024-10-16T08-38-16.325Z.json'

# Read the JSON file from S3
try:
    s3_response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    json_content = s3_response['Body'].read().decode('utf-8')
    
    # Parse the JSON content
    json_data = json.loads(json_content)
    
    # Print the parsed JSON data
    print(json_data)

except Exception as e:
    print(f"Error reading JSON file from S3: {e}")

Error reading JSON file from S3: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.
