In [4]:
import boto3
import json
import time
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
import os

# Bring the variables declared in the .env file into this process.
load_dotenv()

# AWS credentials and region as found in the environment.
# Each value is None when the corresponding variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN')
AWS_REGION = os.environ.get('AWS_REGION')

In [5]:
from pyspark.sql import SparkSession

# Location of the Parquet data read below.
# NOTE: despite the variable name, this path is in the gold layer of the bucket.
silver_table_path = "s3a://dispatched-orders/gold/pBI/data"

# Path to a local JAR file.
# NOTE(review): this value is not passed to the Spark session below (there is no
# "spark.jars" setting) and it is an absolute, machine-specific path — confirm
# whether it is still needed.
local_jars = "/Users/borja/Documents/Somniumrema/projects/de/route_optimizer/jars/aws-java-sdk-kinesis-1.12.364.jar"

# Configure the Spark session with the Delta extensions and S3A settings.
spark_builder = (SparkSession.builder
    .appName("DeltaLakeAggregation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.sql.files.maxPartitionBytes", "134217728")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "6g")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.debug.maxToStringFields", "100")
    .config("spark.databricks.delta.schema.autoMerge.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true"))

# Forward only the credentials that are actually set. An unset environment
# variable arrives here as None, and PySpark stringifies config values, so
# passing it through would configure S3A with the literal text "None" (most
# likely for the session token when long-lived keys are used) and break
# authentication.
s3a_credentials = {
    "spark.hadoop.fs.s3a.access.key": AWS_ACCESS_KEY_ID,
    "spark.hadoop.fs.s3a.secret.key": AWS_SECRET_ACCESS_KEY,
    "spark.hadoop.fs.s3a.session.token": AWS_SESSION_TOKEN,
}
for option_name, option_value in s3a_credentials.items():
    if option_value:
        spark_builder = spark_builder.config(option_name, option_value)

spark = spark_builder.getOrCreate()

# Optional: Adjust logging level
spark.sparkContext.setLogLevel("WARN")

# Read the data at silver_table_path as plain Parquet files.
df_silver = spark.read \
    .format("parquet") \
    .load(silver_table_path)

num_rows = df_silver.count()
print(f"Number of rows: {num_rows}")
# Show the schema and up to 100 records to verify the load
df_silver.printSchema()
df_silver.show(100)


                                                                                

Number of rows: 2
root
 |-- output_distances: struct (nullable = true)
 |    |-- driver_1: double (nullable = true)
 |    |-- driver_10: double (nullable = true)
 |    |-- driver_11: double (nullable = true)
 |    |-- driver_12: double (nullable = true)
 |    |-- driver_13: double (nullable = true)
 |    |-- driver_14: double (nullable = true)
 |    |-- driver_15: double (nullable = true)
 |    |-- driver_16: double (nullable = true)
 |    |-- driver_17: double (nullable = true)
 |    |-- driver_18: double (nullable = true)
 |    |-- driver_2: double (nullable = true)
 |    |-- driver_3: double (nullable = true)
 |    |-- driver_4: double (nullable = true)
 |    |-- driver_5: double (nullable = true)
 |    |-- driver_6: double (nullable = true)
 |    |-- driver_7: double (nullable = true)
 |    |-- driver_8: double (nullable = true)
 |    |-- driver_9: double (nullable = true)
 |-- output_num_late_visits: long (nullable = true)
 |-- output_num_unserved: long (nullable = true)
 |-- outp

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+----------------------+-------------------+--------------------+-------------+-----------------------+---------------------+----------------------+------------------------+-----------------------------+---------------------------+-------------------------+---------------+-----------------------+----------------+--------------------+
|    output_distances|output_num_late_visits|output_num_unserved|     output_solution|output_status|output_total_break_time|output_total_distance|output_total_idle_time|output_total_travel_time|output_total_vehicle_overtime|output_total_visit_lateness|output_total_working_time|output_unserved|output_vehicle_overtime| dispatched_hour|         finished_at|
+--------------------+----------------------+-------------------+--------------------+-------------+-----------------------+---------------------+----------------------+------------------------+-----------------------------+---------------------------+-------------------------+--------

                                                                                

In [6]:
# S3A URI of a batch JSON file.
# NOTE(review): this value is not used by the code below, which reads a
# different object (see file_key) — confirm which one is intended.
s3_json_path = "s3a://dispatched-orders/silver/batching/batch_1_2024-10-16-10-56-52.json"

import boto3
import json

# Initialize the boto3 S3 client
s3_client = boto3.client('s3')

# Specify the bucket and the file key (path to the file in the bucket)
bucket_name = 'dispatched-orders'
file_key = 'to-optimizer/solution_batch_1_2024-10-16T08-38-16.325Z.json'

# Reset the result first so that a failed read cannot leave a value from an
# earlier run of this cell in the kernel; json_data stays None on any failure.
json_data = None

# Read the JSON file from S3
try:
    s3_response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    json_content = s3_response['Body'].read().decode('utf-8')

    # Parse the JSON content
    json_data = json.loads(json_content)

    # Print the parsed JSON data
    print(json_data)

except s3_client.exceptions.NoSuchKey as e:
    # Name the missing object so a wrong bucket/key is obvious from the output.
    print(f"Error reading JSON file from S3: s3://{bucket_name}/{file_key} does not exist: {e}")

except json.JSONDecodeError as e:
    # The object was fetched but its content is not valid JSON.
    print(f"Error parsing JSON from s3://{bucket_name}/{file_key}: {e}")

except Exception as e:
    # Best-effort cell: report any other failure instead of raising.
    print(f"Error reading JSON file from S3: {e}")

Error reading JSON file from S3: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.
