In [7]:
import boto3
import json
import time
from dotenv import load_dotenv
from datetime import datetime
import os
from delta.tables import DeltaTable

# Load environment variables from .env file
load_dotenv()

# Initialize AWS credentials from the .env file
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
AWS_REGION = os.getenv('AWS_REGION')
ROUTIFIC_TOKEN = os.getenv('ROUTIFIC_TOKEN')

In [8]:
# Initialize boto3 client for Kinesis with your credentials
kinesis_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION
)

# Initialize boto3 client for S3 with your credentials
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION 
)

In [3]:
from pyspark.sql import SparkSession

# Path to your local JAR files
local_jars = "/Users/borja/Documents/Somniumrema/projects/de/route_optimizer/jars/aws-java-sdk-kinesis-1.12.364.jar"

# Initialize Spark session with Delta and S3 settings
spark = SparkSession.builder \
    .appName("KinesisToDeltaLake") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID) \
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.session.token", AWS_SESSION_TOKEN) \
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
    .config("spark.sql.files.maxPartitionBytes", "134217728") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# Optional: Adjust logging level
spark.sparkContext.setLogLevel("WARN")

:: loading settings :: url = jar:file:/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/borja/.ivy2/cache
The jars for the packages stored in: /Users/borja/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-abf9f713-f6dd-4d5f-88a3-35006721978d;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 142ms :: artifacts dl 6ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 from central in [default]
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-ru

In [4]:
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException

def check_delta_table_exists(spark, path):
    try:
        delta_table = DeltaTable.forPath(spark, path)
        print(f"Delta table exists at path: {path}")
        return True
    except AnalysisException as e:
        if "is not a Delta table" in str(e):
            print(f"No Delta table found at path: {path}")
            return False
        else:
            raise e

# Path to the Delta table in S3
delta_table_path = "s3a://orders-for-dispatch/silver"

# Check if the Delta table exists
exists = check_delta_table_exists(spark, delta_table_path)

24/10/06 22:35:40 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


Delta table exists at path: s3a://orders-for-dispatch/silver


                                                                                

In [5]:


# Import necessary libraries
from pyspark.sql import SparkSession
from delta.tables import DeltaTable



# Path to the Delta table in S3
delta_table_path = "s3a://orders-for-dispatch/silver"

# Read the Delta table
delta_table = DeltaTable.forPath(spark, delta_table_path)

# Convert Delta table to DataFrame
df = delta_table.toDF()

# Show the contents of the Delta table
df.show()

# Stop the Spark session
spark.stop()

24/10/06 22:35:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|            order_id|         customer_id|      total_weight|      total_volume|       total_price|    order_timestamp|            status|               lat|                lon|
+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|53db6ed4-90b8-4e9...|cus-96f56ac2-0c0f...| 50.73262649393302|190.44385165145806| 980.7997692409006|2024-10-06 21:41:21|READY_FOR_DISPATCH|40.549209890297696|-3.7227035885231077|
|9de5440f-e96b-4ea...|cus-e9477353-df77...| 76.42521001573878|392.15686868126295|   948.53615868996|2024-10-06 21:40:57|READY_FOR_DISPATCH| 40.59354358838209|-3.7961876466288467|
|75dc2792-2f2b-4b9...|cus-570786aa-85dc...|23.404358420138454|256.69045554496216| 581.7456700201304|2024-

In [9]:
import boto3
import json

# Initialize a session using Amazon S3
s3 = boto3.client('s3')

# Define the bucket name and the object key (file path)
bucket_name = 'ready-for-dispatch'
object_key = 'gold-optimization/dispatch_2024-10-06T19-59-03.117Z.json'

# Download the JSON file from S3
response = s3.get_object(Bucket=bucket_name, Key=object_key)
json_content = response['Body'].read().decode('utf-8')

# Parse the JSON content
data = json.loads(json_content)

# Print the parsed data
print(data)

NoSuchBucket: An error occurred (NoSuchBucket) when calling the GetObject operation: The specified bucket does not exist