In [1]:
import boto3
import json
import time
from dotenv import load_dotenv
from datetime import datetime
import os
from delta.tables import DeltaTable

# Load environment variables from .env file
load_dotenv()

# Initialize AWS credentials from the .env file
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
AWS_REGION = os.getenv('AWS_REGION')
ROUTIFIC_TOKEN = os.getenv('ROUTIFIC_TOKEN')

In [2]:
# Initialize boto3 client for Kinesis with your credentials
kinesis_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION
)

# Initialize boto3 client for S3 with your credentials
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION 
)

In [3]:
from pyspark.sql import SparkSession

# Path to your local JAR files
local_jars = "/Users/borja/Documents/Somniumrema/projects/de/route_optimizer/jars/aws-java-sdk-kinesis-1.12.364.jar"

# Initialize Spark session with Delta and S3 settings
spark = SparkSession.builder \
    .appName("KinesisToDeltaLake") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.11.1026") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID) \
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.session.token", AWS_SESSION_TOKEN) \
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
    .config("spark.sql.files.maxPartitionBytes", "134217728") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# Optional: Adjust logging level
spark.sparkContext.setLogLevel("WARN")


:: loading settings :: url = jar:file:/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/borja/.ivy2/cache
The jars for the packages stored in: /Users/borja/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0c35e0cb-a5fd-4efb-a536-0ba37a3d0789;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.1026 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 138ms :: artifacts dl 5ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.1026 from central in [default]
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-ru

In [4]:
# Path to the Delta table in S3
delta_table_path = 's3a://orders-for-dispatch/dispatching'

In [5]:
# Function to read records marked as ready for dispatch
def read_ready_for_dispatch_orders():
    # Load the Delta table
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    
    # Query the Delta table to fetch only the records marked as READY_FOR_DISPATCH
    df = delta_table.toDF().filter("status = 'READY_FOR_DISPATCH'")
    
    # Return the DataFrame
    return df

In [6]:
# Fetch the ready-for-dispatch orders
ready_orders_df = read_ready_for_dispatch_orders()

# Convert the DataFrame to a list of Row objects
ready_orders = ready_orders_df.collect()

24/10/05 20:14:26 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/10/05 20:14:33 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [7]:
ready_orders_df.show()

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|            order_id|         customer_id|      total_weight|      total_volume|       total_price|    order_timestamp|            status|               lat|                lon|
+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|8fbd9962-a3e9-4a0...|cus-b698b933-9fe8...| 88.50330913308558|  66.9705801132376| 500.4046597864464|2024-10-05 17:47:35|READY_FOR_DISPATCH| 40.61365288258535| -3.730030544862642|
|20bc833d-39b7-426...|cus-fda63b25-0eda...|2.3907991645592532| 424.3756447440603| 674.6519559982112|2024-10-05 18:18:32|READY_FOR_DISPATCH|  40.6209851640247|-3.7648733948723687|
|af5c9f28-6053-4e1...|cus-b644ae0a-b5c3...|12.973519894566822| 495.3820843189091|395.72821511824606|2024-

                                                                                

In [8]:
import random
import numpy as np

# List of possible priorities
priorities = ["low", "regular", "high"]

# Function to write JSON with visits in the required format
def format_to_visits(data):
    if data is None:
        print("Data is None, cannot format visits.")
        return {}

    visits = {
        item['order_id']: {
            "location": {
                "name": item['customer_id'],  # Routific requires a 'name' field for each location
                "lat": round(item['lat'], 6),  # Latitude from your data
                "lng": round(item['lon'], 7)  # Longitude from your data (changed 'lon' to 'lng' for consistency with Routific API)
            },
            "start": "9:00",  # Default start time (customize if you have specific times)
            "end": "18:00",  # Default end time
            "duration": 5,  # Default duration (customize as needed)
            "load": {
                "weight": int(np.ceil(item['total_weight'])),  # Use the 'total_weight' from the row
                "volume": int(np.ceil(item['total_volume']))  # Use the 'total_volume' from the row
            },
            "priority": random.choice(priorities),  # Assign a random priority for each order
        } for item in data
    }
    return visits



# Step 3: Transform the orders into the Routific format
visits_for_routific = format_to_visits(ready_orders)

# Check the output to verify the visits are correctly formatted
print("Visits formatted for Routific API:")
print(json.dumps(visits_for_routific, indent=4))


Visits formatted for Routific API:
{
    "8fbd9962-a3e9-4a04-8fab-e64fc85e48ac": {
        "location": {
            "name": "cus-b698b933-9fe8-48a9-9f2a-1aa07c1377bf",
            "lat": 40.613653,
            "lng": -3.7300305
        },
        "start": "9:00",
        "end": "18:00",
        "duration": 5,
        "load": {
            "weight": 89,
            "volume": 67
        },
        "priority": "high"
    },
    "20bc833d-39b7-426d-aaff-cfc01eeb2e22": {
        "location": {
            "name": "cus-fda63b25-0eda-4f69-b4b0-6f0fda030416",
            "lat": 40.620985,
            "lng": -3.7648734
        },
        "start": "9:00",
        "end": "18:00",
        "duration": 5,
        "load": {
            "weight": 3,
            "volume": 425
        },
        "priority": "regular"
    },
    "af5c9f28-6053-4e13-9fcb-bff05c76d6c5": {
        "location": {
            "name": "cus-b644ae0a-b5c3-43b0-91d7-6cbcd5f51a1c",
            "lat": 40.494461,
            "lng": -

In [9]:
import requests

# Define your fleet setup here (this is an example based on previous information)
def build_fleet(depots, num_drivers, shift_start, shift_end, weight, volume, drivers_per_depot):
    fleet = {
        f"driver_{driver_counter}": {
            "start_location": {
                "id": depot["id"],
                "name": depot["name"],
                "lat": depot["lat"],
                "lng": depot["lng"]
            },
            "end_location": {
                "id": depot["id"],
                "name": depot["name"],
                "lat": depot["lat"],
                "lng": depot["lng"]
            },
            "shift_start": shift_start,
            "shift_end": shift_end,
            "min_visits": 1,
            "capacity": {
                "weight": weight,
                "volume": volume
            }
        }
        for depot in depots
        for driver_counter in range(1, num_drivers + 1)
        if driver_counter <= num_drivers and (driver_counter - 1) // drivers_per_depot < len(depots)
    }
    
    return fleet

# Define depots and fleet parameters (based on your previous setup)
depots = [
    {"id": "depot_1", "name": "San Sebastian de los Reyes", "lat": 40.54510, "lng": -3.61184},
    {"id": "depot_2", "name": "Alcorcón", "lat": 40.350370, "lng": -3.855863},
    {"id": "depot_3", "name": "Vallecas", "lat": 40.36977, "lng": -3.59670}
]

# Build the fleet
fleet_for_routific = build_fleet(depots, num_drivers=18, shift_start="9:00", shift_end="18:00", weight=1500, volume=1500, drivers_per_depot=6)

# Combine the visits and fleet into the payload
payload = {
    "visits": visits_for_routific,
    "fleet": fleet_for_routific
}

# Combine with the config options if needed
config = {
   "options": {
       "traffic": "slow",
       "balance": True,
       "shortest_distance": True,
       "polylines": True
   }
}

# Final payload with config
combined_payload = {**payload, **config}

# Make the API request to Routific
routific_url = "https://api.routific.com/v1/vrp-long"
headers = {
    "Authorization": f"bearer {ROUTIFIC_TOKEN}",
    "Content-Type": "application/json"
}

response = requests.post(routific_url, headers=headers, json=combined_payload)

# Check the response and extract the job_id even if the status code is 202
if response.status_code == 200 or response.status_code == 202:
    print("Orders submitted to Routific. Processing in progress.")
    response_data = response.json()
    jobID = response_data.get("job_id")
    print(f"Job ID: {jobID}")
else:
    print(f"Failed to submit the orders. Status code: {response.status_code}")
    print("Response:", response.text)

Orders submitted to Routific. Processing in progress.
Job ID: m1wh5wam586


In [10]:
import urllib3
import json
import time

def check_job_status(URL, jobID, headers, waiting_time):
    http = urllib3.PoolManager()
    URL = f"{URL}/{jobID}"
    job_status = None

    while job_status != 'finished':
        response = http.request('GET', URL, headers=headers)
        solution_data = json.loads(response.data.decode('utf-8'))
        job_status = solution_data.get('status')

        print("Current job status:", job_status)

        if job_status in ['pending', 'processing']:
            time.sleep(waiting_time)
        elif job_status == 'finished':
            print("Job finished.")
            return solution_data
        else:
            print("Unexpected job status:", job_status)
            break

# Example usage
solution_data = check_job_status("https://api.routific.com/jobs", jobID, headers, waiting_time=10)


Current job status: processing
Current job status: processing
Current job status: finished
Job finished.


In [11]:
# Iterate over the response and print the key-value structure
def print_key_value_structure(data, indent=''):
    for key, value in data.items():
        print(f"{indent}Key: {key} - Value Type: {type(value)}")
        if isinstance(value, dict):
            print_key_value_structure(value, indent + '  ')

In [12]:
# Print structure of the response_data
print_key_value_structure(solution_data)

Key: timing - Value Type: <class 'dict'>
  Key: startedProcessingAt - Value Type: <class 'str'>
  Key: finishedProcessingAt - Value Type: <class 'str'>
Key: fetchedCount - Value Type: <class 'int'>
Key: apiMajorVersion - Value Type: <class 'int'>
Key: apiMinorVersion - Value Type: <class 'int'>
Key: _id - Value Type: <class 'str'>
Key: input - Value Type: <class 'dict'>
  Key: visits - Value Type: <class 'dict'>
    Key: 8fbd9962-a3e9-4a04-8fab-e64fc85e48ac - Value Type: <class 'dict'>
      Key: location - Value Type: <class 'dict'>
        Key: name - Value Type: <class 'str'>
        Key: lat - Value Type: <class 'float'>
        Key: lng - Value Type: <class 'float'>
      Key: start - Value Type: <class 'str'>
      Key: end - Value Type: <class 'str'>
      Key: duration - Value Type: <class 'int'>
      Key: load - Value Type: <class 'dict'>
        Key: weight - Value Type: <class 'int'>
        Key: volume - Value Type: <class 'int'>
      Key: priority - Value Type: <class 'i

In [13]:
import re
# Extract the finishedProcessingAt timestamp
finished_processing_at = solution_data['timing']['finishedProcessingAt']

# Format the timestamp to make it file-name friendly (remove colons, spaces, etc.)
formatted_timestamp = re.sub(r'[:\s]', '-', finished_processing_at)

In [14]:
# Convert the solution data to a JSON string
json_data = json.dumps(solution_data, indent=4)

# Define your S3 bucket and folder (optional)
bucket_name = 'dispatched-orders'
folder_name = 'optimized-dispatch-raw/'
s3_file_name = f'{folder_name}dispatch_{formatted_timestamp}.json'  # File name in S3

# Upload the JSON string directly to S3
s3_client.put_object(
    Bucket=bucket_name,
    Key=s3_file_name,
    Body=json_data,  # The JSON data as the body
    ContentType='application/json'  # Specify content type as JSON
)

print(f"JSON data uploaded to s3://{bucket_name}/{s3_file_name}")

JSON data uploaded to s3://dispatched-orders/optimized-dispatch-raw/dispatch_2024-10-05T18-15-50.867Z.json


In [15]:
from datetime import datetime, timedelta
from pyspark.sql.functions import lit, col

# Extract the served 'cus' IDs and finish times from the solution
served_customers = []
for driver, stops in solution_data['output']['solution'].items():
    for stop in stops:
        # Only process stops with customer IDs (which start with 'cus-')
        if stop['location_name'].startswith('cus-'):
            served_customers.append((stop['location_name'], stop['finish_time']))

# Debugging: Print the served customers
print("Served customers:", served_customers)

# Convert the finish_time into a timestamp and add one day
for index, (cus_id, finish_time) in enumerate(served_customers):
    try:
        # Assuming the finish_time is in 'HH:MM' format, append a date and parse it
        finish_time_obj = datetime.strptime(finish_time, '%H:%M')
        updated_finish_time = datetime.now().replace(hour=finish_time_obj.hour, minute=finish_time_obj.minute) + timedelta(days=1)
        served_customers[index] = (cus_id, updated_finish_time)
    except ValueError as e:
        print(f"Error parsing finish_time '{finish_time}' for cus_id '{cus_id}': {e}")

# Debugging: Print served customers after time adjustment
print("Served customers after time adjustment:", served_customers)

Served customers: [('cus-e5385b0c-2083-45a1-8df1-67d259af1200', '09:43'), ('cus-7c89b9b5-1ec8-44aa-94b5-abb63c2884cf', '10:31'), ('cus-b75365a4-6a2a-43de-ba32-379dd03fdbd9', '10:58'), ('cus-f19acc08-0662-44e1-a9f0-d1b0082f7873', '11:19'), ('cus-002be47a-ffcc-4543-be77-2ac408397f20', '10:17'), ('cus-6e7a84ec-0d18-4437-a3f6-369bd7f5e02a', '10:37'), ('cus-864b7b6b-a694-40bb-8f4c-e931b5c23f69', '10:42'), ('cus-c2bcb3ae-8118-4e92-8620-2063da2da939', '10:47'), ('cus-b059a6a0-5703-4198-bb2b-cd2a13146de9', '10:52'), ('cus-c6fc39fe-b9a8-47e9-b585-6386b2aa838f', '10:57'), ('cus-ff2ebd43-aede-4842-bcc1-318e020070af', '10:26'), ('cus-fda63b25-0eda-4f69-b4b0-6f0fda030416', '10:40'), ('cus-b3e8e1e1-cb6e-4e9b-81a6-ce131dba3107', '10:47'), ('cus-bc89ad9c-f681-48eb-8484-31c32df64c78', '11:19'), ('cus-db9fe191-34f6-473f-84e7-5641ffa5d06a', '11:30'), ('cus-d45695af-ba01-4d84-aa13-cf3870c5ed6f', '11:41'), ('cus-516f5414-a7e1-474d-8108-f00fbfdfbaec', '11:56'), ('cus-b698b933-9fe8-48a9-9f2a-1aa07c1377bf', '

In [16]:
from pyspark.sql.functions import lit, col
from delta.tables import DeltaTable

# Function to overwrite Delta table with the updated 'DISPATCHED' records
def overwrite_delta_table(served_customers, delta_table_path):
    # Read the Delta table into a DataFrame
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    df = delta_table.toDF()

    # Create a DataFrame from the served customers list
    served_df = spark.createDataFrame(served_customers, ['served_customer_id', 'updated_finish_time'])

    # Rename the column in served_df to avoid ambiguity during the join
    served_df = served_df.withColumnRenamed('updated_finish_time', 'new_finish_time')

    # Update the records that match the served customers
    updated_df = df.join(served_df, df.customer_id == served_df.served_customer_id, 'inner') \
        .withColumn('status', lit('DISPATCHED')) \
        .withColumn('order_timestamp', col('new_finish_time'))  # Use the renamed column

    # Overwrite only the affected rows in the Delta table
    delta_table.alias('old') \
        .merge(updated_df.alias('new'), 'old.customer_id = new.served_customer_id') \
        .whenMatchedUpdate(set={
            "status": col('new.status'),
            "order_timestamp": col('new.new_finish_time')  # Use the renamed column
        }) \
        .execute()

    # Display the updated Delta table without the extra columns
    df_updated = delta_table.toDF().drop('served_customer_id', 'new_finish_time')
    df_updated.filter(df_updated.status.isin('DISPATCHED', 'READY_FOR_DISPATCH')).show(truncate=False)

# Call the function to overwrite the Delta table with the updated records
overwrite_delta_table(served_customers, delta_table_path)


[Stage 45:>                                                         (0 + 1) / 1]

+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+----------+------------------+-------------------+
|order_id                            |customer_id                             |total_weight      |total_volume      |total_price       |order_timestamp           |status    |lat               |lon                |
+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+----------+------------------+-------------------+
|31b36b35-4d42-4675-b7fe-8b3231f1f082|cus-6e7a84ec-0d18-4437-a3f6-369bd7f5e02a|88.62973530489795 |82.52767579258817 |555.8962092436389 |2024-10-06 10:37:03.352505|DISPATCHED|40.59326444616846 |-3.7767650828172705|
|6c26680d-add1-4bb7-8cf4-e4ceaff191a7|cus-b85524d1-cf64-4c30-a11e-d9d3068f8813|45.186168043138274|373.52346464296596|111.74023128136311|2024-10-

                                                                                

In [17]:
# Function to show Delta table with both 'DISPATCHED' and 'READY_FOR_DISPATCH' statuses
def show_dispatch_status(limit=None):
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    df = delta_table.toDF()
    
    # Filter for both 'DISPATCHED' and 'READY_FOR_DISPATCH' statuses
    filtered_df = df.filter(df.status.isin('DISPATCHED', 'READY_FOR_DISPATCH'))
    
    # Show all rows (or a limited number if specified)
    if limit is not None:
        filtered_df.show(limit, truncate=False)
    else:
        filtered_df.show(truncate=False)

# Display the updated Delta table with an increased row limit (e.g., 100 rows)
show_dispatch_status(limit=100)  # Adjust limit as needed

[Stage 48:>                                                         (0 + 1) / 1]

+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+----------+------------------+-------------------+
|order_id                            |customer_id                             |total_weight      |total_volume      |total_price       |order_timestamp           |status    |lat               |lon                |
+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+----------+------------------+-------------------+
|31b36b35-4d42-4675-b7fe-8b3231f1f082|cus-6e7a84ec-0d18-4437-a3f6-369bd7f5e02a|88.62973530489795 |82.52767579258817 |555.8962092436389 |2024-10-06 10:37:03.352505|DISPATCHED|40.59326444616846 |-3.7767650828172705|
|6c26680d-add1-4bb7-8cf4-e4ceaff191a7|cus-b85524d1-cf64-4c30-a11e-d9d3068f8813|45.186168043138274|373.52346464296596|111.74023128136311|2024-10-

                                                                                