In [1]:
import boto3
import json
import time
from dotenv import load_dotenv
from datetime import datetime
import os
from delta.tables import DeltaTable

# Load environment variables from .env file
load_dotenv()

# Initialize AWS credentials from the .env file
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
AWS_REGION = os.getenv('AWS_REGION')
ROUTIFIC_TOKEN = os.getenv('ROUTIFIC_TOKEN')

In [2]:
# Initialize boto3 client for Kinesis with your credentials
kinesis_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION
)

# Initialize boto3 client for S3 with your credentials
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION 
)

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("DeltaTableReader") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.1") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Path to the Delta table in S3
delta_table_path = "s3a://orders-for-dispatch/ready_for_dispatch"


:: loading settings :: url = jar:file:/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/borja/.ivy2/cache
The jars for the packages stored in: /Users/borja/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1e5aa28a-9f69-4686-8ed2-df7469bccfc0;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 137ms :: artifacts dl 5ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	org.apache.hadoop#h

In [4]:
# Path to the Delta table in S3
delta_table_path = 's3a://orders-for-dispatch/dispatching'

In [5]:
# Function to read records marked as ready for dispatch
def read_ready_for_dispatch_orders():
    # Load the Delta table
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    
    # Query the Delta table to fetch only the records marked as READY_FOR_DISPATCH
    df = delta_table.toDF().filter("status = 'READY_FOR_DISPATCH'")
    
    # Return the DataFrame
    return df

In [6]:
# Fetch the ready-for-dispatch orders
ready_orders_df = read_ready_for_dispatch_orders()

# Convert the DataFrame to a list of Row objects
ready_orders = ready_orders_df.collect()

24/10/03 03:30:30 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/10/03 03:30:38 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [7]:
ready_orders_df.show()

[Stage 11:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|            order_id|         customer_id|      total_weight|      total_volume|       total_price|    order_timestamp|            status|               lat|                lon|
+--------------------+--------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|bfc9983d-23f7-416...|cus-f1595d2d-7f14...|48.517479450213614|151.80363378642684| 576.5663611345955|2024-10-03 03:28:52|READY_FOR_DISPATCH| 40.58429278033965|-3.6723275932801482|
|6f0b12bc-7dfc-443...|cus-c7255b5b-f67f...| 30.93436584680427| 361.3511529789953|109.69888627248581|2024-10-03 03:28:59|READY_FOR_DISPATCH| 40.37587721528003|-3.8458455782855085|
|a2b89b86-13fb-4bf...|cus-6bf87077-5e08...| 51.91328065838457|454.09330436935704|503.29033208358646|2024-

                                                                                

In [8]:
import random
import numpy as np

# List of possible priorities
priorities = ["low", "regular", "high"]

# Function to write JSON with visits in the required format
def format_to_visits(data):
    if data is None:
        print("Data is None, cannot format visits.")
        return {}

    visits = {
        item['order_id']: {
            "location": {
                "name": item['customer_id'],  # Routific requires a 'name' field for each location
                "lat": round(item['lat'], 6),  # Latitude from your data
                "lng": round(item['lon'], 7)  # Longitude from your data (changed 'lon' to 'lng' for consistency with Routific API)
            },
            "start": "9:00",  # Default start time (customize if you have specific times)
            "end": "18:00",  # Default end time
            "duration": 5,  # Default duration (customize as needed)
            "load": {
                "weight": int(np.ceil(item['total_weight'])),  # Use the 'total_weight' from the row
                "volume": int(np.ceil(item['total_volume']))  # Use the 'total_volume' from the row
            },
            "priority": random.choice(priorities),  # Assign a random priority for each order
        } for item in data
    }
    return visits



# Step 3: Transform the orders into the Routific format
visits_for_routific = format_to_visits(ready_orders)

# Check the output to verify the visits are correctly formatted
print("Visits formatted for Routific API:")
print(json.dumps(visits_for_routific, indent=4))


Visits formatted for Routific API:
{
    "bfc9983d-23f7-4163-8c03-98cc7a041555": {
        "location": {
            "name": "cus-f1595d2d-7f14-455e-95f8-3cfa96d040e4",
            "lat": 40.584293,
            "lng": -3.6723276
        },
        "start": "9:00",
        "end": "18:00",
        "duration": 5,
        "load": {
            "weight": 49,
            "volume": 152
        },
        "priority": "low"
    },
    "6f0b12bc-7dfc-4432-8b19-25a1d545ee26": {
        "location": {
            "name": "cus-c7255b5b-f67f-4f66-b5dc-1d340ab2cbeb",
            "lat": 40.375877,
            "lng": -3.8458456
        },
        "start": "9:00",
        "end": "18:00",
        "duration": 5,
        "load": {
            "weight": 31,
            "volume": 362
        },
        "priority": "low"
    },
    "a2b89b86-13fb-4bf7-831b-72326bafabe3": {
        "location": {
            "name": "cus-6bf87077-5e08-4fd1-838f-f3006e2f58dc",
            "lat": 40.325046,
            "lng": -3.6

In [9]:
import requests

# Define your fleet setup here (this is an example based on previous information)
def build_fleet(depots, num_drivers, shift_start, shift_end, weight, volume, drivers_per_depot):
    fleet = {
        f"driver_{driver_counter}": {
            "start_location": {
                "id": depot["id"],
                "name": depot["name"],
                "lat": depot["lat"],
                "lng": depot["lng"]
            },
            "end_location": {
                "id": depot["id"],
                "name": depot["name"],
                "lat": depot["lat"],
                "lng": depot["lng"]
            },
            "shift_start": shift_start,
            "shift_end": shift_end,
            "min_visits": 1,
            "capacity": {
                "weight": weight,
                "volume": volume
            }
        }
        for depot in depots
        for driver_counter in range(1, num_drivers + 1)
        if driver_counter <= num_drivers and (driver_counter - 1) // drivers_per_depot < len(depots)
    }
    
    return fleet

# Define depots and fleet parameters (based on your previous setup)
depots = [
    {"id": "depot_1", "name": "San Sebastian de los Reyes", "lat": 40.54510, "lng": -3.61184},
    {"id": "depot_2", "name": "Alcorcón", "lat": 40.350370, "lng": -3.855863},
    {"id": "depot_3", "name": "Vallecas", "lat": 40.36977, "lng": -3.59670}
]

# Build the fleet
fleet_for_routific = build_fleet(depots, num_drivers=18, shift_start="9:00", shift_end="18:00", weight=1500, volume=1500, drivers_per_depot=6)

# Combine the visits and fleet into the payload
payload = {
    "visits": visits_for_routific,
    "fleet": fleet_for_routific
}

# Combine with the config options if needed
config = {
   "options": {
       "traffic": "slow",
       "balance": True,
       "shortest_distance": True,
       "polylines": True
   }
}

# Final payload with config
combined_payload = {**payload, **config}

# Make the API request to Routific
routific_url = "https://api.routific.com/v1/vrp-long"
headers = {
    "Authorization": f"bearer {ROUTIFIC_TOKEN}",
    "Content-Type": "application/json"
}

response = requests.post(routific_url, headers=headers, json=combined_payload)

# Check the response and extract the job_id even if the status code is 202
if response.status_code == 200 or response.status_code == 202:
    print("Orders submitted to Routific. Processing in progress.")
    response_data = response.json()
    jobID = response_data.get("job_id")
    print(f"Job ID: {jobID}")
else:
    print(f"Failed to submit the orders. Status code: {response.status_code}")
    print("Response:", response.text)

Orders submitted to Routific. Processing in progress.
Job ID: m1smebqq71


In [10]:
import urllib3
import json
import time

def check_job_status(URL, jobID, headers, waiting_time):
    http = urllib3.PoolManager()
    URL = f"{URL}/{jobID}"
    job_status = None

    while job_status != 'finished':
        response = http.request('GET', URL, headers=headers)
        solution_data = json.loads(response.data.decode('utf-8'))
        job_status = solution_data.get('status')

        print("Current job status:", job_status)

        if job_status in ['pending', 'processing']:
            time.sleep(waiting_time)
        elif job_status == 'finished':
            print("Job finished.")
            return solution_data
        else:
            print("Unexpected job status:", job_status)
            break

# Example usage
solution_data = check_job_status("https://api.routific.com/jobs", jobID, headers, waiting_time=10)


Current job status: processing
Current job status: finished
Job finished.


In [11]:
# Iterate over the response and print the key-value structure
def print_key_value_structure(data, indent=''):
    for key, value in data.items():
        print(f"{indent}Key: {key} - Value Type: {type(value)}")
        if isinstance(value, dict):
            print_key_value_structure(value, indent + '  ')

In [12]:
# Print structure of the response_data
print_key_value_structure(solution_data)

Key: timing - Value Type: <class 'dict'>
  Key: startedProcessingAt - Value Type: <class 'str'>
  Key: finishedProcessingAt - Value Type: <class 'str'>
Key: fetchedCount - Value Type: <class 'int'>
Key: apiMajorVersion - Value Type: <class 'int'>
Key: apiMinorVersion - Value Type: <class 'int'>
Key: _id - Value Type: <class 'str'>
Key: input - Value Type: <class 'dict'>
  Key: visits - Value Type: <class 'dict'>
    Key: bfc9983d-23f7-4163-8c03-98cc7a041555 - Value Type: <class 'dict'>
      Key: location - Value Type: <class 'dict'>
        Key: name - Value Type: <class 'str'>
        Key: lat - Value Type: <class 'float'>
        Key: lng - Value Type: <class 'float'>
      Key: start - Value Type: <class 'str'>
      Key: end - Value Type: <class 'str'>
      Key: duration - Value Type: <class 'int'>
      Key: load - Value Type: <class 'dict'>
        Key: weight - Value Type: <class 'int'>
        Key: volume - Value Type: <class 'int'>
      Key: priority - Value Type: <class 'i

In [13]:
import re
# Extract the finishedProcessingAt timestamp
finished_processing_at = solution_data['timing']['finishedProcessingAt']

# Format the timestamp to make it file-name friendly (remove colons, spaces, etc.)
formatted_timestamp = re.sub(r'[:\s]', '-', finished_processing_at)

In [14]:
# Convert the solution data to a JSON string
json_data = json.dumps(solution_data, indent=4)

# Define your S3 bucket and folder (optional)
bucket_name = 'dispatched-orders'
folder_name = 'optimized-dispatch-raw/'
s3_file_name = f'{folder_name}dispatch_{formatted_timestamp}.json'  # File name in S3

# Upload the JSON string directly to S3
s3_client.put_object(
    Bucket=bucket_name,
    Key=s3_file_name,
    Body=json_data,  # The JSON data as the body
    ContentType='application/json'  # Specify content type as JSON
)

print(f"JSON data uploaded to s3://{bucket_name}/{s3_file_name}")

JSON data uploaded to s3://dispatched-orders/optimized-dispatch-raw/dispatch_2024-10-03T01-31-09.146Z.json


In [15]:
from datetime import datetime, timedelta
from pyspark.sql.functions import lit, col

# Extract the served 'cus' IDs and finish times from the solution
served_customers = []
for driver, stops in solution_data['output']['solution'].items():
    for stop in stops:
        # Only process stops with customer IDs (which start with 'cus-')
        if stop['location_name'].startswith('cus-'):
            served_customers.append((stop['location_name'], stop['finish_time']))

# Debugging: Print the served customers
print("Served customers:", served_customers)

# Convert the finish_time into a timestamp and add one day
for index, (cus_id, finish_time) in enumerate(served_customers):
    try:
        # Assuming the finish_time is in 'HH:MM' format, append a date and parse it
        finish_time_obj = datetime.strptime(finish_time, '%H:%M')
        updated_finish_time = datetime.now().replace(hour=finish_time_obj.hour, minute=finish_time_obj.minute) + timedelta(days=1)
        served_customers[index] = (cus_id, updated_finish_time)
    except ValueError as e:
        print(f"Error parsing finish_time '{finish_time}' for cus_id '{cus_id}': {e}")

# Debugging: Print served customers after time adjustment
print("Served customers after time adjustment:", served_customers)

Served customers: [('cus-e614c9ff-ef42-4d5d-97ca-752c6edb75f8', '09:39'), ('cus-f1595d2d-7f14-455e-95f8-3cfa96d040e4', '09:59'), ('cus-0c77cf12-ac06-43b9-8a79-31afc67d7de1', '11:03'), ('cus-ecd091c4-91ee-4449-b9d2-7b38c0ed44d9', '12:00'), ('cus-a939ebd7-17ee-4f03-9d66-0143e6df5161', '12:11'), ('cus-99826dbf-a9fe-431b-95e7-eb7e01c9e2b2', '12:17'), ('cus-d352a54f-081c-474f-9385-4cacfeb28550', '09:29'), ('cus-1c2638df-a1c1-4e5e-89a4-ea6191fa09fb', '09:40'), ('cus-9ef25f15-402d-4544-bb89-cc34a8b6caad', '09:21'), ('cus-30e3b389-4422-45f2-9cd2-88786e803733', '10:11'), ('cus-607eff8c-4ad4-4a2e-8abe-1b72d8be173f', '09:34'), ('cus-c85c717a-78fd-4c44-b151-14387d4b8d74', '09:35'), ('cus-2a0cddf4-cfc0-45f8-8502-abe13dee4284', '09:43'), ('cus-2bdcc414-58ea-4f84-8e3a-e830a68dfdeb', '09:36'), ('cus-ca693a19-80da-40a1-b8d3-377404f372d6', '09:56'), ('cus-c7255b5b-f67f-4f66-b5dc-1d340ab2cbeb', '10:09'), ('cus-6bf87077-5e08-4fd1-838f-f3006e2f58dc', '09:28'), ('cus-2f527c49-a8e6-4d82-b936-2d6a4e1a752f', '

In [16]:
from pyspark.sql.functions import lit, col
from delta.tables import DeltaTable

# Function to overwrite Delta table with the updated 'DISPATCHED' records
def overwrite_delta_table(served_customers, delta_table_path):
    # Read the Delta table into a DataFrame
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    df = delta_table.toDF()

    # Create a DataFrame from the served customers list
    served_df = spark.createDataFrame(served_customers, ['served_customer_id', 'updated_finish_time'])

    # Rename the column in served_df to avoid ambiguity during the join
    served_df = served_df.withColumnRenamed('updated_finish_time', 'new_finish_time')

    # Update the records that match the served customers
    updated_df = df.join(served_df, df.customer_id == served_df.served_customer_id, 'inner') \
        .withColumn('status', lit('DISPATCHED')) \
        .withColumn('order_timestamp', col('new_finish_time'))  # Use the renamed column

    # Overwrite only the affected rows in the Delta table
    delta_table.alias('old') \
        .merge(updated_df.alias('new'), 'old.customer_id = new.served_customer_id') \
        .whenMatchedUpdate(set={
            "status": col('new.status'),
            "order_timestamp": col('new.new_finish_time')  # Use the renamed column
        }) \
        .execute()

    # Display the updated Delta table without the extra columns
    df_updated = delta_table.toDF().drop('served_customer_id', 'new_finish_time')
    df_updated.filter(df_updated.status.isin('DISPATCHED', 'READY_FOR_DISPATCH')).show(truncate=False)

# Call the function to overwrite the Delta table with the updated records
overwrite_delta_table(served_customers, delta_table_path)


[Stage 47:>                                                         (0 + 1) / 1]

+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+----------+------------------+-------------------+
|order_id                            |customer_id                             |total_weight      |total_volume      |total_price       |order_timestamp           |status    |lat               |lon                |
+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+----------+------------------+-------------------+
|cf988e65-f487-446d-a219-3042b52217a7|cus-44773914-37e8-46e8-82ad-0f6d41a2d029|71.37787881137281 |214.71052995262997|422.21060559378554|2024-10-04 10:20:58.954948|DISPATCHED|40.479906969050305|-3.8472546692052676|
|066b1265-81fe-485a-beb8-36f9d204f224|cus-419e4282-c3d1-4c11-aec2-d905bdf3cfb5|69.80258274003555 |109.12980584587555|562.1809353803667 |2024-10-

                                                                                

In [17]:
# Function to show Delta table with both 'DISPATCHED' and 'READY_FOR_DISPATCH' statuses
def show_dispatch_status(limit=None):
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    df = delta_table.toDF()
    
    # Filter for both 'DISPATCHED' and 'READY_FOR_DISPATCH' statuses
    filtered_df = df.filter(df.status.isin('DISPATCHED', 'READY_FOR_DISPATCH'))
    
    # Show all rows (or a limited number if specified)
    if limit is not None:
        filtered_df.show(limit, truncate=False)
    else:
        filtered_df.show(truncate=False)

# Display the updated Delta table with an increased row limit (e.g., 100 rows)
show_dispatch_status(limit=100)  # Adjust limit as needed

[Stage 55:>                                                         (0 + 1) / 1]

+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+------------------+------------------+-------------------+
|order_id                            |customer_id                             |total_weight      |total_volume      |total_price       |order_timestamp           |status            |lat               |lon                |
+------------------------------------+----------------------------------------+------------------+------------------+------------------+--------------------------+------------------+------------------+-------------------+
|cf988e65-f487-446d-a219-3042b52217a7|cus-44773914-37e8-46e8-82ad-0f6d41a2d029|71.37787881137281 |214.71052995262997|422.21060559378554|2024-10-04 10:20:58.954948|DISPATCHED        |40.479906969050305|-3.8472546692052676|
|066b1265-81fe-485a-beb8-36f9d204f224|cus-419e4282-c3d1-4c11-aec2-d905bdf3cfb5|69.80258274003555 |109.1298058458

                                                                                