In [16]:
import boto3
import json
import time
from dotenv import load_dotenv
from datetime import datetime
import os
from delta.tables import DeltaTable

# Load environment variables from .env file
load_dotenv()

# Initialize AWS credentials from the .env file
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
AWS_REGION = os.getenv('AWS_REGION')
ROUTIFIC_TOKEN = os.getenv('ROUTIFIC_TOKEN')

In [17]:
# Initialize boto3 client for Kinesis with your credentials
kinesis_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION
)

In [18]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("DeltaTableReader") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.1") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Path to the Delta table in S3
delta_table_path = "s3a://orders-for-dispatch/ready_for_dispatch"


In [19]:
# Path to the Delta table in S3
delta_table_path = 's3a://orders-for-dispatch/dispatching'

In [20]:
# Function to read records marked as ready for dispatch
def read_ready_for_dispatch_orders():
    # Load the Delta table
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    
    # Query the Delta table to fetch only the records marked as READY_FOR_DISPATCH
    df = delta_table.toDF().filter("status = 'READY_FOR_DISPATCH'")
    
    # Return the DataFrame
    return df

In [21]:
# Fetch the ready-for-dispatch orders
ready_orders_df = read_ready_for_dispatch_orders()

# Convert the DataFrame to a list of Row objects
ready_orders = ready_orders_df.collect()

24/10/02 04:05:01 WARN DeltaLog: Change in the table id detected while updating snapshot. 
Previous snapshot = Snapshot(path=s3a://orders-for-dispatch/dispatching/_delta_log, version=83, metadata=Metadata(32e99d0b-4860-41b1-b62a-1b1d387224f7,null,null,Format(parquet,Map()),{"type":"struct","fields":[{"name":"order_id","type":"string","nullable":true,"metadata":{}},{"name":"customer_id","type":"string","nullable":true,"metadata":{}},{"name":"total_weight","type":"double","nullable":true,"metadata":{}},{"name":"total_volume","type":"double","nullable":true,"metadata":{}},{"name":"total_price","type":"double","nullable":true,"metadata":{}},{"name":"order_timestamp","type":"timestamp","nullable":true,"metadata":{}},{"name":"status","type":"string","nullable":true,"metadata":{}},{"name":"lat","type":"double","nullable":true,"metadata":{}},{"name":"lon","type":"double","nullable":true,"metadata":{}},{"name":"served_customer_id","type":"string","nullable":true,"metadata":{}},{"name":"updated_

In [22]:
ready_orders_df.show()

[Stage 20:>                                                         (0 + 2) / 2]

+--------------------+--------------------+-----------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|            order_id|         customer_id|     total_weight|      total_volume|       total_price|    order_timestamp|            status|               lat|                lon|
+--------------------+--------------------+-----------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|140680f5-cbf9-405...|cus-ba8d32e6-6f9c...|82.73160870030328| 318.9174240318443|412.70773183574084|2024-10-02 04:01:42|READY_FOR_DISPATCH|  40.5202115180548|-3.8638075430981003|
|9b442566-8a8b-408...|cus-012169ba-74fe...|48.73753836832661|242.98560297383696| 572.0745375784338|2024-10-02 04:04:22|READY_FOR_DISPATCH| 40.41326882171988|-3.7854324815045137|
|6ba61fa1-8590-4e4...|cus-e54fd919-0cd3...|43.53990041187081|139.95677803342312|420.68691285213123|2024-10-02 

                                                                                

In [23]:
import random
import numpy as np

# List of possible priorities
priorities = ["low", "regular", "high"]

# Function to write JSON with visits in the required format
def format_to_visits(data):
    if data is None:
        print("Data is None, cannot format visits.")
        return {}

    visits = {
        item['order_id']: {
            "location": {
                "name": item['customer_id'],  # Routific requires a 'name' field for each location
                "lat": round(item['lat'], 6),  # Latitude from your data
                "lng": round(item['lon'], 7)  # Longitude from your data (changed 'lon' to 'lng' for consistency with Routific API)
            },
            "start": "9:00",  # Default start time (customize if you have specific times)
            "end": "18:00",  # Default end time
            "duration": 5,  # Default duration (customize as needed)
            "load": {
                "weight": int(np.ceil(item['total_weight'])),  # Use the 'total_weight' from the row
                "volume": int(np.ceil(item['total_volume']))  # Use the 'total_volume' from the row
            },
            "priority": random.choice(priorities),  # Assign a random priority for each order
        } for item in data
    }
    return visits



# Step 3: Transform the orders into the Routific format
visits_for_routific = format_to_visits(ready_orders)

# Check the output to verify the visits are correctly formatted
print("Visits formatted for Routific API:")
print(json.dumps(visits_for_routific, indent=4))


Visits formatted for Routific API:
{
    "140680f5-cbf9-4056-922e-2ff935a078f2": {
        "location": {
            "name": "cus-ba8d32e6-6f9c-4ddb-a0a0-bdc936739fbb",
            "lat": 40.520212,
            "lng": -3.8638075
        },
        "start": "9:00",
        "end": "18:00",
        "duration": 5,
        "load": {
            "weight": 83,
            "volume": 319
        },
        "priority": "high"
    },
    "9b442566-8a8b-4081-b6fd-87ff8cb1be2e": {
        "location": {
            "name": "cus-012169ba-74fe-44ad-b7f7-48923011499a",
            "lat": 40.413269,
            "lng": -3.7854325
        },
        "start": "9:00",
        "end": "18:00",
        "duration": 5,
        "load": {
            "weight": 49,
            "volume": 243
        },
        "priority": "low"
    },
    "6ba61fa1-8590-4e43-a3d5-440268d80b52": {
        "location": {
            "name": "cus-e54fd919-0cd3-40e3-801b-3aa4f0015c6a",
            "lat": 40.567357,
            "lng": -3.

In [24]:
import requests

# Define your fleet setup here (this is an example based on previous information)
def build_fleet(depots, num_drivers, shift_start, shift_end, weight, volume, drivers_per_depot):
    fleet = {
        f"driver_{driver_counter}": {
            "start_location": {
                "id": depot["id"],
                "name": depot["name"],
                "lat": depot["lat"],
                "lng": depot["lng"]
            },
            "end_location": {
                "id": depot["id"],
                "name": depot["name"],
                "lat": depot["lat"],
                "lng": depot["lng"]
            },
            "shift_start": shift_start,
            "shift_end": shift_end,
            "min_visits": 1,
            "capacity": {
                "weight": weight,
                "volume": volume
            }
        }
        for depot in depots
        for driver_counter in range(1, num_drivers + 1)
        if driver_counter <= num_drivers and (driver_counter - 1) // drivers_per_depot < len(depots)
    }
    
    return fleet

# Define depots and fleet parameters (based on your previous setup)
depots = [
    {"id": "depot_1", "name": "San Sebastian de los Reyes", "lat": 40.54510, "lng": -3.61184},
    {"id": "depot_2", "name": "Alcorcón", "lat": 40.350370, "lng": -3.855863},
    {"id": "depot_3", "name": "Vallecas", "lat": 40.36977, "lng": -3.59670}
]

# Build the fleet
fleet_for_routific = build_fleet(depots, num_drivers=18, shift_start="9:00", shift_end="18:00", weight=1500, volume=1500, drivers_per_depot=6)

# Combine the visits and fleet into the payload
payload = {
    "visits": visits_for_routific,
    "fleet": fleet_for_routific
}

# Combine with the config options if needed
config = {
   "options": {
       "traffic": "slow",
       "balance": True,
       "shortest_distance": True,
       "polylines": True
   }
}

# Final payload with config
combined_payload = {**payload, **config}

# Make the API request to Routific
routific_url = "https://api.routific.com/v1/vrp-long"
headers = {
    "Authorization": f"bearer {ROUTIFIC_TOKEN}",
    "Content-Type": "application/json"
}

response = requests.post(routific_url, headers=headers, json=combined_payload)

# Check the response and extract the job_id even if the status code is 202
if response.status_code == 200 or response.status_code == 202:
    print("Orders submitted to Routific. Processing in progress.")
    response_data = response.json()
    jobID = response_data.get("job_id")
    print(f"Job ID: {jobID}")
else:
    print(f"Failed to submit the orders. Status code: {response.status_code}")
    print("Response:", response.text)

Orders submitted to Routific. Processing in progress.
Job ID: m1r86ecm522


In [25]:
import urllib3
import json
import time

def check_job_status(URL, jobID, headers, waiting_time):
    http = urllib3.PoolManager()
    URL = f"{URL}/{jobID}"
    job_status = None

    while job_status != 'finished':
        response = http.request('GET', URL, headers=headers)
        solution_data = json.loads(response.data.decode('utf-8'))
        job_status = solution_data.get('status')

        print("Current job status:", job_status)

        if job_status in ['pending', 'processing']:
            time.sleep(waiting_time)
        elif job_status == 'finished':
            print("Job finished.")
            return solution_data
        else:
            print("Unexpected job status:", job_status)
            break

# Example usage
solution_data = check_job_status("https://api.routific.com/jobs", jobID, headers, waiting_time=10)


Current job status: processing
Current job status: finished
Job finished.


In [26]:
# Iterate over the response and print the key-value structure
def print_key_value_structure(data, indent=''):
    for key, value in data.items():
        print(f"{indent}Key: {key} - Value Type: {type(value)}")
        if isinstance(value, dict):
            print_key_value_structure(value, indent + '  ')

In [27]:
# Print structure of the response_data
print_key_value_structure(solution_data)

Key: timing - Value Type: <class 'dict'>
  Key: startedProcessingAt - Value Type: <class 'str'>
  Key: finishedProcessingAt - Value Type: <class 'str'>
Key: fetchedCount - Value Type: <class 'int'>
Key: apiMajorVersion - Value Type: <class 'int'>
Key: apiMinorVersion - Value Type: <class 'int'>
Key: _id - Value Type: <class 'str'>
Key: input - Value Type: <class 'dict'>
  Key: visits - Value Type: <class 'dict'>
    Key: 140680f5-cbf9-4056-922e-2ff935a078f2 - Value Type: <class 'dict'>
      Key: location - Value Type: <class 'dict'>
        Key: name - Value Type: <class 'str'>
        Key: lat - Value Type: <class 'float'>
        Key: lng - Value Type: <class 'float'>
      Key: start - Value Type: <class 'str'>
      Key: end - Value Type: <class 'str'>
      Key: duration - Value Type: <class 'int'>
      Key: load - Value Type: <class 'dict'>
        Key: weight - Value Type: <class 'int'>
        Key: volume - Value Type: <class 'int'>
      Key: priority - Value Type: <class 'i

In [28]:
# Print the output from the response_data
solution_data['output']['solution']

{'driver_1': [{'location_id': 'depot_3',
   'location_name': 'Vallecas',
   'arrival_time': '09:00',
   'distance': 0},
  {'location_id': '140680f5-cbf9-4056-922e-2ff935a078f2',
   'location_name': 'cus-ba8d32e6-6f9c-4ddb-a0a0-bdc936739fbb',
   'arrival_time': '10:07',
   'finish_time': '10:12',
   'distance': 37909.9},
  {'location_id': 'depot_3',
   'location_name': 'Vallecas',
   'arrival_time': '11:20',
   'distance': 39252.5}],
 'driver_10': [{'location_id': 'depot_3',
   'location_name': 'Vallecas',
   'arrival_time': '09:00',
   'distance': 0},
  {'location_id': 'depot_3',
   'location_name': 'Vallecas',
   'arrival_time': '09:00',
   'distance': 0}],
 'driver_11': [{'location_id': 'depot_3',
   'location_name': 'Vallecas',
   'arrival_time': '09:00',
   'distance': 0},
  {'location_id': 'depot_3',
   'location_name': 'Vallecas',
   'arrival_time': '09:00',
   'distance': 0}],
 'driver_12': [{'location_id': 'depot_3',
   'location_name': 'Vallecas',
   'arrival_time': '09:00',
 

In [29]:
from datetime import datetime, timedelta
from pyspark.sql.functions import lit, col

# Extract the served 'cus' IDs and finish times from the solution
served_customers = []
for driver, stops in solution_data['output']['solution'].items():
    for stop in stops:
        # Only process stops with customer IDs (which start with 'cus-')
        if stop['location_name'].startswith('cus-'):
            served_customers.append((stop['location_name'], stop['finish_time']))

# Debugging: Print the served customers
print("Served customers:", served_customers)

# Convert the finish_time into a timestamp and add one day
for index, (cus_id, finish_time) in enumerate(served_customers):
    try:
        # Assuming the finish_time is in 'HH:MM' format, append a date and parse it
        finish_time_obj = datetime.strptime(finish_time, '%H:%M')
        updated_finish_time = datetime.now().replace(hour=finish_time_obj.hour, minute=finish_time_obj.minute) + timedelta(days=1)
        served_customers[index] = (cus_id, updated_finish_time)
    except ValueError as e:
        print(f"Error parsing finish_time '{finish_time}' for cus_id '{cus_id}': {e}")

# Debugging: Print served customers after time adjustment
print("Served customers after time adjustment:", served_customers)

Served customers: [('cus-ba8d32e6-6f9c-4ddb-a0a0-bdc936739fbb', '10:12'), ('cus-bbf84676-a48e-4beb-8758-22a8bbcdbd03', '09:28'), ('cus-012169ba-74fe-44ad-b7f7-48923011499a', '09:50'), ('cus-4b7057d8-c3dd-4449-b4dc-5f4c15a4bdb6', '09:51'), ('cus-e54fd919-0cd3-40e3-801b-3aa4f0015c6a', '09:58'), ('cus-99e1c03c-9e4d-4a38-92b4-34819d8a1909', '10:30'), ('cus-5c6fc123-cf6a-41f8-9ce3-a9d80ddceaa8', '11:37')]
Served customers after time adjustment: [('cus-ba8d32e6-6f9c-4ddb-a0a0-bdc936739fbb', datetime.datetime(2024, 10, 3, 10, 12, 25, 809208)), ('cus-bbf84676-a48e-4beb-8758-22a8bbcdbd03', datetime.datetime(2024, 10, 3, 9, 28, 25, 809229)), ('cus-012169ba-74fe-44ad-b7f7-48923011499a', datetime.datetime(2024, 10, 3, 9, 50, 25, 809239)), ('cus-4b7057d8-c3dd-4449-b4dc-5f4c15a4bdb6', datetime.datetime(2024, 10, 3, 9, 51, 25, 809248)), ('cus-e54fd919-0cd3-40e3-801b-3aa4f0015c6a', datetime.datetime(2024, 10, 3, 9, 58, 25, 809256)), ('cus-99e1c03c-9e4d-4a38-92b4-34819d8a1909', datetime.datetime(2024, 

In [31]:
from pyspark.sql.functions import lit, col
from delta.tables import DeltaTable

# Function to overwrite Delta table with the updated 'DISPATCHED' records
def overwrite_delta_table(served_customers, delta_table_path):
    # Read the Delta table into a DataFrame
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    df = delta_table.toDF()

    # Create a DataFrame from the served customers list
    served_df = spark.createDataFrame(served_customers, ['served_customer_id', 'updated_finish_time'])

    # Rename the column in served_df to avoid ambiguity during the join
    served_df = served_df.withColumnRenamed('updated_finish_time', 'new_finish_time')

    # Update the records that match the served customers
    updated_df = df.join(served_df, df.customer_id == served_df.served_customer_id, 'inner') \
        .withColumn('status', lit('DISPATCHED')) \
        .withColumn('order_timestamp', col('new_finish_time'))  # Use the renamed column

    # Overwrite only the affected rows in the Delta table
    delta_table.alias('old') \
        .merge(updated_df.alias('new'), 'old.customer_id = new.served_customer_id') \
        .whenMatchedUpdate(set={
            "status": col('new.status'),
            "order_timestamp": col('new.new_finish_time')  # Use the renamed column
        }) \
        .execute()

    # Display the updated Delta table without the extra columns
    df_updated = delta_table.toDF().drop('served_customer_id', 'new_finish_time')
    df_updated.filter(df_updated.status.isin('DISPATCHED', 'READY_FOR_DISPATCH')).show(truncate=False)

# Call the function to overwrite the Delta table with the updated records
overwrite_delta_table(served_customers, delta_table_path)


[Stage 79:>                                                         (0 + 1) / 1]

+------------------------------------+----------------------------------------+-----------------+------------------+------------------+--------------------------+------------------+------------------+-------------------+
|order_id                            |customer_id                             |total_weight     |total_volume      |total_price       |order_timestamp           |status            |lat               |lon                |
+------------------------------------+----------------------------------------+-----------------+------------------+------------------+--------------------------+------------------+------------------+-------------------+
|9b442566-8a8b-4081-b6fd-87ff8cb1be2e|cus-012169ba-74fe-44ad-b7f7-48923011499a|48.73753836832661|242.98560297383696|572.0745375784338 |2024-10-03 09:50:25.809239|DISPATCHED        |40.41326882171988 |-3.7854324815045137|
|140680f5-cbf9-4056-922e-2ff935a078f2|cus-ba8d32e6-6f9c-4ddb-a0a0-bdc936739fbb|82.73160870030328|318.9174240318443 |

                                                                                

In [32]:
# Function to show Delta table with both 'DISPATCHED' and 'READY_FOR_DISPATCH' statuses
def show_dispatch_status(limit=None):
    delta_table = DeltaTable.forPath(spark, delta_table_path)
    df = delta_table.toDF()
    
    # Filter for both 'DISPATCHED' and 'READY_FOR_DISPATCH' statuses
    filtered_df = df.filter(df.status.isin('DISPATCHED', 'READY_FOR_DISPATCH'))
    
    # Show all rows (or a limited number if specified)
    if limit is not None:
        filtered_df.show(limit, truncate=False)
    else:
        filtered_df.show(truncate=False)

# Display the updated Delta table with an increased row limit (e.g., 100 rows)
show_dispatch_status(limit=100)  # Adjust limit as needed

[Stage 85:>                                                         (0 + 2) / 2]

+------------------------------------+----------------------------------------+-----------------+------------------+------------------+--------------------------+------------------+------------------+-------------------+
|order_id                            |customer_id                             |total_weight     |total_volume      |total_price       |order_timestamp           |status            |lat               |lon                |
+------------------------------------+----------------------------------------+-----------------+------------------+------------------+--------------------------+------------------+------------------+-------------------+
|9b442566-8a8b-4081-b6fd-87ff8cb1be2e|cus-012169ba-74fe-44ad-b7f7-48923011499a|48.73753836832661|242.98560297383696|572.0745375784338 |2024-10-03 09:50:25.809239|DISPATCHED        |40.41326882171988 |-3.7854324815045137|
|140680f5-cbf9-4056-922e-2ff935a078f2|cus-ba8d32e6-6f9c-4ddb-a0a0-bdc936739fbb|82.73160870030328|318.9174240318443 |

                                                                                