In [1]:
# Load the SQL extension
%load_ext sql

In [2]:
import boto3
import boto3
import time
import json
import time
from dotenv import load_dotenv
from datetime import datetime
import os
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, date_add

# Load environment variables from .env file
load_dotenv()

# Initialize AWS credentials from the .env file
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
AWS_REGION = os.getenv('AWS_REGION')
ROUTIFIC_TOKEN = os.getenv('ROUTIFIC_TOKEN')

In [3]:
# Initialize boto3 client for S3 with your credentials
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION 
)

athena_client = boto3.client(
    'athena',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
    region_name=AWS_REGION 
)

In [4]:
spark = SparkSession.builder \
    .appName("Convert JSON to Parquet") \
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID) \
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.session.token", AWS_SESSION_TOKEN) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1") \
    .config("spark.ui.port", "4050") \
    .getOrCreate()


:: loading settings :: url = jar:file:/Users/borja/Library/Caches/pypoetry/virtualenvs/route-optimizer-AqO2e-Ud-py3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/borja/.ivy2/cache
The jars for the packages stored in: /Users/borja/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b71f57de-3f7b-42a7-a8da-caedd4cbd87e;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 109ms :: artifacts dl 3ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------

In [5]:
# Adjust the path to your S3 bucket
input_path = "s3a://dispatched-orders/optimized-dispatch-raw/"

# Define the input and output paths in S3
output_path = "s3a://dispatched-orders/optimized-dispatch-bronze/"

from pyspark.sql.functions import to_date, date_add, date_format

# Read the JSON file with multiLine option
df = (spark
          .read
          .option("multiLine", "true")
          .json(input_path)
     )

# Convert 'finished_at' to 'finished_date' with year, month, day, hour, and minute format
df_transformed = (df.withColumn("dispatched_date", date_add(to_date(df["finished_at"]), 1))  # Add 1 day for 'dispatched_date'
                     .write
                     .mode('overwrite')
                     .partitionBy("dispatched_date")
                     .parquet(output_path)
                 )


24/10/03 03:32:32 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/10/03 03:32:37 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [6]:
# Define input and output paths
bronze_parquet_path = "s3a://dispatched-orders/optimized-dispatch-bronze/"
silver_parquet_path = "s3a://dispatched-orders/optimized-dispatched-silver/"


# Read the input Parquet file from S3 and add necessary transformations
silver_df = (spark.read.parquet(bronze_parquet_path)
              .select("_id", "finished_at", "fleet", "id", "input", "output", "region", "status", "visits")
              .withColumn("finished_at_clean", date_format(col("finished_at"), "yyyy-MM-dd_HH-mm-ss"))
              .withColumn("dispatched_date", to_date(col("finished_at")))  # Partition by dispatched_date
)

# Write the transformed data to the silver path partitioned by dispatched_date
(silver_df
    .write
    .mode('overwrite')
    .partitionBy("dispatched_date")
    .parquet(silver_parquet_path)
)


                                                                                

In [7]:
routes_file_path = "s3a://dispatched-orders/optimized-dispatched-gold/routes/"

# Extract distinct 'finished_at_clean' values for graph_routes creation
unique_finished_at_values = silver_df.select("finished_at_clean").distinct().collect()

# Loop over each unique 'finished_at_clean' and extract 'polylines' for each
for row in unique_finished_at_values:
    finished_at_value = row["finished_at_clean"]
    
    # Set the output path with 'finished_at_clean' as the unique identifier
    parquet_file_path = f"{routes_file_path}/graph_route_{finished_at_value}.parquet"

    # Filter the DataFrame for the current 'finished_at_clean' and select 'polylines'
    graph_routes_df = silver_df.filter(col("finished_at_clean") == finished_at_value).select(
        col("output.polylines").alias("polylines")
    )
    
    # Write the polylines to a Parquet file for this specific 'finished_at_clean'
    graph_routes_df.write.mode('overwrite').parquet(parquet_file_path)

                                                                                

Grafos de recorridos codificados Streamlit (no PowerBI)

In [8]:
pBI_parquet_path = "s3a://dispatched-orders/optimized-dispatched-gold/pBI/"

# Load the Silver table from S3
silver_df = spark.read.parquet(silver_parquet_path)

# Drop 'pl_precision' and 'polylines' from the 'output' struct
clean_gold_df = (silver_df
                 .withColumn("output", 
                             col("output").dropFields("pl_precision", "polylines"))  # Dropping the fields
                 .select("output", "dispatched_date", "finished_at")  # Select only the required columns
                )

# Write the cleaned data to S3, partitioned by 'finished_at'
clean_gold_df.write.mode('overwrite').partitionBy("finished_at").parquet(pBI_parquet_path)

                                                                                

In [9]:
# Define the S3 path for Athena to store the query result
tables_location = "s3://dispatched-orders/optimized-dispatched-gold/pBI/tables/"  # Ensure this is the correct format

# Define the S3 location where the Parquet files are stored
pBI_file_path = "s3://dispatched-orders/optimized-dispatched-gold/pBI/"

In [10]:
# Define the query to create the database
create_database_query = """
CREATE DATABASE IF NOT EXISTS optimization_db
"""

# Execute the query to create the database
response = athena_client.start_query_execution(
    QueryString=create_database_query,
    ResultConfiguration={
        'OutputLocation': tables_location  # Output location for query results
    }
)

# Wait for the query execution to complete
query_execution_id = response['QueryExecutionId']
status = 'RUNNING'

# Wait for query completion
while status == 'RUNNING':
    response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    status = response['QueryExecution']['Status']['State']
    print(f"Query status: {status}")
    if status == 'FAILED' or status == 'CANCELLED':
        raise Exception(f"Query {query_execution_id} failed or was cancelled")
    time.sleep(2)

print(f"Query {query_execution_id} succeeded!")

Query status: RUNNING
Query status: SUCCEEDED
Query d2378982-cd68-4bfe-9be5-5cfccc85f26c succeeded!


In [11]:
# SQL query to create the table in Athena
create_table_query = """
CREATE EXTERNAL TABLE IF NOT EXISTS optimization_db.dispatched_orders (
  output STRUCT<
    distances: STRUCT<
      driver_1: DOUBLE,
      driver_2: DOUBLE,
      driver_3: DOUBLE,
      driver_4: DOUBLE,
      driver_5: DOUBLE,
      driver_6: DOUBLE,
      driver_7: DOUBLE,
      driver_8: DOUBLE,
      driver_9: DOUBLE,
      driver_10: DOUBLE,
      driver_11: DOUBLE,
      driver_12: DOUBLE,
      driver_13: DOUBLE,
      driver_14: DOUBLE,
      driver_15: DOUBLE,
      driver_16: DOUBLE,
      driver_17: DOUBLE,
      driver_18: DOUBLE
    >,
    num_late_visits: BIGINT,
    num_unserved: BIGINT,
    solution: STRUCT<
      driver_1: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_2: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_3: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_4: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_5: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_6: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_7: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_8: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_9: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_10: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_11: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_12: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_13: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_14: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_15: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_16: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_17: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>,
      driver_18: ARRAY<STRUCT<
        arrival_time: STRING,
        distance: DOUBLE,
        finish_time: STRING,
        location_id: STRING,
        location_name: STRING
      >>
    >,
    status: STRING,
    total_break_time: BIGINT,
    total_distance: DOUBLE,
    total_idle_time: BIGINT,
    total_travel_time: BIGINT,
    total_vehicle_overtime: BIGINT,
    total_visit_lateness: BIGINT,
    total_working_time: BIGINT,
    unserved: STRING,
    vehicle_overtime: STRUCT<
      driver_1: BIGINT,
      driver_2: BIGINT,
      driver_3: BIGINT,
      driver_4: BIGINT,
      driver_5: BIGINT,
      driver_6: BIGINT,
      driver_7: BIGINT,
      driver_8: BIGINT,
      driver_9: BIGINT,
      driver_10: BIGINT,
      driver_11: BIGINT,
      driver_12: BIGINT,
      driver_13: BIGINT,
      driver_14: BIGINT,
      driver_15: BIGINT,
      driver_16: BIGINT,
      driver_17: BIGINT,
      driver_18: BIGINT
    >
  >,
  dispatched_date DATE
)
PARTITIONED BY (finished_at STRING)
STORED AS PARQUET
LOCATION 's3://dispatched-orders/optimized-dispatched-gold/pBI/'
TBLPROPERTIES ('parquet.compress'='SNAPPY');
"""



In [12]:
# Python code to run MSCK REPAIR TABLE
repair_table_query = "MSCK REPAIR TABLE optimization_db.dispatched_orders;"

# Execute the Athena query to create the table
response = athena_client.start_query_execution(
    QueryString=create_table_query,
    QueryExecutionContext={
        'Database': 'optimization_db'
    },
    ResultConfiguration={
        'OutputLocation': tables_location
    }
)

# Wait for the query to complete
query_execution_id = response['QueryExecutionId']
status = 'RUNNING'
while status == 'RUNNING':
    response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    status = response['QueryExecution']['Status']['State']
    print(f"Query status: {status}")
    if status == 'FAILED' or status == 'CANCELLED':
        raise Exception(f"Query {query_execution_id} failed or was cancelled")
    time.sleep(2)

print(f"Query {query_execution_id} succeeded!")

Query status: RUNNING
Query status: SUCCEEDED
Query afc64e91-657c-4028-9f74-aecfe60826f4 succeeded!


In [13]:
# Run MSCK REPAIR TABLE to load partitions
repair_table_query = "MSCK REPAIR TABLE optimization_db.dispatched_orders;"

response = athena_client.start_query_execution(
    QueryString=repair_table_query,
    QueryExecutionContext={
        'Database': 'optimization_db'
    },
    ResultConfiguration={
        'OutputLocation': tables_location
    }
)

# Wait for the query execution to complete
query_execution_id = response['QueryExecutionId']
status = 'RUNNING'

while status in ['RUNNING', 'QUEUED']:
    response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    status = response['QueryExecution']['Status']['State']
    if status in ['FAILED', 'CANCELLED']:
        reason = response['QueryExecution']['Status']['StateChangeReason']
        raise Exception(f"MSCK REPAIR TABLE query {query_execution_id} failed or was cancelled. Reason: {reason}")
    print(f"MSCK REPAIR TABLE status: {status}")
    time.sleep(2)

print(f"MSCK REPAIR TABLE {query_execution_id} succeeded!")


MSCK REPAIR TABLE status: RUNNING
MSCK REPAIR TABLE status: RUNNING
MSCK REPAIR TABLE status: SUCCEEDED
MSCK REPAIR TABLE 94ff3e90-eaac-41d0-a2d4-16cd31320e91 succeeded!


In [14]:
# Define the SELECT query
select_query = "SELECT * FROM optimization_db.dispatched_orders LIMIT 10;"

# Execute the query
response = athena_client.start_query_execution(
    QueryString=select_query,
    QueryExecutionContext={
        'Database': 'optimization_db'
    },
    ResultConfiguration={
        'OutputLocation': tables_location
    }
)

# Wait for the query execution to complete
query_execution_id = response['QueryExecutionId']
status = 'RUNNING'

while status in ['RUNNING', 'QUEUED']:
    response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    status = response['QueryExecution']['Status']['State']
    if status in ['FAILED', 'CANCELLED']:
        reason = response['QueryExecution']['Status']['StateChangeReason']
        raise Exception(f"SELECT query {query_execution_id} failed or was cancelled. Reason: {reason}")
    print(f"SELECT query status: {status}")
    time.sleep(2)

print(f"SELECT query {query_execution_id} succeeded!")

# Fetch the results
results_paginator = athena_client.get_paginator('get_query_results')
results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

for results_page in results_iterator:
    for row in results_page['ResultSet']['Rows']:
        print([col.get('VarCharValue', '') for col in row['Data']])


SELECT query status: QUEUED
SELECT query status: SUCCEEDED
SELECT query 710e665c-1373-461d-8db3-95582c20f348 succeeded!
['output', 'dispatched_date', 'finished_at']
['{distances={driver_1=41.7215, driver_2=36.4822, driver_3=68.449, driver_4=69.1019, driver_5=56.189899999999, driver_6=30.2027, driver_7=39.497099999999, driver_8=59.415800000000004, driver_9=10.655100000000001, driver_10=75.7179, driver_11=81.8713, driver_12=49.2805, driver_13=107.7957, driver_14=80.4745, driver_15=52.9017, driver_16=30.6843, driver_17=105.0769, driver_18=75.85839999999999}, num_late_visits=0, num_unserved=0, solution={driver_1=[{arrival_time=09:00, distance=0.0, finish_time=null, location_id=depot_3, location_name=Vallecas}, {arrival_time=09:36, distance=18676.0, finish_time=09:41, location_id=04da7395-06f5-41fd-956e-36012baa5e1d, location_name=cus-abfc2e74-4bb3-45a0-9161-adc591737c0d}, {arrival_time=09:50, distance=1944.1, finish_time=09:55, location_id=655a5164-a06e-4e6c-8435-8099103aef47, location_nam

In [15]:
# Define the first query to create dispatched_orders_flat view
create_dispatched_orders_flat_view = """
CREATE OR REPLACE VIEW optimization_db.dispatched_orders_flat AS
SELECT
  dispatched_date,
  finished_at,
  output.status,
  output.total_break_time,
  output.total_distance,
  output.total_idle_time,
  output.total_travel_time,
  output.total_vehicle_overtime,
  output.total_visit_lateness,
  output.total_working_time,
  output.num_late_visits,
  output.num_unserved,
  output.unserved,
  -- Flatten distances
  output.distances.driver_1 AS distance_driver_1,
  output.distances.driver_2 AS distance_driver_2,
  output.distances.driver_3 AS distance_driver_3,
  output.distances.driver_4 AS distance_driver_4,
  output.distances.driver_5 AS distance_driver_5,
  output.distances.driver_6 AS distance_driver_6,
  output.distances.driver_7 AS distance_driver_7,
  output.distances.driver_8 AS distance_driver_8,
  output.distances.driver_9 AS distance_driver_9,
  output.distances.driver_10 AS distance_driver_10,
  output.distances.driver_11 AS distance_driver_11,
  output.distances.driver_12 AS distance_driver_12,
  output.distances.driver_13 AS distance_driver_13,
  output.distances.driver_14 AS distance_driver_14,
  output.distances.driver_15 AS distance_driver_15,
  output.distances.driver_16 AS distance_driver_16,
  output.distances.driver_17 AS distance_driver_17,
  output.distances.driver_18 AS distance_driver_18,
  -- Flatten vehicle_overtime
  output.vehicle_overtime.driver_1 AS overtime_driver_1,
  output.vehicle_overtime.driver_2 AS overtime_driver_2,
  output.vehicle_overtime.driver_3 AS overtime_driver_3,
  output.vehicle_overtime.driver_4 AS overtime_driver_4,
  output.vehicle_overtime.driver_5 AS overtime_driver_5,
  output.vehicle_overtime.driver_6 AS overtime_driver_6,
  output.vehicle_overtime.driver_7 AS overtime_driver_7,
  output.vehicle_overtime.driver_8 AS overtime_driver_8,
  output.vehicle_overtime.driver_9 AS overtime_driver_9,
  output.vehicle_overtime.driver_10 AS overtime_driver_10,
  output.vehicle_overtime.driver_11 AS overtime_driver_11,
  output.vehicle_overtime.driver_12 AS overtime_driver_12,
  output.vehicle_overtime.driver_13 AS overtime_driver_13,
  output.vehicle_overtime.driver_14 AS overtime_driver_14,
  output.vehicle_overtime.driver_15 AS overtime_driver_15,
  output.vehicle_overtime.driver_16 AS overtime_driver_16,
  output.vehicle_overtime.driver_17 AS overtime_driver_17,
  output.vehicle_overtime.driver_18 AS overtime_driver_18
FROM optimization_db.dispatched_orders;
"""

# Function to execute Athena queries
def execute_athena_query(query, database, output_location):
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': output_location
        }
    )

    # Wait for query execution to complete
    query_execution_id = response['QueryExecutionId']
    status = 'RUNNING'

    while status in ['RUNNING', 'QUEUED']:
        response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        status = response['QueryExecution']['Status']['State']
        if status in ['FAILED', 'CANCELLED']:
            reason = response['QueryExecution']['Status'].get('StateChangeReason', 'Unknown error')
            raise Exception(f"Query {query_execution_id} failed or was cancelled. Reason: {reason}")
        print(f"Query status: {status}")
        time.sleep(2)

    print(f"Query {query_execution_id} succeeded!")

# Execute the query to create dispatched_orders_flat view
print("Creating dispatched_orders_flat view...")
execute_athena_query(create_dispatched_orders_flat_view, 'optimization_db', tables_location)

Creating dispatched_orders_flat view...
Query status: RUNNING
Query status: SUCCEEDED
Query b6ad3f93-79f3-40c2-8928-36d31ce2edd4 succeeded!


In [16]:
# Build the corrected query for driver_solutions view
driver_solutions_query = """
CREATE OR REPLACE VIEW optimization_db.driver_solutions AS
"""

# Generate SELECT statements for each driver
for i in range(1, 19):
    driver_id = f'driver_{i}'
    select_statement = f"""
SELECT
  dispatched_date,
  finished_at,
  '{driver_id}' AS driver_id,
  t.elem.arrival_time,
  t.elem.distance,
  t.elem.finish_time,
  t.elem.location_id,
  t.elem.location_name
FROM optimization_db.dispatched_orders
CROSS JOIN UNNEST(output.solution.{driver_id}) AS t (elem)
"""
    if i > 1:
        driver_solutions_query += "\nUNION ALL\n"
    driver_solutions_query += select_statement

# Function to execute Athena queries
def execute_athena_query(query, database, output_location):
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': output_location
        }
    )

    # Wait for query execution to complete
    query_execution_id = response['QueryExecutionId']
    status = 'RUNNING'

    while status in ['RUNNING', 'QUEUED']:
        response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        status = response['QueryExecution']['Status']['State']
        if status in ['FAILED', 'CANCELLED']:
            reason = response['QueryExecution']['Status'].get('StateChangeReason', 'Unknown error')
            raise Exception(f"Query {query_execution_id} failed or was cancelled. Reason: {reason}")
        print(f"Query status: {status}")
        time.sleep(2)

    print(f"Query {query_execution_id} succeeded!")

# Execute the query to create driver_solutions view
print("Creating driver_solutions view...")
execute_athena_query(driver_solutions_query, 'optimization_db', tables_location)

Creating driver_solutions view...
Query status: RUNNING
Query status: SUCCEEDED
Query 2c0ec866-6626-493f-b785-7f32fe8d7dbd succeeded!


In [17]:
def run_query(query, database, output_location):
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': output_location
        }
    )

    query_execution_id = response['QueryExecutionId']
    status = 'RUNNING'

    while status in ['RUNNING', 'QUEUED']:
        response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        status = response['QueryExecution']['Status']['State']
        time.sleep(1)

    if status == 'SUCCEEDED':
        print(f"Query succeeded: {query_execution_id}")
        return query_execution_id
    else:
        reason = response['QueryExecution']['Status'].get('StateChangeReason', 'Unknown error')
        raise Exception(f"Query failed: {reason}")

Data for a Specific Driver

In [18]:
# Example usage:
query = """
SELECT *
FROM optimization_db.driver_solutions
WHERE driver_id = 'driver_1'
ORDER BY finished_at, arrival_time
LIMIT 20;
"""

try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))

Query succeeded: 108146fe-930c-4d85-adc4-8fda08f28389
['dispatched_date', 'finished_at', 'driver_id', 'arrival_time', 'distance', 'finish_time', 'location_id', 'location_name']
['2024-10-02', '2024-10-02T22:59:31.256Z', 'driver_1', '09:00', '0.0', '', 'depot_3', 'Vallecas']
['2024-10-02', '2024-10-02T22:59:31.256Z', 'driver_1', '09:34', '13411.1', '09:39', '2b9739b1-9711-465a-bcda-87f0fea426e2', 'cus-a83c1637-ffed-41a2-aede-43d0cf122fe3']
['2024-10-02', '2024-10-02T22:59:31.256Z', 'driver_1', '10:09', '14601.0', '', 'depot_3', 'Vallecas']
['2024-10-02', '2024-10-02T23:50:50.402Z', 'driver_1', '09:00', '0.0', '', 'depot_3', 'Vallecas']
['2024-10-02', '2024-10-02T23:50:50.402Z', 'driver_1', '09:36', '18676.0', '09:41', '04da7395-06f5-41fd-956e-36012baa5e1d', 'cus-abfc2e74-4bb3-45a0-9161-adc591737c0d']
['2024-10-02', '2024-10-02T23:50:50.402Z', 'driver_1', '09:50', '1944.1', '09:55', '655a5164-a06e-4e6c-8435-8099103aef47', 'cus-bcb273b4-4b6b-486e-b514-a76803a5bb96']
['2024-10-02', '2024-1

Joining the Views to Combine Data

In [19]:
query = """
SELECT
  ds.finished_at,
  ds.dispatched_date,
  ds.driver_id,
  ds.arrival_time,
  ds.distance AS segment_distance,
  ds.finish_time,
  ds.location_id,
  ds.location_name,
  dof.status,
  dof.total_distance,
  dof.total_travel_time,
  -- Include driver's total distance from the flat view
  CASE ds.driver_id
    WHEN 'driver_1' THEN dof.distance_driver_1
    WHEN 'driver_2' THEN dof.distance_driver_2
    WHEN 'driver_3' THEN dof.distance_driver_3
    WHEN 'driver_4' THEN dof.distance_driver_4
    WHEN 'driver_5' THEN dof.distance_driver_5
    WHEN 'driver_6' THEN dof.distance_driver_6
    WHEN 'driver_7' THEN dof.distance_driver_7
    WHEN 'driver_8' THEN dof.distance_driver_8
    WHEN 'driver_9' THEN dof.distance_driver_9
    WHEN 'driver_10' THEN dof.distance_driver_10
    WHEN 'driver_11' THEN dof.distance_driver_11
    WHEN 'driver_12' THEN dof.distance_driver_12
    WHEN 'driver_13' THEN dof.distance_driver_13
    WHEN 'driver_14' THEN dof.distance_driver_14
    WHEN 'driver_15' THEN dof.distance_driver_15
    WHEN 'driver_16' THEN dof.distance_driver_16
    WHEN 'driver_17' THEN dof.distance_driver_17
    WHEN 'driver_18' THEN dof.distance_driver_18
    ELSE NULL
  END AS total_driver_distance
FROM optimization_db.driver_solutions ds
JOIN optimization_db.dispatched_orders_flat dof
  ON ds.finished_at = dof.finished_at
ORDER BY ds.finished_at, ds.driver_id, ds.arrival_time
LIMIT 50;
"""

try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))

Query succeeded: 138341d7-1cd2-42da-9348-3508b97d1fd7
['finished_at', 'dispatched_date', 'driver_id', 'arrival_time', 'segment_distance', 'finish_time', 'location_id', 'location_name', 'status', 'total_distance', 'total_travel_time', 'total_driver_distance']
['2024-10-02T22:59:31.256Z', '2024-10-02', 'driver_1', '09:00', '0.0', '', 'depot_3', 'Vallecas', 'success', '521.5519', '1058', '28.0121']
['2024-10-02T22:59:31.256Z', '2024-10-02', 'driver_1', '09:34', '13411.1', '09:39', '2b9739b1-9711-465a-bcda-87f0fea426e2', 'cus-a83c1637-ffed-41a2-aede-43d0cf122fe3', 'success', '521.5519', '1058', '28.0121']
['2024-10-02T22:59:31.256Z', '2024-10-02', 'driver_1', '10:09', '14601.0', '', 'depot_3', 'Vallecas', 'success', '521.5519', '1058', '28.0121']
['2024-10-02T22:59:31.256Z', '2024-10-02', 'driver_10', '09:00', '0.0', '', 'depot_3', 'Vallecas', 'success', '521.5519', '1058', '0.0']
['2024-10-02T22:59:31.256Z', '2024-10-02', 'driver_10', '09:00', '0.0', '', 'depot_3', 'Vallecas', 'success', 

Aggregating Driver Data

In [20]:
query = """
SELECT
  driver_id,
  SUM(distance) AS total_route_distance,
  COUNT(*) AS num_stops
FROM optimization_db.driver_solutions
GROUP BY driver_id
ORDER BY driver_id;
"""

try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))

Query succeeded: cc6e04d2-c1ab-47bf-8f9b-1fc9ecdf3328
['driver_id', 'total_route_distance', 'num_stops']
['driver_1', '110011.1', '10']
['driver_10', '170864.0', '15']
['driver_11', '105913.4', '15']
['driver_12', '67242.3', '9']
['driver_13', '174364.5', '12']
['driver_14', '105795.1', '10']
['driver_15', '90313.0', '8']
['driver_16', '81426.2', '9']
['driver_17', '127579.5', '12']
['driver_18', '137219.09999999998', '11']
['driver_2', '78076.7', '12']
['driver_3', '177659.0', '14']
['driver_4', '140948.4', '10']
['driver_5', '151670.5', '10']
['driver_6', '151769.0', '10']
['driver_7', '171094.3', '11']
['driver_8', '163134.30000000002', '10']
['driver_9', '120695.2', '9']


Driver Performance Metrics

In [21]:
query = """
SELECT
  driver_id,
  dispatched_date,
  finished_at,
  CASE driver_id
    WHEN 'driver_1' THEN distance_driver_1
    WHEN 'driver_2' THEN distance_driver_2
    WHEN 'driver_3' THEN distance_driver_3
    WHEN 'driver_4' THEN distance_driver_4
    WHEN 'driver_5' THEN distance_driver_5
    WHEN 'driver_6' THEN distance_driver_6
    WHEN 'driver_7' THEN distance_driver_7
    WHEN 'driver_8' THEN distance_driver_8
    WHEN 'driver_9' THEN distance_driver_9
    WHEN 'driver_10' THEN distance_driver_10
    WHEN 'driver_11' THEN distance_driver_11
    WHEN 'driver_12' THEN distance_driver_12
    WHEN 'driver_13' THEN distance_driver_13
    WHEN 'driver_14' THEN distance_driver_14
    WHEN 'driver_15' THEN distance_driver_15
    WHEN 'driver_16' THEN distance_driver_16
    WHEN 'driver_17' THEN distance_driver_17
    WHEN 'driver_18' THEN distance_driver_18
    ELSE NULL
  END AS total_distance,
  CASE driver_id
    WHEN 'driver_1' THEN overtime_driver_1
    WHEN 'driver_2' THEN overtime_driver_2
    WHEN 'driver_3' THEN overtime_driver_3
    WHEN 'driver_4' THEN overtime_driver_4
    WHEN 'driver_5' THEN overtime_driver_5
    WHEN 'driver_6' THEN overtime_driver_6
    WHEN 'driver_7' THEN overtime_driver_7
    WHEN 'driver_8' THEN overtime_driver_8
    WHEN 'driver_9' THEN overtime_driver_9
    WHEN 'driver_10' THEN overtime_driver_10
    WHEN 'driver_11' THEN overtime_driver_11
    WHEN 'driver_12' THEN overtime_driver_12
    WHEN 'driver_13' THEN overtime_driver_13
    WHEN 'driver_14' THEN overtime_driver_14
    WHEN 'driver_15' THEN overtime_driver_15
    WHEN 'driver_16' THEN overtime_driver_16
    WHEN 'driver_17' THEN overtime_driver_17
    WHEN 'driver_18' THEN overtime_driver_18
    ELSE NULL
  END AS total_overtime
FROM optimization_db.dispatched_orders_flat,
UNNEST(
  ARRAY['driver_1', 'driver_2', 'driver_3', 'driver_4', 'driver_5', 'driver_6', 'driver_7', 'driver_8', 'driver_9', 'driver_10',
        'driver_11', 'driver_12', 'driver_13', 'driver_14', 'driver_15', 'driver_16', 'driver_17', 'driver_18']
) AS t (driver_id)
ORDER BY driver_id, dispatched_date;
"""

try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))



Query succeeded: 3df8df94-5e6f-4686-b273-8c6b8de7eb32
['driver_id', 'dispatched_date', 'finished_at', 'total_distance', 'total_overtime']
['driver_1', '2024-10-02', '2024-10-02T22:59:31.256Z', '28.0121', '0']
['driver_1', '2024-10-02', '2024-10-02T23:50:50.402Z', '41.7215', '0']
['driver_1', '2024-10-03', '2024-10-03T01:31:08.949Z', '40.2775', '0']
['driver_10', '2024-10-02', '2024-10-02T23:50:50.402Z', '75.7179', '0']
['driver_10', '2024-10-02', '2024-10-02T22:59:31.256Z', '0.0', '0']
['driver_10', '2024-10-03', '2024-10-03T01:31:08.949Z', '95.1461', '0']
['driver_11', '2024-10-02', '2024-10-02T22:59:31.256Z', '0.0', '0']
['driver_11', '2024-10-02', '2024-10-02T23:50:50.402Z', '81.8713', '0']
['driver_11', '2024-10-03', '2024-10-03T01:31:08.949Z', '24.042099999999998', '0']
['driver_12', '2024-10-02', '2024-10-02T23:50:50.402Z', '49.2805', '0']
['driver_12', '2024-10-02', '2024-10-02T22:59:31.256Z', '0.0', '0']
['driver_12', '2024-10-03', '2024-10-03T01:31:08.949Z', '17.9618', '0']
['

Analyzing Unserved Orders

In [22]:
query = """
SELECT
  dispatched_date,
  finished_at,
  unserved
FROM optimization_db.dispatched_orders_flat
WHERE unserved IS NOT NULL AND unserved <> '';
"""
try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))



Query succeeded: 8ede9fb0-631c-418b-8dae-acf49d612eaf
['dispatched_date', 'finished_at', 'unserved']


Orders with Late Visits

In [23]:
query = """
SELECT
  dispatched_date,
  finished_at,
  num_late_visits,
  total_visit_lateness
FROM optimization_db.dispatched_orders_flat
WHERE num_late_visits > 0;
"""
try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))



Query succeeded: 2bddfb1c-be66-4a17-a4f5-dbf561e498f7
['dispatched_date', 'finished_at', 'num_late_visits', 'total_visit_lateness']


In [24]:
query = """
SELECT
  ds.driver_id,
  ds.dispatched_date,
  ds.finished_at,
  ds.arrival_time,
  ds.finish_time,
  ds.distance AS segment_distance,
  ds.location_id,
  ds.location_name
FROM optimization_db.driver_solutions ds
ORDER BY ds.driver_id, ds.finished_at, ds.arrival_time;
"""
try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))

Query succeeded: 54482347-59f5-4789-8dd8-b3aa3dd58a5e
['driver_id', 'dispatched_date', 'finished_at', 'arrival_time', 'finish_time', 'segment_distance', 'location_id', 'location_name']
['driver_1', '2024-10-02', '2024-10-02T22:59:31.256Z', '09:00', '', '0.0', 'depot_3', 'Vallecas']
['driver_1', '2024-10-02', '2024-10-02T22:59:31.256Z', '09:34', '09:39', '13411.1', '2b9739b1-9711-465a-bcda-87f0fea426e2', 'cus-a83c1637-ffed-41a2-aede-43d0cf122fe3']
['driver_1', '2024-10-02', '2024-10-02T22:59:31.256Z', '10:09', '', '14601.0', 'depot_3', 'Vallecas']
['driver_1', '2024-10-02', '2024-10-02T23:50:50.402Z', '09:00', '', '0.0', 'depot_3', 'Vallecas']
['driver_1', '2024-10-02', '2024-10-02T23:50:50.402Z', '09:36', '09:41', '18676.0', '04da7395-06f5-41fd-956e-36012baa5e1d', 'cus-abfc2e74-4bb3-45a0-9161-adc591737c0d']
['driver_1', '2024-10-02', '2024-10-02T23:50:50.402Z', '09:50', '09:55', '1944.1', '655a5164-a06e-4e6c-8435-8099103aef47', 'cus-bcb273b4-4b6b-486e-b514-a76803a5bb96']
['driver_1', '

Data from the driver_solutions View

In [25]:
# Example usage:
query = """
SELECT *
FROM optimization_db.driver_solutions
LIMIT 20;
"""

try:
    query_execution_id = run_query(query, 'optimization_db', tables_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))

Query succeeded: 8934cd9e-945b-456a-833f-4a91523b0671
['dispatched_date', 'finished_at', 'driver_id', 'arrival_time', 'distance', 'finish_time', 'location_id', 'location_name']
['2024-10-03', '2024-10-03T01:31:08.949Z', 'driver_5', '09:00', '0.0', '', 'depot_3', 'Vallecas']
['2024-10-03', '2024-10-03T01:31:08.949Z', 'driver_5', '09:35', '19385.9', '09:40', '0f229ce5-284d-4755-87aa-48f7c8f5f756', 'cus-45aaaea9-81d1-49fc-aba5-35692ba64573']
['2024-10-03', '2024-10-03T01:31:08.949Z', 'driver_5', '10:21', '21383.4', '', 'depot_3', 'Vallecas']
['2024-10-02', '2024-10-02T23:50:50.402Z', 'driver_6', '09:00', '0.0', '', 'depot_3', 'Vallecas']
['2024-10-02', '2024-10-02T23:50:50.402Z', 'driver_6', '09:33', '15042.9', '09:38', '4152955a-e6df-4ffe-a1b0-d64227cd353a', 'cus-f5beeff2-58e5-44b8-83ed-5f48720e9cb5']
['2024-10-02', '2024-10-02T23:50:50.402Z', 'driver_6', '09:41', '668.6', '09:46', '8176f038-b164-45d3-b4c4-088750409976', 'cus-868038e8-cc31-4717-9511-e275bd7e49fc']
['2024-10-02', '2024-10

In [26]:




# Example usage:
query = """
SELECT *
FROM optimization_db.dispatched_orders_flat
LIMIT 10;
"""

try:
    query_execution_id = run_query(query, 'optimization_db', output_location)
    # Fetch and display results
    results_paginator = athena_client.get_paginator('get_query_results')
    results_iterator = results_paginator.paginate(QueryExecutionId=query_execution_id)

    for results_page in results_iterator:
        for row in results_page['ResultSet']['Rows']:
            print([col.get('VarCharValue', '') for col in row['Data']])
except Exception as e:
    print(str(e))

name 'output_location' is not defined
