In [1]:
from copy import deepcopy

In [None]:
def generate_random_route(city_from, city_to):
    merchandise_types = ['milk', 'honey', 'butter', 'tomatoes', 'pens', 'bread', 'coca-cola']
    merchandise = {item: random.randint(1, 50) for item in random.sample(merchandise_types, random.randint(1, len(merchandise_types)))}
    return {'from': city_from, 'to': city_to, 'merchandise': merchandise}

In [ ]:
def generate_random_standard_route(route_id):
    cities = ['Rome', 'Milan', 'Verona', 'Venezia', 'Bergamo', 'Bolzano', 'Trento']
    num_trips = random.randint(2, 4)
    return {'id': f's{route_id}', 'route': [generate_random_route(cities[i], cities[i+1]) for i in range(num_trips)]}

In [ ]:
def introduce_variation(route):
    # Simulate variations by randomly modifying quantities, adding/removing items, and changing cities
    modified_route = deepcopy(route)
    for trip in modified_route['route']:
        for item, quantity in trip['merchandise'].items():
            if random.random() < 0.3:  # 30% chance of modification
                trip['merchandise'][item] = max(1, quantity + random.randint(-5, 5))
        if random.random() < 0.2:  # 20% chance of adding/removing items
            item_to_change = random.choice(['milk', 'honey', 'butter', 'tomatoes', 'pens', 'bread', 'coca-cola'])
            trip['merchandise'][item_to_change] = random.randint(1, 50)
        if random.random() < 0.1:  # 10% chance of changing cities
            trip['from'] = random.choice(['Rome', 'Milan', 'Verona', 'Venezia', 'Bergamo', 'Bolzano', 'Trento'])
            trip['to'] = random.choice(['Rome', 'Milan', 'Verona', 'Venezia', 'Bergamo', 'Bolzano', 'Trento'])
    return modified_route

In [ ]:
def generate_actual_route(route_id, driver, standard_route):
    actual_route = deepcopy(standard_route)
    actual_route['id'] = f'a{route_id}'
    actual_route['driver'] = driver
    actual_route['route'] = [generate_random_route(trip['from'], trip['to']) for trip in standard_route['route']]
    actual_route = introduce_variation(actual_route)
    return actual_route

In [ ]:
def generate_synthetic_dataset(num_standard_routes, num_actual_routes_per_standard):
    standard_routes = [generate_random_standard_route(i) for i in range(1, num_standard_routes + 1)]

    actual_routes = []
    for route_id in range(1, num_actual_routes_per_standard + 1):
        driver = random.choice(['A', 'B', 'C', 'D', 'E'])
        standard_route = random.choice(standard_routes)
        actual_route = generate_actual_route(route_id, driver, standard_route)
        actual_routes.append(actual_route)

    with open('standard.json', 'w') as standard_file:
        json.dump(standard_routes, standard_file, indent=2)

    with open('actual.json', 'w') as actual_file:
        json.dump(actual_routes, actual_file, indent=2)

In [ ]:
if __name__ == "__main__":
    generate_synthetic_dataset(num_standard_routes=100, num_actual_routes_per_standard=200)

## Modifying the synthetic dataset

### Generating connected standard routes

In [8]:
import json

# List of top 50 cities in Italy (from Wikipedia)
cities = [
    "Rome", "Milan", "Naples", "Turin", "Palermo", "Genoa", "Bologna", "Florence",
    "Bari", "Catania", "Verona", "Venice", "Messina", "Padua", "Prato", "Trieste",
    "Brescia", "Parma", "Taranto", "Modena", "Reggio Calabria", "Reggio Emilia",
    "Perugia", "Ravenna", "Livorno", "Rimini", "Cagliari", "Foggia", "Ferrara",
    "Salerno", "Latina", "Giugliano in Campania", "Monza", "Sassari", "Bergamo",
    "Pescara", "Trento", "Forlì", "Syracuse", "Vicenza", "Terni", "Bolzano-Bozen",
    "Piacenza", "Novara", "Ancona", "Udine", "Andria", "Arezzo", "Cesena"
]

# Merchandise types
merchandise_types = ['milk', 'honey', 'butter', 'tomatoes', 'pens', 'bread', 'coca-cola']

In [9]:
def generate_merchandise():
    """Generate a random set of merchandise with quantities."""
    return {
        # merchandise_type: random_quantity for _ in range(random_number_of_items)
        random.choice(merchandise_types): random.randint(1, 50) for _ in range(random.randint(1, 4))
    }

def generate_connected_route(min_trips_, max_trips_):
    """
    Generate a connected route with a random number of trips within the specified constraint.
    
    @param min_trips_: minimum number of trips in the route
    @param max_trips_: maximum number of trips in the route
    
    @return: a connected route with a random number of trips within the specified constraint
    """
    route_length = random.randint(min_trips_, max_trips_)  # Number of trips in the route
    
    # random.sample() returns a list of unique elements, so the route is guaranteed to be connected
    selected_cities = random.sample(cities, route_length + 1)  # One more city for the final destination
    
    # randomize the order of the cities
    random.shuffle(selected_cities)
    
    route_ = []
    for i in range(route_length):  # Generate a trip for each pair of cities
        from_city = selected_cities[i]
        to_city = selected_cities[i + 1]
        route_.append({"from": from_city, "to": to_city, "merchandise": generate_merchandise()})

    return route_

def generate_standard_routes_connected(num_routes, min_trips_, max_trips_):
    """
    Generate a set of standard routes with connected trips and trip number constraints.
    
    @param num_routes: number of routes to generate
    @param min_trips_: minimum number of trips in the route
    @param max_trips_: maximum number of trips in the route
    
    @return: a set of standard routes with connected trips and trip number constraints
    """
    standard_routes = []
    for i in range(num_routes):  # Generate a standard route for each route id
        # Generate a connected route with the specified trip constraints
        route_ = generate_connected_route(min_trips_, max_trips_)
        
        # Add the route to the list of standard routes
        standard_routes.append({"id": f"s{i+1}", "route": route_})  
    
    return standard_routes

In [10]:
# Define minimum and maximum number of trips in a route
min_trips = 3
max_trips = 7

# Generate 10 standard connected routes with the specified trip constraints
standard_connected_routes = generate_standard_routes_connected(10, min_trips, max_trips)

# Print the generated connected routes with constraints
for route in standard_connected_routes:
    print(json.dumps(route, indent=2))

{
  "id": "s1",
  "route": [
    {
      "from": "Venice",
      "to": "Modena",
      "merchandise": {
        "butter": 2
      }
    },
    {
      "from": "Modena",
      "to": "Ravenna",
      "merchandise": {
        "butter": 8,
        "milk": 8
      }
    },
    {
      "from": "Ravenna",
      "to": "Turin",
      "merchandise": {
        "coca-cola": 15,
        "milk": 2
      }
    },
    {
      "from": "Turin",
      "to": "Foggia",
      "merchandise": {
        "pens": 1
      }
    }
  ]
}
{
  "id": "s2",
  "route": [
    {
      "from": "Sassari",
      "to": "Novara",
      "merchandise": {
        "bread": 34,
        "honey": 50,
        "coca-cola": 30
      }
    },
    {
      "from": "Novara",
      "to": "Milan",
      "merchandise": {
        "bread": 32,
        "butter": 12,
        "honey": 5
      }
    },
    {
      "from": "Milan",
      "to": "Cagliari",
      "merchandise": {
        "honey": 16
      }
    },
    {
      "from": "Cagliari",
      

### Generating actual routes with variations

In [None]:
import json
import random

# Constants for the number of drivers and their IDs
NUM_DRIVERS = 20
DRIVERS = [f'D{i}' for i in range(1, NUM_DRIVERS + 1)]

# Function to load the standard routes data
def load_standard_routes(file_path):
    with open(file_path, 'r') as file_:
        return json.load(file_)

# Function to adjust merchandise quantities
def adjust_merchandise(merchandise):
    adjusted = {}
    for item, quantity in merchandise.items():
        decision = random.choice(['increase', 'decrease', 'omit', 'keep'])
        if decision == 'increase':
            adjusted[item] = min(quantity + random.randint(1, 5), 50)  # Cap at 50 for max quantity
        elif decision == 'decrease' and quantity > 1:
            adjusted[item] = max(quantity - random.randint(1, quantity - 1), 1)  # Ensure at least 1
        elif decision == 'keep':
            adjusted[item] = quantity
    return adjusted

In [ ]:
# Load the standard routes
standard_routes = load_standard_routes('standard.json')

In [14]:
def create_actual_route_variation(standard_route, driver_id):
    """
    Create a variation of the standard route to form an actual route.
    Variations include minor changes in the route and merchandise.

    :param standard_route: The original standard route.
    :param driver_id: The ID of the driver for the actual route.
    :return: A varied actual route.
    """
    actual_route = {
        "id": f"a{random.randint(1, 10000)}",  # Unique ID for the actual route
        "driver": driver_id,
        "sroute": standard_route["id"],
        "route": []
    }

    # Iterate over the trips in the standard route to create variations
    for trip in standard_route["route"]:
        # Randomly decide to make a minor detour
        if random.choice([True, False]):
            # Choose a random nearby city for a detour
            detour_city = random.choice(
                [city for city in cities if city != trip["from"] and city != trip["to"]]
            )
            
            # add a detour city in from of the original trip or after the original trip
            if random.choice([True, False]):
                # Add a detour trip (from the detour city to the original city)
                actual_route["route"].append({
                    "from": detour_city, "to": trip["from"], 
                    "merchandise": generate_merchandise()
                })
            else:
                # Add a detour trip (from the original city to the detour city)
                actual_route["route"].append({
                    "from": trip["from"], "to": detour_city, 
                    "merchandise": adjust_merchandise(trip["merchandise"])
                })
                
                # Add the original trip (from the detour city to the original destination)
                actual_route["route"].append({
                    "from": detour_city, "to": trip["to"],
                    "merchandise": generate_merchandise()
                })
        else:
            # Keep the original trip but adjust the merchandise
            actual_route["route"].append({
                "from": trip["from"], "to": trip["to"], 
                "merchandise": adjust_merchandise(trip["merchandise"])
            })

    return actual_route

# Generate actual routes with variations
actual_routes_with_variations = []
for driver in DRIVERS:
    for standard_route in standard_routes:
        # Generate multiple variations for each standard route
        for _ in range(random.randint(1, 3)):
            varied_route = create_actual_route_variation(standard_route, driver)
            actual_routes_with_variations.append(varied_route)

# Display the first few actual routes for review
print(json.dumps(actual_routes_with_variations[:3], indent=2))

[
  {
    "id": "a2329",
    "driver": "D1",
    "sroute": "s1",
    "route": [
      {
        "from": "Vicenza",
        "to": "Giugliano in Campania",
        "merchandise": {
          "honey": 48,
          "milk": 35
        }
      },
      {
        "from": "Giugliano in Campania",
        "to": "Genoa",
        "merchandise": {
          "pens": 21,
          "milk": 2,
          "honey": 22
        }
      },
      {
        "from": "Genoa",
        "to": "Monza",
        "merchandise": {
          "tomatoes": 6
        }
      },
      {
        "from": "Monza",
        "to": "Ravenna",
        "merchandise": {
          "butter": 47
        }
      },
      {
        "from": "Ravenna",
        "to": "Pescara",
        "merchandise": {
          "bread": 41
        }
      },
      {
        "from": "Pescara",
        "to": "Rome",
        "merchandise": {
          "bread": 40
        }
      },
      {
        "from": "Rome",
        "to": "Turin",
        "merchandise": {

In [19]:
standard_routes[0]

{'id': 's1',
 'route': [{'from': 'Vicenza',
   'to': 'Genoa',
   'merchandise': {'honey': 48, 'milk': 39}},
  {'from': 'Genoa',
   'to': 'Monza',
   'merchandise': {'coca-cola': 42, 'tomatoes': 6}},
  {'from': 'Monza',
   'to': 'Pescara',
   'merchandise': {'pens': 49, 'butter': 44}},
  {'from': 'Pescara',
   'to': 'Turin',
   'merchandise': {'bread': 36, 'pens': 3, 'milk': 49}},
  {'from': 'Turin',
   'to': 'Bari',
   'merchandise': {'milk': 40, 'tomatoes': 20, 'bread': 31}}]}

In [20]:
actual_routes_with_variations[0]

{'id': 'a2329',
 'driver': 'D1',
 'sroute': 's1',
 'route': [{'from': 'Vicenza',
   'to': 'Giugliano in Campania',
   'merchandise': {'honey': 48, 'milk': 35}},
  {'from': 'Giugliano in Campania',
   'to': 'Genoa',
   'merchandise': {'pens': 21, 'milk': 2, 'honey': 22}},
  {'from': 'Genoa', 'to': 'Monza', 'merchandise': {'tomatoes': 6}},
  {'from': 'Monza', 'to': 'Ravenna', 'merchandise': {'butter': 47}},
  {'from': 'Ravenna', 'to': 'Pescara', 'merchandise': {'bread': 41}},
  {'from': 'Pescara', 'to': 'Rome', 'merchandise': {'bread': 40}},
  {'from': 'Rome',
   'to': 'Turin',
   'merchandise': {'milk': 33, 'butter': 6, 'honey': 17}},
  {'from': 'Turin', 'to': 'Modena', 'merchandise': {'milk': 43, 'bread': 36}},
  {'from': 'Modena',
   'to': 'Bari',
   'merchandise': {'tomatoes': 12, 'pens': 42, 'butter': 18, 'bread': 36}}]}

## Cities and Distances (Once City distance matrix against all other cities)

### Loading Italian cities and their coordinates from a CSV file

In [19]:
import pandas as pd

In [20]:
it_cities_df = pd.read_csv("Dataset/it-cities.csv")

# rename so that the column names are shorter and comply with PEP-8
it_cities_df.rename(
    columns={"CountryName": "Country", "CapitalName": "capital", "CapitalLatitude": "lat", 
             "CapitalLongitude": "lon", "CountryCode": "code", "ContinentName": "continent"}, inplace=True)
it_cities_df.head(3)

Unnamed: 0,city,lat,lng,country,iso2,admin_name,capital,population,population_proper
0,Rome,41.8931,12.4828,Italy,IT,Lazio,primary,2872800,2872800
1,Milan,45.4669,9.19,Italy,IT,Lombardy,admin,1366180,1366180
2,Naples,40.8333,14.25,Italy,IT,Campania,admin,966144,966144


In [21]:
# List of top 50 cities in Italy (from Wikipedia)
cities = [
    "Rome", "Milan", "Naples", "Turin", "Palermo", "Genoa", "Bologna", "Florence",
    "Bari", "Catania", "Verona", "Venice", "Messina", "Padova", "Prato", "Trieste",
    "Brescia", "Parma", "Taranto", "Modena", "Reggio di Calabria", "Reggio Emilia",
    "Perugia", "Ravenna", "Livorno", "Rimini", "Cagliari", "Foggia", "Ferrara",
    "Salerno", "Latina", "Giugliano in Campania", "Monza", "Sassari", "Bergamo",
    "Pescara", "Trento", "Forlì", "Siracusa", "Vicenza", "Terni", "Bolzano",
    "Piacenza", "Novara", "Ancona", "Udine"
]

In [4]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
     ---------------------------------------- 0.0/40.3 kB ? eta -:--:--
     ---------------------------------------- 0.0/40.3 kB ? eta -:--:--
     ------------------- ------------------ 20.5/40.3 kB 320.0 kB/s eta 0:00:01
     -------------------------------------- 40.3/40.3 kB 385.7 kB/s eta 0:00:00
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   ---------------------- ----------------- 71.7/125.4 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 125.4/125.4 kB 1.5 MB/s eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1


In [22]:
from geopy import distance

In [23]:
hest = it_cities_df[it_cities_df["city"].isin(["Rome", "Milan"])].reset_index()
hest

Unnamed: 0,index,city,lat,lng,country,iso2,admin_name,capital,population,population_proper
0,0,Rome,41.8931,12.4828,Italy,IT,Lazio,primary,2872800,2872800
1,1,Milan,45.4669,9.19,Italy,IT,Lombardy,admin,1366180,1366180


In [24]:
hest_cities = hest.copy()
hest_cities

Unnamed: 0,index,city,lat,lng,country,iso2,admin_name,capital,population,population_proper
0,0,Rome,41.8931,12.4828,Italy,IT,Lazio,primary,2872800,2872800
1,1,Milan,45.4669,9.19,Italy,IT,Lombardy,admin,1366180,1366180


In [25]:
d = distance.distance((hest_cities.loc[0, "lat"], hest_cities.loc[0, "lng"]), (hest_cities.loc[1, "lat"], hest_cities.loc[1, "lng"]))

In [26]:
d.km

477.56703300539994

In [27]:
# Function to calculate the distance between two cities
def calculate_distance(city1_, city2_):
    """
    Calculate the distance between two cities.
    
    @param city1_: the first city
    @param city2_: the second city
    
    @return: the distance between the two cities
    """
    # Get the coordinates of the two cities
    city1_coords = it_cities_df[it_cities_df["city"] == city1_].reset_index()
    city2_coords = it_cities_df[it_cities_df["city"] == city2_].reset_index()
    
    # Calculate the distance between the two cities
    d_ = distance.distance(
        (city1_coords.loc[0, "lat"], city1_coords.loc[0, "lng"]), 
        (city2_coords.loc[0, "lat"], city2_coords.loc[0, "lng"])
    )
    
    return d_.km

In [28]:
# Create a distance matrix for the cities
distance_matrix = pd.DataFrame(index=cities, columns=cities)
for city1 in cities:
    print(f"Calculating distances from {city1}...")
    for city2 in cities:
        if city1 == city2:
            continue
        distance_matrix.loc[city1, city2] = calculate_distance(city1, city2)
        
# Display the distance matrix
distance_matrix

Calculating distances from Rome...
Calculating distances from Milan...
Calculating distances from Naples...
Calculating distances from Turin...
Calculating distances from Palermo...
Calculating distances from Genoa...
Calculating distances from Bologna...
Calculating distances from Florence...
Calculating distances from Bari...
Calculating distances from Catania...
Calculating distances from Verona...
Calculating distances from Venice...
Calculating distances from Messina...
Calculating distances from Padova...
Calculating distances from Prato...
Calculating distances from Trieste...
Calculating distances from Brescia...
Calculating distances from Parma...
Calculating distances from Taranto...
Calculating distances from Modena...
Calculating distances from Reggio di Calabria...
Calculating distances from Reggio Emilia...
Calculating distances from Perugia...
Calculating distances from Ravenna...
Calculating distances from Livorno...
Calculating distances from Rimini...
Calculating dist

Unnamed: 0,Rome,Milan,Naples,Turin,Palermo,Genoa,Bologna,Florence,Bari,Catania,...,Trento,Forlì,Siracusa,Vicenza,Terni,Bolzano,Piacenza,Novara,Ancona,Udine
Rome,,477.567033,188.983954,525.654881,426.070596,401.960481,303.426129,231.573282,375.743088,536.516598,...,476.492079,262.360392,587.331036,413.191956,75.433725,519.558232,416.732462,503.245517,209.359851,467.623013
Milan,477.567033,,658.795099,126.368016,886.731534,119.074773,201.294867,249.65985,787.25523,1012.102091,...,164.026504,264.379364,1062.64513,184.649862,425.098547,203.152836,61.230419,44.880189,400.53683,321.469527
Naples,188.983954,658.795099,,713.872634,311.254244,589.883509,471.266331,409.244195,222.591127,377.111118,...,634.209108,418.661581,427.428791,568.023935,234.062925,671.186074,597.568447,687.456249,315.062864,587.219011
Turin,525.654881,126.368016,713.872634,,906.531599,124.164189,297.35052,319.806402,866.794236,1045.055611,...,290.09242,359.381308,1094.546857,308.20977,487.484745,326.592325,159.42756,84.543828,493.099051,447.342585
Palermo,426.070596,886.731534,311.254244,906.531599,,791.237705,728.185082,652.598277,449.63845,166.886925,...,902.364648,688.230026,205.977151,839.26032,497.480598,945.604773,828.231151,904.659204,611.056371,883.22246
Genoa,401.960481,119.074773,589.883509,124.164189,791.237705,,192.035334,199.020967,744.436321,925.542935,...,251.493907,249.430089,975.413171,242.12071,363.551385,299.455683,93.456605,118.119,377.958288,384.468591
Bologna,303.426129,201.294867,471.266331,297.35052,728.185082,192.035334,,80.592957,586.419172,838.013128,...,175.694363,63.376385,888.918379,118.497313,238.938016,222.751707,143.975342,239.848115,199.591316,229.236664
Florence,231.573282,249.65985,409.244195,319.806402,652.598277,199.020967,80.592957,,547.149958,768.089051,...,255.307957,81.900025,818.903863,199.034206,175.439348,303.133429,188.436006,280.376288,183.196257,299.113631
Bari,375.743088,787.25523,222.591127,866.794236,449.63845,744.436321,586.419172,547.149958,,430.630052,...,718.614589,524.296808,470.539572,653.55769,385.437658,743.255527,727.973011,823.421235,390.751845,622.317638
Catania,536.516598,1012.102091,377.111118,1045.055611,166.886925,925.542935,838.013128,768.089051,430.630052,,...,1006.853535,790.29282,50.909174,941.36868,599.583757,1045.942168,951.839793,1034.545741,692.166436,963.860536


In [18]:
# Save the distance matrix to a CSV file
distance_matrix.to_csv("Dataset/distance-matrix.csv")

In [19]:
# Load the distance matrix from a CSV file
distance_matrix = pd.read_csv("Dataset/distance-matrix.csv", index_col=0)

In [20]:
distance_matrix

Unnamed: 0,Rome,Milan,Naples,Turin,Palermo,Genoa,Bologna,Florence,Bari,Catania,...,Trento,Forlì,Siracusa,Vicenza,Terni,Bolzano,Piacenza,Novara,Ancona,Udine
Rome,,477.567033,188.983954,525.654881,426.070596,401.960481,303.426129,231.573282,375.743088,536.516598,...,476.492079,262.360392,587.331036,413.191956,75.433725,519.558232,416.732462,503.245517,209.359851,467.623013
Milan,477.567033,,658.795099,126.368016,886.731534,119.074773,201.294867,249.65985,787.25523,1012.102091,...,164.026504,264.379364,1062.64513,184.649862,425.098547,203.152836,61.230419,44.880189,400.53683,321.469527
Naples,188.983954,658.795099,,713.872634,311.254244,589.883509,471.266331,409.244195,222.591127,377.111118,...,634.209108,418.661581,427.428791,568.023935,234.062925,671.186074,597.568447,687.456249,315.062864,587.219011
Turin,525.654881,126.368016,713.872634,,906.531599,124.164189,297.35052,319.806402,866.794236,1045.055611,...,290.09242,359.381308,1094.546857,308.20977,487.484745,326.592325,159.42756,84.543828,493.099051,447.342585
Palermo,426.070596,886.731534,311.254244,906.531599,,791.237705,728.185082,652.598277,449.63845,166.886925,...,902.364648,688.230026,205.977151,839.26032,497.480598,945.604773,828.231151,904.659204,611.056371,883.22246
Genoa,401.960481,119.074773,589.883509,124.164189,791.237705,,192.035334,199.020967,744.436321,925.542935,...,251.493907,249.430089,975.413171,242.12071,363.551385,299.455683,93.456605,118.119,377.958288,384.468591
Bologna,303.426129,201.294867,471.266331,297.35052,728.185082,192.035334,,80.592957,586.419172,838.013128,...,175.694363,63.376385,888.918379,118.497313,238.938016,222.751707,143.975342,239.848115,199.591316,229.236664
Florence,231.573282,249.65985,409.244195,319.806402,652.598277,199.020967,80.592957,,547.149958,768.089051,...,255.307957,81.900025,818.903863,199.034206,175.439348,303.133429,188.436006,280.376288,183.196257,299.113631
Bari,375.743088,787.25523,222.591127,866.794236,449.63845,744.436321,586.419172,547.149958,,430.630052,...,718.614589,524.296808,470.539572,653.55769,385.437658,743.255527,727.973011,823.421235,390.751845,622.317638
Catania,536.516598,1012.102091,377.111118,1045.055611,166.886925,925.542935,838.013128,768.089051,430.630052,,...,1006.853535,790.29282,50.909174,941.36868,599.583757,1045.942168,951.839793,1034.545741,692.166436,963.860536


In [ ]:
# To add city variation to the actual routes, based on the standard routes, consider the city with least distance
# from the standard route city and add that city to the actual route

## 2. Data Analysis and Mining

### 2.1. Loading the data

In [12]:
import json
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy import distance

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Load the standard routes from a JSON
with open("Dataset/standard_routes.json", "r") as file:
    standard_routes = json.load(file)
    
# Load the actual routes from a JSON
with open("Dataset/actual_routes.json", "r") as file:
    actual_routes = json.load(file)

In [13]:
# Convert route data into a suitable format for analysis
def convert_route_to_df(routes):
    """Convert routes data to a Pandas DataFrame."""
    rows = []
    for route_ in routes:
        for trip in route_["route"]:
            row = {
                "route_id": route_["id"],
                "from": trip["from"],
                "to": trip["to"],
                "merchandise": ", ".join([f"{item}: {qty}" for item, qty in trip["merchandise"].items()])
            }
            rows.append(row)
    return pd.DataFrame(rows)

In [14]:
# Convert the sample data into DataFrames
standard_routes_df = convert_route_to_df(standard_routes)
actual_routes_df = convert_route_to_df(actual_routes)

In [15]:
# Display the first few rows of each DataFrame for inspection
standard_routes_df.head()

Unnamed: 0,route_id,from,to,merchandise
0,s1,Trento,Bolzano,"pens: 5, milk: 30"
1,s1,Bolzano,Vicenza,"honey: 15, pens: 34, tomatoes: 2, coca-cola: 26"
2,s1,Vicenza,Padova,"bread: 47, butter: 39, tomatoes: 29"
3,s1,Padova,Bologna,"pens: 50, tomatoes: 16, butter: 2"
4,s1,Bologna,Forlì,"pens: 11, bread: 8"


In [16]:
actual_routes_df.head()

Unnamed: 0,route_id,from,to,merchandise
0,a28078,Trento,Bolzano,"pens: 5, milk: 30"
1,a28078,Bolzano,Vicenza,"honey: 15, pens: 34, tomatoes: 2, coca-cola: 26"
2,a28078,Vicenza,Padova,"bread: 47, butter: 39, tomatoes: 29"
3,a28078,Padova,Ravenna,"pens: 23, tomatoes: 2, butter: 2"
4,a28078,Ravenna,Forlì,"pens: 11, bread: 1"


### 2.2. Analysis Step 1: Comparing Standard and Actual Routes

In [31]:
def analyze_route_deviations(standard_routes_df_, actual_routes_df_):
    """Analyze deviations between standard and actual routes."""
    # Merge the two dataframes on 'from' and 'to' columns to find matching routes
    # merged_routes = pd.merge(standard_routes_df_, actual_routes_df_, on=['from', 'to'], suffixes=('_std', '_act'))
    # 
    # # Calculate the deviation in merchandise for each route
    # merged_routes['merchandise_deviation'] = merged_routes.apply(
    #     lambda row: len(row['merchandise_std'].split(', ')) - len(row['merchandise_act'].split(', ')), axis=1
    # )
    # 
    # # Calculate the deviation in distance for each route
    # merged_routes['distance_deviation'] = merged_routes.apply(
    #     lambda row: calculate_distance(row['from'], row['to']) - calculate_distance(row['from_act'], row['to_act']), axis=1
    # )
    # 
    # # Calculate the deviation in number of trips for each route
    # merged_routes['num_trips_deviation'] = merged_routes.apply(
    #     lambda row: len(standard_routes_df_[standard_routes_df_['route_id'] == row['route_id_std']]) - 1, axis=1
    # )
    # 
    # # Calculate the deviation in number of trips for each route
    # merged_routes['num_trips_deviation'] = merged_routes.apply(
    #     lambda row: len(standard_routes_df_[standard_routes_df_['route_id'] == row['route_id_std']]) - 1, axis=1
    # )

    # Count the frequency of each route in both standard and actual routes
    route_frequency_std = standard_routes_df_.groupby(['from', 'to']).size().reset_index(name='count_std')
    route_frequency_act = actual_routes_df_.groupby(['from', 'to']).size().reset_index(name='count_act')

    # Merge frequencies to compare
    frequency_comparison = pd.merge(
        route_frequency_std, route_frequency_act, on=['from', 'to'], how='outer').fillna(0)
    
    # Sort by the difference in frequency (indicating deviations)
    frequency_comparison['frequency_diff'] = frequency_comparison['count_act'] - frequency_comparison['count_std']
    sorted_frequency_comparison = frequency_comparison.sort_values(by='frequency_diff', ascending=False)

    return sorted_frequency_comparison

In [32]:
# Perform the analysis
route_deviations_analysis = analyze_route_deviations(standard_routes_df, actual_routes_df)

# Display the result
route_deviations_analysis.head()

Unnamed: 0,from,to,count_std,count_act,frequency_diff
75,Naples,Pescara,7.0,215,208.0
93,Parma,Verona,5.0,169,164.0
103,Pescara,Rome,5.0,162,157.0
44,Forlì,Rimini,4.0,145,141.0
30,Florence,Bologna,4.0,141,137.0


### 2.3. Analysis Step 2: Analyzing Driver Behavior

In [33]:
def analyze_driver_behavior(actual_routes_df_):
    """Analyze the behavior of drivers based on the actual routes they take."""
    # Group by driver and count the frequency of each route
    driver_route_frequency = actual_routes_df_.groupby(['driver', 'from', 'to']).size().reset_index(name='count')

    # Sort by driver and count to find the most common routes for each driver
    sorted_driver_routes = driver_route_frequency.sort_values(by=['driver', 'count'], ascending=[True, False])

    return sorted_driver_routes

In [34]:
# We need to add a 'driver' column to the actual routes dataframe for this analysis
# Adding a sample driver column to the actual_routes_df for demonstration
actual_routes_df['driver'] = actual_routes_df['route_id'].apply(lambda x: 'D' + str(int(x[-2:]) % 20 + 1))

# Perform the analysis
driver_behavior_analysis = analyze_driver_behavior(actual_routes_df)

# Display the result
driver_behavior_analysis.head()

Unnamed: 0,driver,from,to,count
252,D1,Verona,Brescia,14
173,D1,Prato,Florence,11
22,D1,Bologna,Forlì,9
61,D1,Florence,Livorno,8
136,D1,Padova,Verona,8
