In [78]:
import pandas as pd
import numpy  as np
import time   # To calculate the execution time
import bisect # For the WeightedRandomGenerator
import random # For the generation of random numbers
from numpy.random import choice # For generation of ordered list based on weight
from collections import defaultdict  # For dictionary that memorize the cumulative probability of import/expor
import copy # For function deepcopy

# Seed both random generators used in this notebook (NumPy's global state and the
# stdlib `random` module) so that Restart & Run All regenerates the same synthetic data.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# The categories file was extracted beforehand from a food dataset and saved separately,
# because the original dataset was huge (about 1 GB).
product_list = pd.read_csv('categories.csv', delimiter=',', encoding='utf-8')

# Drop a few rows whose names are strange strings, identified by their row labels.
# NOTE(review): these labels are tied to the current categories.csv; re-check them if the file changes.
product_list = product_list.drop([123, 157, 440, 442])

# Keep only the product name and renumber the rows 0..n-1, because the drop above
# leaves non-consecutive labels behind.
final_product_list = product_list[['Name']].reset_index(drop=True)

# Give every product a random popularity score drawn uniformly from [0, 1).
final_product_list['Popularity'] = np.random.random_sample(len(final_product_list))

In [79]:
# Load the world-cities table.
cities = pd.read_csv('world-cities.csv', delimiter=',', encoding='utf-8')

# Number of cities kept for the simulation.
nCities = 100

# Pick nCities random rows and renumber them 0..nCities-1, since sampling leaves
# non-consecutive labels behind.
cities = cities.sample(nCities).reset_index(drop=True)

# One-column frame holding only the city name, exposed as 'Name'.
city_list = cities[['name']].rename(columns={'name': 'Name'})

In [80]:
#city_list["Export"] = ""
#city_list["Import"] = ""

In [81]:
# Bounds for the number of exported (5:40) and imported (20:100) products per city.
# They are only referenced by the commented-out sampling loop below, which passes them
# to np.random.randint, so the upper bound of each pair is exclusive there.
minExp, maxExp = 5, 40
minImp, maxImp = 20, 100

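# Earlier per-city sampling loop, kept commented out for reference (it appears to be superseded
# by the importance lists built in the next cell). Each city is handled independently, so this
# loop could be run in parallel, e.g. with Spark.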
#start = time.time()
#for c in range(0,len(city_list)):
#    num_exp = np.random.randint(minExp,maxExp)
#    num_imp = np.random.randint(minImp,maxImp)
#    exp = np.unique(final_product_list.sample(num_exp, weights=final_product_list.Popularity).Name)
#    possible_imp_products = final_product_list[~final_product_list['Name'].isin(exp)]
#    imp = np.unique(possible_imp_products.sample(num_imp, weights=possible_imp_products.Popularity).Name)
#    city_list.loc[c,'Export'] = exp
#    city_list.loc[c,'Import'] = imp
#print(time.time()-start)

In [90]:
# For every city, mix the global product popularity with a per-city random coefficient
# in [0, 1) to obtain two sampling distributions: one for exports and one for imports.
nProducts = len(final_product_list)
rand = [np.random.random_sample(nProducts) for _ in range(len(city_list))]
popularity = final_product_list.Popularity
name = final_product_list.Name

# Export weight is popularity * r; import weight is popularity^2 * (1 - r).
# Each weight vector is then normalised so that it sums to 1.
priority = [(popularity * coeff, popularity**2 * (1 - coeff)) for coeff in rand]
probabilityExport = [expWeight / sum(expWeight) for expWeight, _ in priority]
probabilityImport = [impWeight / sum(impWeight) for _, impWeight in priority]


def _with_rank_coefficient(orderedProducts):
    """Pair each product with 1 - position/len: the first gets 1.0, the last gets 1/len."""
    total = len(orderedProducts)
    return [(product, 1 - position / total) for position, product in enumerate(orderedProducts)]


# Weighted shuffle of all products for each city (sampling without replacement), stored
# on the city together with a coefficient that decreases linearly with the rank.
productsOrderExp = [choice(name, nProducts, p=prob, replace=False) for prob in probabilityExport]
city_list["ImportanceExport"] = [_with_rank_coefficient(order) for order in productsOrderExp]

productsOrderImp = [choice(name, nProducts, p=prob, replace=False) for prob in probabilityImport]
city_list["ImportanceImport"] = [_with_rank_coefficient(order) for order in productsOrderImp]

In [91]:
# Preview only the first rows: each cell holds one (product, coefficient) pair per product,
# so dumping the whole frame is not useful.
city_list.head(10)

Unnamed: 0,Name,ImportanceExport,ImportanceImport
0,Tegucigalpa,"[(Milk-chocolate-digestives, 1.0), (Corn-snack...","[(Grated-parmesan-cheese, 1.0), (Pumpkin-seed-..."
1,Deoli,"[(Fruit-sticks, 1.0), (Complements-alimentaire...","[(Churned-cream, 1.0), (Fresh-coconut, 0.99921..."
2,Enterprise,"[(Fat-free-milk, 1.0), (Fruit-sticks, 0.999216...","[(Organic-and-premium-pepper-cambodia, 1.0), (..."
3,Adjumani,"[(Muffin-mix, 1.0), (Fecules, 0.99921630094043...","[(Fruit-bread, 1.0), (Grattons, 0.999216300940..."
4,Muskegon,"[(Hot-pepper-sauce, 1.0), (Chicken-kiev, 0.999...","[(Tarama, 1.0), (Wholefood, 0.9992163009404389..."
5,Toamasina,"[(Yaourt-aux-fruits, 1.0), (Tiefkuhlpizza, 0.9...","[(Milk substitute, 1.0), (Wheatflour, 0.999216..."
6,Iradan,"[(Churned-cream, 1.0), (Corn-snacks, 0.9992163...","[(Brie-cheese, 1.0), (Bakery, 0.99921630094043..."
7,Guanajuato,"[(Coconut-cream, 1.0), (Pancake-syrup, 0.99921...","[(Frozen-potato-products, 1.0), (Dilute-drink,..."
8,Luqiao,"[(Levure, 1.0), (Schwarztees, 0.99921630094043...","[(Cinnamon-roll, 1.0), (Hommus, 0.999216300940..."
9,Katima Mulilo,"[(Still-water, 1.0), (Gnocchi, 0.9992163009404...","[(Pate-a-tartiner-pomme-poire, 1.0), (Mais-a-p..."


In [188]:
def standard_info(i, rid, tcd, trip, final_trip):
    """Write the identifying columns of leg ``i`` into ``final_trip`` (modified in place).

    Parameters
    ----------
    i : row label of the leg; also stored in the 'Order' column.
    rid : value stored in 'RouteId'.
    tcd : value stored in 'TruckCode'.
    trip : frame with an 'IdCity' column; rows ``i`` and ``i + 1`` give the start and
        end city of the leg.
    final_trip : frame receiving the values at row ``i`` (the row is created if missing).
    """
    # Plain values first, in the same column order as the frame.
    for column, value in (('RouteId', rid), ('Order', i), ('TruckCode', tcd)):
        final_trip.loc[i, column] = value
    # Leg i goes from the city at position i to the city at position i + 1.
    for column, offset in (('StartCity', 0), ('EndCity', 1)):
        final_trip.loc[i, column] = trip.loc[i + offset, 'IdCity']

def first_step_creation(rid, tcd, length, trip):
    """Create the route table of a trip and choose the items carried on leg ``length - 2``.

    Row 0 receives its route/truck/city columns through ``standard_info``; the 'Items'
    cell of row ``length - 2`` receives a random set of products. When ``trip`` has
    ``length`` rows (as built by ``generate_trip``) that row is the last leg of the route.

    Parameters
    ----------
    rid : route id stored in the table.
    tcd : truck code stored in the table.
    length : number of cities in the trip.
    trip : frame with columns 'IdCity', 'ProductImportance' and 'PopularImport'; the
        last two hold lists of (product, weight) pairs.

    Returns
    -------
    The newly created route table (a DataFrame).
    """
    final_trip = pd.DataFrame(columns=['RouteId', 'Order', 'TruckCode', 'StartCity', 'EndCity', 'Items'])
    standard_info(0, rid, tcd, trip, final_trip)

    lastLeg = length - 2

    # Index the import weights by product once, instead of rescanning the whole import
    # list for every product. A list per product keeps repeated names, if any, as
    # separate candidates.
    importWeights = {}
    for product, weight in trip.loc[lastLeg, "PopularImport"]:
        importWeights.setdefault(product, []).append(weight)

    # Candidate score = export importance accumulated along the trip * import importance.
    itemsToImport = [(product, importance * weight)
                     for product, importance in trip.loc[lastLeg, "ProductImportance"]
                     for weight in importWeights.get(product, ())]
    tot = sum(score for _, score in itemsToImport)
    itemList = [product for product, _ in itemsToImport]
    itemValues = [score / tot for _, score in itemsToImport]

    # Between 5 and 14 items (randint excludes the upper bound), capped at the number of
    # candidates so that sampling without replacement cannot fail when few are available.
    numItems = min(np.random.randint(5, 15), len(itemList))
    exportedItems = choice(itemList, numItems, p=itemValues, replace=False)
    final_trip.loc[lastLeg, 'Items'] = set(exportedItems)
    return final_trip
    
def trip_creation(i, rid, tcd, length, trip, final_trip):
    """Fill in one more leg of a route table, working backwards from the end of the trip.

    Each call writes two things into ``final_trip`` (modified in place and returned):
    the route/truck/city columns of row ``i - 1`` (through ``standard_info``), and the
    'Items' cell of row ``crev = length - i - 1``, derived from the items already stored
    for row ``crev + 1``.

    Parameters
    ----------
    i : step counter; ``generate_trip`` calls this with 2 .. length - 1.
    rid : route id stored in the table.
    tcd : truck code stored in the table.
    length : number of cities in the trip.
    trip : frame with columns 'IdCity', 'ProductImportance' and 'PopularImport'.
    final_trip : route table under construction; row ``crev + 1`` must already have 'Items'.

    Returns
    -------
    The same ``final_trip`` object.
    """
    standard_info(i - 1, rid, tcd, trip, final_trip)
    crev = length - i - 1

    # Export priorities of the city reached at the end of leg `crev`.
    # The lookup is by name, so if two sampled cities share a name the first one is used.
    endCity = trip.loc[crev + 1, 'IdCity']
    exportPriority = city_list.loc[city_list.Name.tolist().index(endCity), 'ImportanceExport']

    # Each product is removed with probability equal to its export coefficient in that city;
    # whatever is left of the next leg's load is also carried on this leg.
    removeItems = [product for product, coefficient in exportPriority if random.random() < coefficient]
    keptItems = set(final_trip.loc[crev + 1, 'Items']).difference(removeItems)

    # Number of new items added on this leg: 3 to 14 (randint excludes the upper bound).
    numItems = np.random.randint(3, 15)

    # NOTE(review): the candidate weights are always read from row `length - 2` of `trip`,
    # exactly as in first_step_creation, and not from row `crev`. This may be a copy-paste
    # leftover; behaviour is kept unchanged here. Confirm the intended row.
    sourceRow = length - 2

    # Index the import weights by product once, instead of rescanning the whole import
    # list for every product. A list per product keeps repeated names, if any, as
    # separate candidates.
    importWeights = {}
    for product, weight in trip.loc[sourceRow, "PopularImport"]:
        importWeights.setdefault(product, []).append(weight)

    # Candidate score = export importance accumulated along the trip * import importance.
    itemsToImport = [(product, importance * weight)
                     for product, importance in trip.loc[sourceRow, "ProductImportance"]
                     for weight in importWeights.get(product, ())]
    tot = sum(score for _, score in itemsToImport)
    itemList = [product for product, _ in itemsToImport]
    itemValues = [score / tot for _, score in itemsToImport]

    # Cap the sample size so that sampling without replacement cannot fail when there
    # are fewer candidates than requested.
    numItems = min(numItems, len(itemList))
    exportedItems = choice(itemList, numItems, p=itemValues, replace=False)
    final_trip.loc[crev, 'Items'] = keptItems.union(exportedItems)
    return final_trip

In [176]:
# Trip length is drawn with np.random.randint(minLen, maxLen); the upper bound is
# excluded, so a trip visits between 3 and 9 cities.
minLen, maxLen = 3, 10
# Number of trucks: truck codes are drawn from 0 .. numTruck - 1.
numTruck = 50

def generate_trip(i):
    """Generate the route table of one random trip.

    A trip visits ``length`` distinct cities of ``city_list`` (``length`` is drawn from
    ``randint(minLen, maxLen)``, upper bound excluded). For each visited city the helper
    frame ``trip`` stores the city name, the export importance accumulated over the cities
    visited so far, and the city's import importance. The route table is then filled
    by ``first_step_creation`` and ``trip_creation``.

    Parameters
    ----------
    i : route id written into the 'RouteId' column.

    Returns
    -------
    DataFrame with columns 'RouteId', 'Order', 'TruckCode', 'StartCity', 'EndCity' and
    'Items', sorted by 'Order'.

    Raises
    ------
    ValueError
        If the drawn length exceeds the number of available cities.
    """
    length = np.random.randint(minLen, maxLen)

    # Draw all distinct city positions in one call. This replaces a retry-until-unseen
    # loop that would never terminate if `length` were larger than the number of cities.
    visited = choice(len(city_list), length, replace=False)

    # Running total of export importance per product over the cities visited so far.
    Exports = defaultdict(float)
    records = []
    for city in visited:
        for product, coefficient in city_list.loc[city, 'ImportanceExport']:
            Exports[product] += coefficient
        records.append({
            'IdCity': city_list.loc[city, 'Name'],
            # Snapshot of the running totals at this point of the trip.
            'ProductImportance': list(Exports.items()),
            'PopularImport': city_list.loc[city, 'ImportanceImport'],
        })

    # Build the frame once from the collected rows instead of growing it cell by cell.
    trip = pd.DataFrame(records, columns=['IdCity', 'ProductImportance', 'PopularImport'])

    rid = i
    tcd = np.random.randint(0, numTruck)
    final_trip = first_step_creation(rid, tcd, length, trip)
    for c in range(2, length):
        final_trip = trip_creation(c, rid, tcd, length, trip, final_trip)
    return final_trip.sort_values('Order')

In [138]:
# Stand-alone copy of the trip-building loop of generate_trip, run at module level so that
# a single `trip` frame can be inspected.
# NOTE(review): this cell rebinds the module-level name `cities` (the DataFrame loaded
# earlier) to a plain list of row positions.

# Trip length is drawn with randint(minLen, maxLen), upper bound excluded; 50 trucks.
minLen, maxLen = 3, 10
numTruck = 50

length = np.random.randint(minLen, maxLen)
trip = pd.DataFrame(columns=['IdCity', 'ProductImportance', 'PopularImport'])
Exports = defaultdict(int)
cities = []
for c in range(length):
    # Keep drawing until we hit a city that is not yet part of this trip.
    while True:
        city = np.random.randint(0, len(city_list))
        if city not in cities:
            break
    cities.append(city)
    trip.loc[c, 'IdCity'] = city_list.loc[city, 'Name']
    # Accumulate the export importance of every product over the cities visited so far.
    for product, coefficient in city_list.loc[city, 'ImportanceExport']:
        Exports[product] += coefficient
    trip.loc[c, 'ProductImportance'] = list(Exports.items())
    trip.loc[c, 'PopularImport'] = city_list.loc[city, 'ImportanceImport']

In [183]:
# Generate n trips
def generate_trips(n):
    """Return a list with the route tables of ``n`` trips, using 0 .. n - 1 as route ids."""
    trips = []
    for routeId in range(n):
        trips.append(generate_trip(routeId))
    return trips

In [189]:
# Number of routes to simulate; `routes` holds one route table (DataFrame) per route.
numberOfRoutes = 150
routes = generate_trips(n=numberOfRoutes)

In [190]:
# Show only the first couple of route tables instead of dumping the whole list.
routes[:2]

[  RouteId Order TruckCode    StartCity      EndCity  \
 0       0     0        26      Midland    Pocatello   
 1       0     1        26    Pocatello  Srīrāmnagar   
 2       0     2        26  Srīrāmnagar       Lorica   
 3       0     3        26       Lorica     Adjumani   
 4       0     4        26     Adjumani      Stebnyk   
 
                                                Items  
 0  {Goat-milk-cheese, Kartoffelchips, Chocolate-b...  
 1  {Goat-milk-cheese, Whole-bean-coffee, Nectar-d...  
 2  {Soupes, Whole-bean-coffee, Chocolate-block, M...  
 3  {Orange-chocolate, Seabass, Soupes, Whole-bean...  
 4  {Citrus-punch, Chocolate-block, Pork-chipolata...  ,
   RouteId Order TruckCode           StartCity             EndCity  \
 0       1     0        29            Griffith             Jaunpur   
 1       1     1        29             Jaunpur              Quetta   
 2       1     2        29              Quetta            Rāmnagar   
 3       1     3        29            Rāmnaga

In [191]:
# Stack the per-route tables into a single frame. Row labels are kept as they are, so
# every route contributes its own 0..k labels and labels repeat across routes.
all_trips = pd.concat(objs=routes)

In [192]:
# Persist all legs of all routes; this CSV is the input of the mining part.
all_trips.to_csv(path_or_buf="route_trips.csv")