In [74]:
import pandas as pd
import numpy  as np
import time   # To calculate the execution time
import bisect # For the WeightedRandomGenerator
import random # For the generation of random numbers

# The product names come from a food dataset; only the extracted categories are
# stored here because the original dataset was huge (1GB).
product_list = pd.read_csv('categories.csv', sep=',', encoding='utf-8')

# Drop a few rows that contain strange strings
product_list = product_list.drop([123, 157, 440, 442])

# Keep only the product name, renumber the rows consecutively (the drop above
# leaves gaps in the index) and attach a random popularity score to each product
final_product_list = product_list['Name'].to_frame().reset_index(drop=True)
final_product_list['Popularity'] = np.random.random_sample(len(final_product_list))

In [75]:
# Number of cities kept for the simulation
nCities = 100

# Load the world cities and keep a random sample of nCities of them, renumbering
# the rows so that the index is consecutive
cities = pd.read_csv('world-cities.csv', sep=',', encoding='utf-8')
cities = cities.sample(nCities).reset_index(drop=True)

# One-column frame holding the city names; Export/Import columns are added later
city_list = cities['name'].to_frame(name='Name')

In [76]:
#city_list["Export"] = ""
#city_list["Import"] = ""

In [77]:
# Bounds for the number of exported and imported products drawn for each city.
# They are passed to np.random.randint(low, high), whose upper bound is exclusive,
# so exports are drawn from 5..39 and imports from 20..99.
minExp, maxExp = 5, 40
minImp, maxImp = 20, 100

# THIS PART CAN BE RUN IN PARALLEL (SPARK)
#start = time.time()
#for c in range(0,len(city_list)):
#    num_exp = np.random.randint(minExp,maxExp)
#    num_imp = np.random.randint(minImp,maxImp)
#    exp = np.unique(final_product_list.sample(num_exp, weights=final_product_list.Popularity).Name)
#    possible_imp_products = final_product_list[~final_product_list['Name'].isin(exp)]
#    imp = np.unique(possible_imp_products.sample(num_imp, weights=possible_imp_products.Popularity).Name)
#    city_list.loc[c,'Export'] = exp
#    city_list.loc[c,'Import'] = imp
#print(time.time()-start)

In [78]:
# A class for the generation of random weighted lists of products
class WeightedRandomGenerator(object):
    """Draw random positions with probability proportional to their weight.

    The constructor stores the running (cumulative) sum of ``weights`` in
    ``self.totals``. Each draw picks a uniform number in ``[0, totals[-1])`` and
    uses bisection to find the position whose cumulative interval contains it.

    Parameters
    ----------
    weights : iterable of numbers
        Non-negative weights, at least one of which must be positive.

    Raises
    ------
    ValueError
        If ``weights`` is empty, contains a negative value, or sums to zero.
        Without this check the generator would only fail later (IndexError on
        ``totals[-1]``) or return the out-of-range position ``len(weights)``.
    """

    def __init__(self, weights):
        self.totals   = []
        running_total = 0
        for weight in weights:
            # A negative weight would make the cumulative sums non-monotonic,
            # which breaks the bisection done in next()
            if weight < 0:
                raise ValueError("weights must be non-negative")
            running_total += weight
            self.totals.append(running_total)
        # "not > 0" also rejects a NaN total
        if not self.totals or not running_total > 0:
            raise ValueError("weights must contain at least one positive value")

    def next(self):
        """Return a random position in ``range(len(weights))``."""
        rnd = random.random() * self.totals[-1]
        # bisect_right skips positions whose cumulative total equals rnd, so
        # zero-weight entries are never selected
        return bisect.bisect_right(self.totals, rnd)

    def __call__(self):
        """Shortcut for :meth:`next`."""
        return self.next()

# Produce the lists of exported and imported products for one city
def importExportGenerator():
    """Draw a random set of exported and imported products.

    Reads the module-level ``minExp``/``maxExp`` and ``minImp``/``maxImp`` bounds,
    the ``final_product_list`` frame (column ``Name``) and the ``randomGenerator``
    callable, which must return a position into ``final_product_list.Name``.

    Duplicate draws are retried until the requested number of distinct products
    is reached. (Decrementing the loop variable of a ``for ... in range()`` loop
    has no effect on the iteration, so it cannot be used to retry.) The requested
    counts are capped by the number of distinct product names so the retry loops
    cannot spin forever on a small product list.

    NOTE: products whose weight in ``randomGenerator`` is zero can never be
    drawn; the cap assumes every distinct name is drawable.

    Returns
    -------
    dict
        ``{'exp': [...], 'imp': [...]}`` with product names in draw order; no
        product appears twice and no imported product is also exported.
    """
    names     = final_product_list.Name
    available = len(set(names))

    # Same draw order as before: first the export count, then the import count
    num_exp = min(np.random.randint(minExp, maxExp), available)            # How many export
    num_imp = min(np.random.randint(minImp, maxImp), available - num_exp)  # How many import

    # Draw distinct products to export
    exp      = []
    exported = set()  # Mirror of exp for O(1) membership tests
    while len(exp) < num_exp:
        product = names[int(randomGenerator())]
        if product not in exported:
            exported.add(product)
            exp.append(product)

    # Draw distinct products to import, skipping anything already exported
    imp   = []
    taken = set(exported)
    while len(imp) < num_imp:
        product = names[int(randomGenerator())]
        if product not in taken:
            taken.add(product)
            imp.append(product)

    return {'exp': exp, 'imp': imp}
    
start = time.time()

# Sampler used by importExportGenerator(): product positions weighted by popularity
randomGenerator = WeightedRandomGenerator(final_product_list.Popularity)

# One (exports, imports) pair per city
trades = [importExportGenerator() for i in range(len(city_list))]
values = [(trade['exp'], trade['imp']) for trade in trades]

# Split the pairs into the two columns stored on city_list
exp = [pair[0] for pair in values]
imp = [pair[1] for pair in values]
city_list['Export'] = exp
city_list['Import'] = imp

print(time.time() - start)

0.20741724967956543


In [79]:
def standard_info(i, rid, tcd, trip, final_trip):
    """Write the bookkeeping columns of leg ``i`` into ``final_trip`` (in place).

    Parameters
    ----------
    i : row label of the leg; also stored as its ``Order``.
    rid : value stored in ``RouteId``.
    tcd : value stored in ``TruckCode``.
    trip : frame with an ``IdCity`` column; rows ``i`` and ``i + 1`` are read.
    final_trip : frame that receives the values at row ``i``.

    Returns nothing; ``final_trip`` is modified.
    """
    for column, value in (('RouteId', rid), ('Order', i), ('TruckCode', tcd)):
        final_trip.loc[i, column] = value
    # Leg i goes from the i-th city of the trip to the following one
    final_trip.loc[i, 'StartCity'] = trip.loc[i, 'IdCity']
    final_trip.loc[i, 'EndCity'] = trip.loc[i + 1, 'IdCity']

def first_step_creation(rid, tcd, length, trip):
    """Create the result frame of a route and choose the items of its last leg.

    Row 0 of the returned frame gets its bookkeeping columns through
    ``standard_info``; the ``Items`` cell of row ``length - 2`` (the last leg) is
    filled with a random set of products.

    The products are sampled from the ``ExportableItems`` of row ``length - 2``
    of ``trip``. Products that also appear in the ``PopularImport`` list of the
    final city (row ``length - 1``) are added five extra times to the sampling
    pool, so they are more likely to be picked.

    Parameters
    ----------
    rid, tcd : route id and truck code, forwarded to ``standard_info``.
    length : number of cities in ``trip``.
    trip : frame with ``IdCity``, ``ExportableItems`` and ``PopularImport``
        columns. It is not modified: the weighted pool is a local list.

    Returns
    -------
    pandas.DataFrame
        Frame with columns RouteId, Order, TruckCode, StartCity, EndCity, Items.

    Raises
    ------
    ValueError
        From ``np.random.randint`` when the sampling pool is empty.
    """
    final_trip = pd.DataFrame(columns = ['RouteId', 'Order', 'TruckCode', 'StartCity', 'EndCity', 'Items'])
    standard_info(0, rid, tcd, trip, final_trip)

    lastLeg    = length - 2
    exportable = trip.loc[lastLeg, "ExportableItems"]

    # Products the destination city imports get five extra entries in the pool
    maxProbItems = list(set(trip.loc[length - 1, "PopularImport"]).intersection(exportable))
    pool         = list(exportable) + maxProbItems * 5

    # The range is normally 5..14 (randint excludes the upper bound); cities with
    # few exportable items use half..all of the pool instead. Integer division
    # keeps the lower bound an int.
    numItems = np.random.randint(min(5, len(pool) // 2),
                                 min(15, len(pool)))
    # The pool contains repeated entries, so the resulting set may hold fewer
    # than numItems products
    sample   = set(random.sample(pool, numItems))
    final_trip.loc[lastLeg, 'Items'] = sample
    return final_trip
    
def trip_creation(i, rid, tcd, length, trip, final_trip, minItems=3, maxItems=15):
    """Fill leg ``i - 1`` bookkeeping and the items of leg ``length - i - 1``.

    Legs are filled backwards: the items of leg ``crev = length - i - 1`` are the
    items of the following leg (``crev + 1``) that were NOT exported by the start
    city of that following leg, plus a random sample of the products exportable
    at leg ``crev``.

    Parameters
    ----------
    i : step counter; the caller loops it over ``range(2, length)``.
    rid, tcd : route id and truck code, forwarded to ``standard_info``.
    length : number of cities in ``trip``.
    trip : frame with ``IdCity`` and ``ExportableItems`` columns.
    final_trip : frame being built; modified in place and returned.
    minItems, maxItems : bounds passed to ``np.random.randint`` for the number
        of sampled products (upper bound exclusive). When the city has
        ``minItems`` or fewer exportable entries, all of them are sampled
        instead of raising.

    Returns
    -------
    pandas.DataFrame
        ``final_trip``.
    """
    standard_info(i - 1, rid, tcd, trip, final_trip)
    crev = length - i - 1

    # Export list of the city where the following leg starts (first match by
    # name); those items were loaded there, so they are not on the truck yet
    nextStart         = trip.loc[crev + 1, 'IdCity']
    cityRow           = list(city_list.Name).index(nextStart)
    loadedAtNextStart = set(city_list.loc[cityRow, 'Export'])
    carriedOver       = set(final_trip.loc[crev + 1, 'Items']) - loadedAtNextStart

    exportable = trip.loc[crev, "ExportableItems"]
    upper      = min(maxItems, len(exportable))
    lower      = min(minItems, upper)
    # randint(low, high) requires low < high; with too few entries take them all
    numItems   = np.random.randint(lower, upper) if lower < upper else upper
    sample     = set(random.sample(exportable, numItems))
    final_trip.loc[crev, 'Items'] = carriedOver.union(sample)
    return final_trip

In [80]:
# Bounds for the number of cities in a trip and the number of trucks.
# Both are used with np.random.randint, whose upper bound is exclusive:
# trips have 3..9 cities and truck codes run from 0 to 999.
minLen, maxLen = 3, 10
numTruck = 1000

def generate_trip(i):
    """Generate one route through randomly chosen, distinct cities.

    Reads the module-level ``minLen``/``maxLen``, ``numTruck`` and ``city_list``
    (columns ``Name``, ``Export``, ``Import``).

    Parameters
    ----------
    i : value used as the ``RouteId`` of the route.

    Returns
    -------
    pandas.DataFrame
        One row per leg (columns RouteId, Order, TruckCode, StartCity, EndCity,
        Items), sorted by ``Order``.

    Raises
    ------
    ValueError
        From ``np.random.choice`` when the trip length exceeds the number of
        cities (a rejection loop would never terminate in that case).
    """
    length = np.random.randint(minLen, maxLen)

    # Distinct city positions, in visiting order
    visited = np.random.choice(len(city_list), size=length, replace=False)

    # Build the rows first and create the frame once. This avoids growing it
    # cell by cell with .loc and storing list values through .loc assignment.
    rows  = []
    items = []  # Everything exported by the cities visited so far
    for city in visited:
        items.extend(city_list.loc[city, 'Export'])
        rows.append({'IdCity':          city_list.loc[city, 'Name'],
                     'ExportableItems': items.copy(),
                     'PopularImport':   city_list.loc[city, 'Import']})
    trip = pd.DataFrame(rows, columns = ['IdCity', 'ExportableItems', 'PopularImport'])

    rid = i
    tcd = np.random.randint(0, numTruck)
    # The last leg is filled first, then the remaining legs going backwards
    final_trip = first_step_creation(rid, tcd, length, trip)
    for c in range(2, length):
        final_trip = trip_creation(c, rid, tcd, length, trip, final_trip)
    final_trip = final_trip.sort_values('Order')
    return final_trip

In [81]:
# Generate n trips
def generate_trips(n):
    """Return a list with ``n`` trips; trip ``k`` is ``generate_trip(k)``."""
    generated = []
    for route_id in range(n):
        generated.append(generate_trip(route_id))
    return generated

In [82]:
# Number of routes to generate; route ids run from 0 to numberOfTrips - 1
numberOfTrips = 100
trips         = generate_trips(numberOfTrips)

In [83]:
# Display the generated trips (a list with one DataFrame per route)
trips

[  RouteId Order TruckCode         StartCity           EndCity  \
 0       0     0       602            Shiyan          Zirndorf   
 1       0     1       602          Zirndorf              Ippy   
 2       0     2       602              Ippy            Azazga   
 3       0     3       602            Azazga  Sesto Fiorentino   
 4       0     4       602  Sesto Fiorentino            Kakata   
 
                                                Items  
 0  {Pralines, Scottish-shortbread, Olives-denoyau...  
 1  {Pralines, White-coconut-chocolate, Scottish-s...  
 2  {Pralines, White-coconut-chocolate, Scottish-s...  
 3  {Piccalilli, Sundried-tomatoes, Turkish-fermen...  
 4  {Peanut-in-chocolate, Low-fat-high-calcium-mil...  ,
   RouteId Order TruckCode      StartCity        EndCity  \
 0       1     0       626        Caracas   Korrewegwijk   
 1       1     1       626   Korrewegwijk      Chandauli   
 2       1     2       626      Chandauli  McKinleyville   
 3       1     3       62

In [84]:
# Stack the per-route frames into a single table. Row labels are kept as they
# are in each route frame, so the same label can appear once per route.
all_trips = pd.concat(trips)

In [85]:
# Save the trips for the mining part.
# NOTE(review): to_csv writes the row labels as an extra first column by default
# and the Items sets are stored as text - confirm the mining step expects this.
all_trips.to_csv("route_trips.csv")