In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time

In [2]:
# read in the datasets
train_data_path = '~/Projects/BookingWSDM/booking_train_set.csv'
unique_trips_path = '~/Projects/BookingWSDM/citiesvisitedpertrip.csv'

In [3]:
train_data = pd.read_csv(train_data_path)

In [22]:
# series that holds a list of cities visited on each trip
citiesvisited = train_data.groupby('utrip_id')['city_id'].agg(lambda x: list(x))

In [23]:
citiesvisited.head(10)

utrip_id
1000027_1                          [8183, 15626, 60902, 30628]
1000033_1                  [38677, 52089, 21328, 27485, 38677]
1000045_1     [64876, 55128, 9608, 31817, 36170, 58178, 36063]
1000083_1                         [55990, 14705, 35160, 36063]
100008_1                     [11306, 12096, 6761, 6779, 65690]
1000097_1    [17127, 31088, 40521, 55128, 21033, 6306, 6788...
1000136_1                  [62541, 42482, 20345, 33540, 32627]
1000145_1                  [47499, 27112, 17764, 56651, 35850]
100018_1       [17830, 57619, 22065, 2748, 46854, 5797, 57658]
1000208_1                    [60143, 1910, 9278, 51999, 56872]
Name: city_id, dtype: object

#### NOTE - citiesvisited holds a list of all non-unique cities visited on each utrip_id. This means it also keeps track of trips in which the traveller moved hotels within the same city, and if they made a loop back through the city again in the same trip. ####

Now, we want to create a dictionary where each city is the key, and the value is an array of length 2. At position [0] and [1], we need dictionaries of each city, but this time their value is initialized as 0. We can think of the dictionary at position [0] as a list of the number of incoming connections from each city, and the dictionary at position [1] as a list of the number of outgoing connections to each city.

In [24]:
# initialize dictionary that holds names of all cities
city_connections = {}.fromkeys(
    train_data['city_id'].unique(),
    [{}.fromkeys(train_data['city_id'].unique(), 0),
     {}.fromkeys(train_data['city_id'].unique(), 0)])

To demonstrate, if we pick city_id 58178 and check if there are any incoming connections from city_id 15626:

In [25]:
city_connections[58178][0][15626]

0

It should make sense for this to be 0, as we haven't added any of our connections to the city_connections dictionary yet.

Now, we're going to make a function that takes a city array, creates pairs, and passes these pairs to another function that will modify our dictionary entries by adding connections between pair's incoming and outgoing cities.

In [26]:
def addToDictionary(arr):
	'''
	this function will add arr[0] to arr[1]'s incoming cities list, 
	and add arr[1] to arr[0]'s outgoing cities dictionary
	'''
	global city_connections

	city_connections[arr[1]][0][arr[0]] += 1
	city_connections[arr[0]][1][arr[1]] += 1

In [27]:
def createPairs(arr):
	for i in range(len(arr) - 2):
		addToDictionary([arr[i], arr[i+1]])

In [28]:
citiesvisited.apply(lambda x: createPairs(x))

utrip_id
1000027_1    None
1000033_1    None
1000045_1    None
1000083_1    None
100008_1     None
             ... 
999776_1     None
999839_1     None
999842_1     None
999855_1     None
999944_1     None
Name: city_id, Length: 217686, dtype: object

Now, let's take a look at our city_connections - we know from previous analysis that the most popular city in Gondal is 36063, so let's check 36064's incoming connections:

In [29]:
city_connections[36063][0]

{31114: 288,
 39641: 86,
 20232: 55,
 24144: 0,
 5325: 1956,
 55: 1141,
 23921: 7097,
 65322: 2447,
 20545: 26,
 37709: 15,
 11837: 30,
 19626: 4,
 62270: 116,
 1979: 180,
 3531: 72,
 55529: 659,
 5860: 495,
 50957: 816,
 60222: 4166,
 23612: 204,
 17013: 4680,
 46794: 59,
 12884: 5,
 40875: 20,
 66657: 33,
 33667: 548,
 62185: 2127,
 17568: 8,
 6701: 31,
 67371: 51,
 40565: 420,
 28053: 5,
 6196: 162,
 13621: 15,
 65679: 277,
 67353: 543,
 14145: 1,
 11531: 3,
 57167: 0,
 29770: 3465,
 54603: 340,
 64960: 55,
 30458: 33,
 56590: 317,
 47499: 6149,
 10485: 3436,
 11783: 555,
 56268: 57,
 41772: 27,
 64876: 4633,
 55128: 5170,
 9608: 3046,
 47752: 103,
 17127: 3994,
 36063: 5371,
 60153: 4,
 33204: 138,
 14843: 167,
 47378: 711,
 14827: 1118,
 6788: 1045,
 9879: 13,
 4660: 114,
 29319: 4520,
 48483: 3888,
 13861: 73,
 62135: 174,
 25025: 1349,
 39820: 553,
 52818: 1624,
 15337: 518,
 60274: 1035,
 4790: 715,
 49412: 8,
 58683: 31,
 51103: 15,
 66966: 27,
 15215: 781,
 47759: 1503,
 1326

... and the outgoing connections...

In [30]:
city_connections[36036][1]

{31114: 246,
 39641: 86,
 20232: 44,
 24144: 1,
 5325: 2122,
 55: 1533,
 23921: 7246,
 65322: 2527,
 20545: 24,
 37709: 16,
 11837: 32,
 19626: 6,
 62270: 127,
 1979: 201,
 3531: 120,
 55529: 825,
 5860: 483,
 50957: 980,
 60222: 5062,
 23612: 204,
 17013: 3109,
 46794: 31,
 12884: 8,
 40875: 30,
 66657: 34,
 33667: 594,
 62185: 1079,
 17568: 7,
 6701: 36,
 67371: 75,
 40565: 415,
 28053: 6,
 6196: 149,
 13621: 15,
 65679: 346,
 67353: 544,
 14145: 1,
 11531: 5,
 57167: 1,
 29770: 2857,
 54603: 379,
 64960: 77,
 30458: 30,
 56590: 128,
 47499: 3383,
 10485: 3279,
 11783: 600,
 56268: 86,
 41772: 32,
 64876: 3559,
 55128: 6181,
 9608: 3669,
 47752: 116,
 17127: 3285,
 36063: 3705,
 60153: 5,
 33204: 202,
 14843: 177,
 47378: 813,
 14827: 1471,
 6788: 1156,
 9879: 14,
 4660: 111,
 29319: 3569,
 48483: 2676,
 13861: 82,
 62135: 209,
 25025: 1006,
 39820: 525,
 52818: 914,
 15337: 441,
 60274: 994,
 4790: 613,
 49412: 5,
 58683: 23,
 51103: 14,
 66966: 22,
 15215: 809,
 47759: 870,
 13260:

Now, we can use heapq.nlargest() to get the 10 cities with the highest numbers of outgoing connections for each city:

In [31]:
from heapq import nlargest
TenHighest = nlargest(10, city_connections[36036][1], key = city_connections.get)

In [32]:
for val in TenHighest:
    print(val, ":", city_connections[36036][1].get(val))

31114 : 246
39641 : 86
20232 : 44
24144 : 1
5325 : 2122
55 : 1533
23921 : 7246
65322 : 2527
20545 : 24
37709 : 16


This means that, when leaving from city_id 36036, there is the highest probability of going to city_id 23921 next - but how large is that probability? We can find out by dividing by the total number of outgoing connections from city_id 36036:

In [33]:
temp_total_outgoing = sum(city_connections[36036][1].values())

In [34]:
temp = city_connections[36036][1][23921]

In [35]:
temp/temp_total_outgoing

0.009904644220149377

Looks like there's a lot of cities that pass through city_id 36036!!

We can also check the number of routes that had more than one 'leg' of trip spent in the same city but in a different hotel.

In [36]:
city_connections[36036][0][36036]

11

In [37]:
city_connections[36036][1][36036]

16

We can make sense of the fact that the incoming trips are smaller than the outgoing in that some of these trips may start in the city, and as such will not add a trip to city_id 36036's outgoing city's dictionary.