In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time

In [2]:
# read in the datasets
train_data_path = '~/Projects/BookingWSDM/booking_train_set.csv'
unique_trips_path = '~/Projects/BookingWSDM/citiesvisitedpertrip.csv'

In [5]:
train_data = pd.read_csv(train_data_path)

In [6]:
# series that holds a list of cities visited on each trip
citiesvisited = train_data.groupby('utrip_id')['city_id'].agg(lambda x: list(set(x)))

In [28]:
citiesvisited.head(10)

utrip_id
1000027_1                          [15626, 30628, 60902, 8183]
1000033_1                         [21328, 52089, 27485, 38677]
1000045_1     [58178, 9608, 31817, 36170, 64876, 55128, 36063]
1000083_1                         [35160, 14705, 55990, 36063]
100008_1                     [12096, 6761, 11306, 65690, 6779]
1000097_1    [6306, 61187, 6788, 17127, 42503, 40521, 21033...
1000136_1                  [33540, 62541, 42482, 32627, 20345]
1000145_1                  [17764, 27112, 35850, 56651, 47499]
100018_1       [5797, 17830, 46854, 22065, 57619, 57658, 2748]
1000208_1                    [56872, 60143, 1910, 9278, 51999]
Name: city_id, dtype: object

Now, we want to create a dictionary where each city is the key, and the value is an array of length 2. At position [0] and [1], we need dictionaries of each city, but this time their value is initialized as 0. We can think of the dictionary at position [0] as a list of the number of incoming connections from each city, and the dictionary at position [1] as a list of the number of outgoing connections to each city.

In [29]:
# initialize dictionary that holds names of all cities
city_connections = {}.fromkeys(
    train_data['city_id'].unique(),
    [{}.fromkeys(train_data['city_id'].unique(), 0),
     {}.fromkeys(train_data['city_id'].unique(), 0)])

To demonstrate, if we pick city_id 58178 and check if there are any incoming connections from city_id 15626:

In [30]:
city_connections[58178][0][15626]

0

It should make sense for this to be 0, as we haven't added any of our connections to the city_connections dictionary yet.

Now, we're going to make a function that takes a city array, creates pairs, and passes these pairs to another function that will modify our dictionary entries by adding connections between pair's incoming and outgoing cities.

In [31]:
def addToDictionary(arr):
	'''
	this function will add arr[0] to arr[1]'s incoming cities list, 
	and add arr[1] to arr[0]'s outgoing cities dictionary
	'''
	global city_connections

	city_connections[arr[1]][0][arr[0]] += 1
	city_connections[arr[0]][1][arr[1]] += 1

In [32]:
def createPairs(arr):
	for i in range(len(arr) - 2):
		addToDictionary([arr[i], arr[i+1]])

In [33]:
citiesvisited.apply(lambda x: createPairs(x))

utrip_id
1000027_1    None
1000033_1    None
1000045_1    None
1000083_1    None
100008_1     None
             ... 
999776_1     None
999839_1     None
999842_1     None
999855_1     None
999944_1     None
Name: city_id, Length: 217686, dtype: object

Now, let's take a look at our city_connections - we know from previous analysis that the most popular city in Gondal is 36063, so let's check 36064's incoming connections:

In [34]:
city_connections[36063][0]

{31114: 357,
 39641: 102,
 20232: 98,
 24144: 1,
 5325: 1925,
 55: 539,
 23921: 5873,
 65322: 2737,
 20545: 36,
 37709: 12,
 11837: 11,
 19626: 7,
 62270: 11,
 1979: 94,
 3531: 63,
 55529: 774,
 5860: 411,
 50957: 633,
 60222: 152,
 23612: 65,
 17013: 1487,
 46794: 93,
 12884: 3,
 40875: 19,
 66657: 41,
 33667: 526,
 62185: 2622,
 17568: 9,
 6701: 32,
 67371: 64,
 40565: 328,
 28053: 3,
 6196: 98,
 13621: 6,
 65679: 234,
 67353: 324,
 14145: 1,
 11531: 3,
 57167: 0,
 29770: 4043,
 54603: 304,
 64960: 79,
 30458: 18,
 56590: 135,
 47499: 4063,
 10485: 708,
 11783: 346,
 56268: 52,
 41772: 21,
 64876: 4136,
 55128: 3623,
 9608: 4059,
 47752: 155,
 17127: 3718,
 36063: 326,
 60153: 4,
 33204: 128,
 14843: 79,
 47378: 670,
 14827: 1172,
 6788: 1281,
 9879: 9,
 4660: 88,
 29319: 4751,
 48483: 4588,
 13861: 56,
 62135: 44,
 25025: 1186,
 39820: 496,
 52818: 1578,
 15337: 563,
 60274: 937,
 4790: 259,
 49412: 6,
 58683: 12,
 51103: 1,
 66966: 12,
 15215: 737,
 47759: 1290,
 13260: 474,
 64155

... and the outgoing connections...

In [35]:
city_connections[36036][1]

{31114: 306,
 39641: 80,
 20232: 29,
 24144: 0,
 5325: 2426,
 55: 1134,
 23921: 6190,
 65322: 2740,
 20545: 2,
 37709: 20,
 11837: 26,
 19626: 6,
 62270: 60,
 1979: 124,
 3531: 90,
 55529: 371,
 5860: 368,
 50957: 962,
 60222: 811,
 23612: 160,
 17013: 3851,
 46794: 67,
 12884: 10,
 40875: 28,
 66657: 13,
 33667: 369,
 62185: 1487,
 17568: 0,
 6701: 32,
 67371: 78,
 40565: 490,
 28053: 6,
 6196: 200,
 13621: 15,
 65679: 256,
 67353: 378,
 14145: 0,
 11531: 5,
 57167: 1,
 29770: 3985,
 54603: 370,
 64960: 1,
 30458: 15,
 56590: 194,
 47499: 5197,
 10485: 1632,
 11783: 205,
 56268: 81,
 41772: 38,
 64876: 5807,
 55128: 3273,
 9608: 2705,
 47752: 79,
 17127: 2294,
 36063: 285,
 60153: 2,
 33204: 201,
 14843: 151,
 47378: 1005,
 14827: 1404,
 6788: 989,
 9879: 11,
 4660: 137,
 29319: 3516,
 48483: 2742,
 13861: 61,
 62135: 38,
 25025: 627,
 39820: 684,
 52818: 1846,
 15337: 361,
 60274: 1034,
 4790: 704,
 49412: 7,
 58683: 25,
 51103: 1,
 66966: 26,
 15215: 750,
 47759: 1421,
 13260: 524,


Now, we can use heapq.nlargest() to get the 10 cities with the highest numbers of outgoing connections for each city:

In [36]:
from heapq import nlargest
TenHighest = nlargest(10, city_connections[36036][1], key = city_connections.get)

In [42]:
for val in TenHighest:
    print(val, ":", city_connections[36036][1].get(val))

31114 : 306
39641 : 80
20232 : 29
24144 : 0
5325 : 2426
55 : 1134
23921 : 6190
65322 : 2740
20545 : 2
37709 : 20


This means that, when leaving from city_id 36036, there is the highest probability of going to city_id 23921 next - but how large is that probability? We can find out by dividing by the total number of outgoing connections from city_id 36036:

In [59]:
temp_total_outgoing = sum(city_connections[36036][1].values())

In [60]:
temp = city_connections[36036][1][23921]

In [61]:
temp/total_outgoing

0.010243002407684734