In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import datetime as dt
import time

After the imports, read in the datasets.<br>(Note that we now have both train_set AND test_set. Should we consider including the test_set in our training data, so that the model has more examples to train on?)

In [2]:
# locations of the booking datasets, relative to this notebook
train_data_path = '../csv_files/booking_train_set.csv'
test_data_path = '../csv_files/booking_test_set.csv'

# load each CSV file into its own pandas DataFrame
# NOTE(review): later previews show an 'Unnamed: 0' column, which looks like an
# index that was written into the CSV - confirm whether it should be kept
train_data, test_data = [pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)]

### adding necessary columns to the datasets ###

In [3]:
# convert the check-in / check-out columns of both frames to pandas datetimes
for df in (train_data, test_data):
    for date_column in ('checkin', 'checkout'):
        df[date_column] = pd.to_datetime(df[date_column])

In [4]:
# length of each 'leg' of a trip: whole days between check-in and check-out,
# added to both the train and the test frame
for df in (train_data, test_data):
    stay_duration = df['checkout'] - df['checkin']
    df['length_of_leg'] = stay_duration.dt.days

### stripping out target variable Y from train_data and creating training data w/o target variable ###

In [5]:
# Series indexed by utrip_id; each value is the list of city_ids for that trip,
# in the order the rows appear in train_data
# NOTE(review): assumes the rows of a trip are already in visit order - confirm
orderofcitiesvisitedpertrip = train_data.groupby('utrip_id')['city_id'].agg(list)
orderofcitiesvisitedpertrip

utrip_id
1000027_1                          [8183, 15626, 60902, 30628]
1000033_1                  [38677, 52089, 21328, 27485, 38677]
1000045_1     [64876, 55128, 9608, 31817, 36170, 58178, 36063]
1000083_1                         [55990, 14705, 35160, 36063]
100008_1                     [11306, 12096, 6761, 6779, 65690]
                                   ...                        
999776_1                          [17775, 66634, 17775, 17775]
999839_1                            [8335, 21328, 8335, 48968]
999842_1                          [51291, 66969, 67169, 24036]
999855_1     [382, 38509, 18930, 38509, 51145, 11179, 61881...
999944_1                            [17944, 47075, 228, 62930]
Name: city_id, Length: 217686, dtype: object

In [6]:
# final element of every trip's city list, still indexed by utrip_id
lastcityoftripseries = orderofcitiesvisitedpertrip.apply(lambda visited: visited[-1])

# the same information as two parallel plain-Python lists
lastcityoftrip = lastcityoftripseries.to_list()
utrip_ids = lastcityoftripseries.index.to_list()

In [7]:
# Y_train holds the target for every trip as (utrip_id, last city visited) pairs
Y_train = [(trip_id, final_city) for trip_id, final_city in zip(utrip_ids, lastcityoftrip)]

In [8]:
# index labels of the last row of every trip - these rows carry the target city
# (computed once and reused, instead of running the groupby twice)
samples_to_drop = train_data.groupby('utrip_id').tail(1).index

# X_train holds every leg of every trip except the last stop
X_train = train_data.drop(index=samples_to_drop)
samples_to_drop

Int64Index([      3,       9,      13,      17,      22,      26,      32,
                 36,      41,      45,
            ...
            1166780, 1166792, 1166797, 1166802, 1166812, 1166816, 1166821,
            1166826, 1166830, 1166834],
           dtype='int64', length=217686)

#### Goals for tonight's meeting ####
<ul>
    <li> drop unnecessary columns from X_train <b>user_id [affiliate_id?]</b></li>
    <li> one-hot-encode the following columns:</li>
    <li> week_of_year (1 of 52)</li>
    <li> days_of_week (1 of 7)</li>
    <li> <b>weekday_weekend</b> (this might be more helpful as a metric rather than days of the week)</li>
    <li> device_class (1 of 3)</li>
    <li> booker_country (1 of 5)</li>
    <li> hotel_country (1 of 195)</li>
    <li> city_id (1 of 39901)</li>
    <li> length_of_leg (normalized)</li>
</ul>

### ideas ###
<ul>
    <li> train on country, and predict top 4 cities within that country</li>
    <li></li>
</ul>

In [9]:
# preview the first ten rows of the training features (last leg of each trip removed)
X_train.head(10)

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,length_of_leg
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1,2
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1,1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1,4
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1
5,5,1010293,2016-07-10,2016-07-11,55,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1
6,6,1010293,2016-07-12,2016-07-13,23921,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1
7,7,1010293,2016-07-13,2016-07-15,65322,desktop,9924,The Devilfire Empire,Cobra Island,1010293_1,2
8,8,1010293,2016-07-15,2016-07-16,23921,desktop,9924,The Devilfire Empire,Cobra Island,1010293_1,1
10,10,1012680,2016-10-23,2016-10-25,37709,desktop,384,Gondal,Yerba,1012680_1,2
11,11,1012680,2016-10-25,2016-10-27,11837,desktop,384,Gondal,Panem,1012680_1,2


In [10]:
# example trip used for spot checks
utrip = '1012680_1'

# legs of that trip that are still present in the training features
is_example_trip = X_train['utrip_id'] == utrip
X_train.loc[is_example_trip]

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,length_of_leg
10,10,1012680,2016-10-23,2016-10-25,37709,desktop,384,Gondal,Yerba,1012680_1,2
11,11,1012680,2016-10-25,2016-10-27,11837,desktop,384,Gondal,Panem,1012680_1,2
12,12,1012680,2016-10-27,2016-10-30,19626,desktop,384,Gondal,Yerba,1012680_1,3


In [11]:
# all legs of the example trip in the full data, including the final (target) leg
train_data.loc[train_data['utrip_id'] == utrip]

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,length_of_leg
10,10,1012680,2016-10-23,2016-10-25,37709,desktop,384,Gondal,Yerba,1012680_1,2
11,11,1012680,2016-10-25,2016-10-27,11837,desktop,384,Gondal,Panem,1012680_1,2
12,12,1012680,2016-10-27,2016-10-30,19626,desktop,384,Gondal,Yerba,1012680_1,3
13,13,1012680,2016-10-30,2016-11-02,62270,desktop,384,Gondal,Yerba,1012680_1,3


In [12]:
# lookup table: utrip_id -> last city of that trip
Y_train_dict = {trip_id: final_city for trip_id, final_city in zip(utrip_ids, lastcityoftrip)}

# target city recorded for the trip stored in `utrip`
Y_train_dict[utrip]

62270

In [13]:
# ISO week number of the check-in date, i.e. the week in which the stay began
train_data['week_number'] = train_data['checkin'].dt.isocalendar()['week']

In [14]:
# ISO day of the week of the check-in date (1 = Monday ... 7 = Sunday)
train_data['day_of_week'] = train_data['checkin'].dt.isocalendar()['day']

In [15]:
# inspect the day-of-week value computed for each check-in
train_data['day_of_week']

0          6
1          1
2          2
3          6
4          6
          ..
1166830    2
1166831    2
1166832    5
1166833    4
1166834    7
Name: day_of_week, Length: 1166835, dtype: UInt32

In [16]:
# maps an ISO day-of-week number (1-7) to a weekend flag: days 1-4 -> 0, days 5-7 -> 1
# NOTE(review): ISO day 5 is Friday, so Friday check-ins are flagged as weekend - confirm this is intended
weekend_dict = {iso_day: int(iso_day >= 5) for iso_day in range(1, 8)}

In [17]:
# translate each check-in's day-of-week number into the 0/1 weekend flag from weekend_dict
train_data['is_weekend'] = train_data['day_of_week'].apply(weekend_dict.get)

In [18]:
# check the newly added week_number, day_of_week and is_weekend columns
train_data.head(10)

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,length_of_leg,week_number,day_of_week,is_weekend
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1,2,14,6,1
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1,1,15,1,0
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1,4,15,2,0
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1,1,15,6,1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1,27,6,1
5,5,1010293,2016-07-10,2016-07-11,55,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1,27,7,1
6,6,1010293,2016-07-12,2016-07-13,23921,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1,28,2,0
7,7,1010293,2016-07-13,2016-07-15,65322,desktop,9924,The Devilfire Empire,Cobra Island,1010293_1,2,28,3,0
8,8,1010293,2016-07-15,2016-07-16,23921,desktop,9924,The Devilfire Empire,Cobra Island,1010293_1,1,28,5,1
9,9,1010293,2016-07-16,2016-07-17,20545,desktop,10573,The Devilfire Empire,Cobra Island,1010293_1,1,28,6,1


In [19]:
# calendar month of the check-in date
train_data['month'] = train_data['checkin'].dt.month

In [20]:
# remove the identifier, the raw dates and the intermediate calendar columns;
# is_weekend, month and length_of_leg derived from them are kept
# NOTE: train_data is reassigned here, so re-running this cell fails because the columns are already gone
columns_no_longer_needed = ['user_id', 'checkin', 'checkout', 'day_of_week', 'week_number']
train_data = train_data.drop(columns=columns_no_longer_needed)

In [21]:
# confirm which columns remain after the drop
train_data

Unnamed: 0.1,Unnamed: 0,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,length_of_leg,is_weekend,month
0,0,31114,desktop,384,Gondal,Gondal,1006220_1,2,1,4
1,1,39641,desktop,384,Gondal,Gondal,1006220_1,1,0,4
2,2,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1,4,0,4
3,3,24144,desktop,384,Gondal,Gondal,1006220_1,1,1,4
4,4,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1,1,1,7
...,...,...,...,...,...,...,...,...,...,...
1166830,1166830,14197,tablet,10332,Gondal,Fook Island,999261_1,2,0,9
1166831,1166831,4476,desktop,2661,The Devilfire Empire,Gondal,999755_1,3,0,12
1166832,1166832,1034,desktop,7974,The Devilfire Empire,Gondal,999755_1,1,1,12
1166833,1166833,64876,desktop,7974,The Devilfire Empire,Fook Island,999755_1,3,0,12


In [22]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [23]:
# a single LabelEncoder instance, re-fitted for every column it is applied to
labelencoder = LabelEncoder()
# a single OneHotEncoder instance, likewise re-fitted per column;
# NOTE(review): handle_unknown='ignore' only matters when transform() sees categories
# that were absent during fit - the cells below only call fit_transform on the same data
enc = OneHotEncoder(handle_unknown='ignore')

In [24]:
# integer-encode each categorical column into a new '<column>_cat' column;
# the shared encoder is re-fitted on every column, in this order
categorical_columns = ['device_class', 'booker_country', 'hotel_country', 'affiliate_id', 'city_id', 'month']
for source_column in categorical_columns:
    encoded_values = labelencoder.fit_transform(train_data[source_column])
    train_data[source_column + '_cat'] = encoded_values.astype(int)

In [25]:
# highest integer code assigned to a city; uses the vectorised Series.max() rather than
# Python's builtin max(), and drops the bare column expression that produced no output
int(train_data['city_id_cat'].max())

39900

In [26]:
def _one_hot_frame(encoded_column):
    """Re-fit `enc` on one integer-coded column of train_data and return the dense one-hot matrix as an int DataFrame."""
    indicator_matrix = enc.fit_transform(train_data[[encoded_column]])
    return pd.DataFrame(indicator_matrix.toarray()).astype(int)


device_class_df = _one_hot_frame('device_class_cat')
booker_country_df = _one_hot_frame('booker_country_cat')
hotel_country_df = _one_hot_frame('hotel_country_cat')
month_df = _one_hot_frame('month_cat')
# affiliate_id_cat and city_id_cat are not one-hot encoded in this cell

##### rename columns #####

In [27]:
# device classes and months get explicit, human-readable names
# NOTE(review): assumes the encoded order is desktop, mobile, tablet and that all twelve months occur - confirm
device_class_df.columns = ['dc_desktop', 'dc_mobile', 'dc_tablet']
month_df.columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# country columns are only numbered, so take the count from each frame instead of
# hard-coding 5 and 195; the resulting names are unchanged for the current data
booker_country_df.columns = [f"country{i}" for i in range(1, booker_country_df.shape[1] + 1)]
hotel_country_df.columns = [f"hc_{i}" for i in range(hotel_country_df.shape[1])]

#### compile all one-hot-encoded samples into one dataframe ####

In [28]:
# join the one-hot blocks and the remaining train_data features side by side,
# matching rows on their index, one block at a time
X_train = device_class_df
for feature_block in (booker_country_df, hotel_country_df, month_df, train_data[['is_weekend', 'city_id']]):
    X_train = pd.merge(X_train, feature_block, left_index=True, right_index=True)

#### normalize length of leg feature ####

In [29]:
from sklearn import preprocessing
# scaler used below to normalise length_of_leg
min_max_scaler = preprocessing.MinMaxScaler()

In [30]:
# the scaler is fitted on a 2-D array, so reshape length_of_leg into a single column
lengthofleg = train_data['length_of_leg'].values.reshape(-1, 1)
lengthoflegscaled = min_max_scaler.fit_transform(lengthofleg)

# attach the scaled values as a Series labelled with train_data's own index, so each value
# lands on its source row; assigning a bare DataFrame aligns on a fresh 0..N-1 index instead
# and would silently produce NaN if X_train's index ever differed from that
X_train['length_of_leg_scaled'] = pd.Series(lengthoflegscaled.ravel(), index=train_data.index)

In [31]:
# assembled feature frame: one-hot blocks plus is_weekend, city_id and the scaled leg length
# NOTE: the last leg of every trip (the target rows, e.g. row 3) is still included here
X_train

Unnamed: 0,dc_desktop,dc_mobile,dc_tablet,country1,country2,country3,country4,country5,hc_0,hc_1,...,Jun,Jul,Aug,Sep,Oct,Nov,Dec,is_weekend,city_id,length_of_leg_scaled
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,31114,0.034483
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,39641,0.000000
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,20232,0.103448
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,24144,0.000000
4,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,5325,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166830,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,14197,0.034483
1166831,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,4476,0.068966
1166832,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,1034,0.000000
1166833,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,64876,0.068966


All that remains is to work out how to encode city_id without running into memory issues, and to drop the rows which represent our target variable Y_train. This is a computation-heavy operation, and I think we may need to use PyTorch tensors in order to move this calculation off the CPU for more computing power.