In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

import datetime as dt
import time

In [2]:
# locations of the booking train / test CSV files, relative to this notebook
train_data_path = '../csv_files/booking_train_set.csv'
test_data_path = '../csv_files/booking_test_set.csv'

# load both splits into pandas DataFrames
# NOTE(review): the frame displayed further down shows an 'Unnamed: 0' column, which
# suggests the CSVs were written with their index -- confirm, and consider index_col=0
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [15]:
def _countries_per_trip(bookings):
    """Return a Series indexed by utrip_id whose values are the list of
    hotel_country entries of that trip, in the row order of ``bookings``."""
    countries_by_trip = bookings.groupby('utrip_id')['hotel_country']
    return countries_by_trip.agg(lambda legs: list(legs))


# one list of visited countries per trip, for each split
countriesvisitedtrain = _countries_per_trip(train_data)
countriesvisitedtest = _countries_per_trip(test_data)

  0%|          | 0/217686 [00:12<?, ?it/s]
  0%|          | 0/70662 [00:12<?, ?it/s]


In [37]:
threestops = []
lastcountryoftrip = []

# Collect the utrip_id and the final hotel_country of every training trip that has
# exactly three stops.
# Iterate (label, value) pairs directly instead of looking values up again with an
# integer key: countriesvisitedtrain is indexed by utrip_id, so series[idx] only worked
# through the deprecated "integer key falls back to position" behaviour of Series
# (and would silently become a label lookup if the ids were integers).
for trip_id, trip in countriesvisitedtrain.items():
    if len(trip) == 3:
        threestops.append(trip_id)
        lastcountryoftrip.append(trip[-1])

In [17]:
# Build the dataframe that holds only the rows of trips with three stops.
# Rows stay grouped trip by trip, in the order of `threestops`, and keep their original
# train_data index labels -- the same layout the previous append loop produced.
# DataFrame.append was removed in pandas 2.0 and re-copies the whole frame on every
# iteration, so gather the per-trip slices first and concatenate them once.
if threestops:
    trips_by_id = train_data.groupby('utrip_id')
    df = pd.concat([trips_by_id.get_group(trip_id) for trip_id in tqdm(threestops)])
else:
    # no three-stop trips: keep the columns so later cells can still reference them
    df = train_data.iloc[0:0]

593it [00:37, 15.77it/s]


In [63]:
# work on an independent copy so that df itself is never modified
# (DataFrame.copy is deep by default)
threestopsdf = df.copy()

In [50]:
# index labels of the final row of every trip -- the rows that are meant to be
# removed from the training features
last_leg_rows = threestopsdf.groupby('utrip_id').tail(1)
tobedropped = last_leg_rows.index

In [51]:
# two-column frame: column 0 holds the utrip_id, column 1 the last country of that trip
trip_and_last_country = list(zip(threestops, lastcountryoftrip))
Y_train = pd.DataFrame(trip_and_last_country)

### Feature Engineering ###
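In the next few cells, we engineer additional features for the three-stop trips dataframe (`threestopsdf`): the length of each leg, whether the check-in falls on a weekend, and the check-in month.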

In [64]:
# parse the check-in / check-out columns into datetime64 values
for date_col in ('checkin', 'checkout'):
    threestopsdf[date_col] = pd.to_datetime(threestopsdf[date_col])

In [65]:
# length of each 'leg' of the trip: whole days between check-in and check-out
stay_duration = threestopsdf['checkout'] - threestopsdf['checkin']
threestopsdf['length_of_leg'] = stay_duration.dt.days

In [66]:
# ISO weekday of the check-in date (Monday=1 ... Sunday=7)
checkin_dates = threestopsdf['checkin']
threestopsdf['day_of_week'] = checkin_dates.dt.isocalendar().day

# lookup table weekday -> flag; days 5, 6 and 7 (Friday, Saturday, Sunday) map to 1,
# every other day maps to 0
weekend_dict = {day: int(day >= 5) for day in range(1, 8)}

# is_weekend: 1 when the stay begins on one of the flagged days
threestopsdf['is_weekend'] = threestopsdf['day_of_week'].apply(weekend_dict.get)

# calendar month of the check-in date
threestopsdf['month'] = checkin_dates.dt.month

In [67]:
# remove identifier, raw date and helper columns that are not used as model features
unneeded_columns = ['user_id', 'checkin', 'checkout', 'day_of_week', 'utrip_id']
threestopsdf = threestopsdf.drop(columns=unneeded_columns)

In [68]:
# display threestopsdf after feature engineering: the identifier / date columns have
# been dropped and length_of_leg, is_weekend and month have been added
threestopsdf

Unnamed: 0.1,Unnamed: 0,city_id,device_class,affiliate_id,booker_country,hotel_country,length_of_leg,is_weekend,month
222412,222412,5581,desktop,9627,Gondal,Gondal,3,0,7
222413,222413,49414,desktop,3084,Gondal,Gondal,3,0,7
222414,222414,33177,desktop,10332,Gondal,Gondal,3,1,7
23412,23412,21337,mobile,9337,Gondal,Gondal,1,0,5
23413,23413,4476,desktop,7643,Gondal,Gondal,1,0,5
...,...,...,...,...,...,...,...,...,...
297856,297856,52471,desktop,2436,Gondal,Gondal,2,1,7
297857,297857,11364,desktop,99,Gondal,Gondal,2,0,8
64647,64647,3505,mobile,359,Elbonia,Glubbdubdrib,1,0,3
64648,64648,45923,mobile,359,Elbonia,Glubbdubdrib,1,0,3


In [78]:
# X_train_full: feature matrix for every leg (including the last stay of each trip).
# get_dummies one-hot encodes the string-valued columns (device_class, booker_country,
# hotel_country); the numeric columns are passed through unchanged.
# NOTE(review): affiliate_id and city_id are numeric and therefore stay as raw ids
# rather than being one-hot encoded -- confirm this is intended.
X_train_full = pd.get_dummies(threestopsdf[['device_class',
                                            'affiliate_id',
                                            'booker_country',
                                            'hotel_country',
                                            'length_of_leg',
                                            'is_weekend',
                                            'month',
                                            'city_id']])
# drop the original train_data index labels; reset_index returns a new frame, so the
# result has to be assigned back -- otherwise X_train_full keeps the old labels
X_train_full = X_train_full.reset_index(drop=True)
X_train_full

Unnamed: 0,affiliate_id,length_of_leg,is_weekend,month,city_id,device_class_desktop,device_class_mobile,device_class_tablet,booker_country_Bartovia,booker_country_Elbonia,...,hotel_country_São Rico,hotel_country_Tcherkistan,hotel_country_The Devilfire Empire,hotel_country_Trans-Carpathia,hotel_country_Tsergovia,hotel_country_Turgistan,hotel_country_Urkesh,hotel_country_Vadeem,hotel_country_Yerba,hotel_country_Yudonia
0,9627,3,0,7,5581,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3084,3,0,7,49414,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10332,3,1,7,33177,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9337,1,0,5,21337,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7643,1,0,5,4476,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1774,2436,2,1,7,52471,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1775,99,2,0,8,11364,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1776,359,1,0,3,3505,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1777,359,1,0,3,45923,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# Every trip contributes exactly three consecutive rows, so the last leg of each trip
# sits at row positions 2, 5, 8, ...
# Build the mask from row positions, not from index labels: the labels may still be the
# original train_data row numbers, in which case `label % 3 == 2` marks an arbitrary
# leg of each trip (while still yielding exactly one row per trip, hiding the error).
assert len(X_train_full) % 3 == 0, "expected exactly three rows per trip"
mask = (np.arange(len(X_train_full)) % 3) == 2

# features: the first two legs of every trip
Xtrain = X_train_full[~mask]
Xtrain = Xtrain.reset_index(drop=True)
Xtrain.to_csv('threecountryXtrain.csv')

# targets: one-hot encoded hotel_country of only the last leg of each trip
Ytrain = pd.get_dummies(threestopsdf['hotel_country'])[mask]
Ytrain.to_csv('threecountryYtrain.csv')

In [95]:
# sanity check: Ytrain should hold one row per trip, i.e. a third of the rows of threestopsdf
Ytrain.shape

(593, 79)

In [96]:
# shape of the one-hot hotel_country frame before masking, for comparison with Ytrain.shape
pd.get_dummies(threestopsdf['hotel_country']).shape

(1779, 79)