# Expedia Hotel Recommendation System
## Datasets-
#### 1. Train dataset- Click and booking events from 2013 to 2014.
#### 2. Test dataset- Booking events in 2015.
#### 3. Destinations- Features extracted from hotel reviews text. 

## Data pre-processing and Analysis
***
### 1. Load the datasets:

In [19]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [58]:
# Load the raw Expedia CSV files.
# All three inputs live in the same directory; edit DATA_DIR to point at a local copy.
DATA_DIR = 'D:/Rutgers/MS-Rutgers/AlgorthmicMachineLrn/Projects/Expedia/DataSet/expedia-hotel-recommendations'

path_in1 = DATA_DIR + '/train.csv'
path_in2 = DATA_DIR + '/test.csv'
path_in3 = DATA_DIR + '/destinations.csv'
#path_train_final = DATA_DIR + '/final.csv'

# Tip: pass nrows=10000000 to the train read to limit the number of rows in case of memory issues.
train_set = pd.read_csv(path_in1)
test_set = pd.read_csv(path_in2)
destinations = pd.read_csv(path_in3)

### 2. Data Cleaning:

* 1. Filter the data to rows with is_booking flag = 1 (data size reduced from 37670293 to 3000693)

In [60]:
# Distinct user ids in the full training data; the cell displays how many there are.
unique_users = pd.unique(train_set['user_id'])
unique_users.size

1198786

In [61]:
# Keep only the rows flagged as bookings and renumber the index from zero.
is_booked = train_set['is_booking'] == 1
train_setv1 = train_set.loc[is_booked].reset_index(drop=True)
# Distinct user ids among the booking rows
unique_users = pd.unique(train_setv1['user_id'])

* 2. Remove booking agents: any user with more than 20 bookings is considered an agent.

In [4]:
# Remove booking agents: any user with more than 20 bookings is considered an agent.
# Each user's bookings are counted once and the frame is filtered with a single
# boolean mask, instead of re-scanning and re-filtering the whole frame for
# every unique user (which is quadratic in practice on millions of rows).
MAX_BOOKINGS_PER_USER = 20
bookings_per_user = train_setv1['user_id'].map(train_setv1['user_id'].value_counts())
train_setv1 = train_setv1[bookings_per_user <= MAX_BOOKINGS_PER_USER]

3000693

* 3. Verify null values for both test and train data

In [11]:
# Number of missing values in each column of the test data
test_set.isnull().sum()

id                           0
date_time                    0
site_name                    0
posa_continent               0
user_location_country        0
user_location_region         0
user_location_city           0
orig_destination_distance    0
user_id                      0
is_mobile                    0
is_package                   0
channel                      0
srch_ci                      0
srch_co                      0
srch_adults_cnt              0
srch_children_cnt            0
srch_rm_cnt                  0
srch_destination_id          0
srch_destination_type_id     0
hotel_continent              0
hotel_country                0
hotel_market                 0
dtype: int64

In [8]:
# Number of missing values in each column of the booking-only training data
train_setv1.isnull().sum()

date_time                    0
site_name                    0
posa_continent               0
user_location_country        0
user_location_region         0
user_location_city           0
orig_destination_distance    0
user_id                      0
is_mobile                    0
is_package                   0
channel                      0
srch_ci                      0
srch_co                      0
srch_adults_cnt              0
srch_children_cnt            0
srch_rm_cnt                  0
srch_destination_id          0
srch_destination_type_id     0
is_booking                   0
cnt                          0
hotel_continent              0
hotel_country                0
hotel_market                 0
hotel_cluster                0
dtype: int64

* Replace the numeric 'orig_destination_distance' column with a distance category, which also removes its null values
* Update the test set by removing rows with null values for srch_co and srch_ci

In [6]:
def tag_distance(x):
    """Bucket an origin-to-destination distance into a category label.

    Parameters
    ----------
    x : number, NaN or None
        The distance to bucket (units are not stated in this notebook).

    Returns
    -------
    str
        'UNKNOWN' when x is missing, 'VERY_CLOSE' for x < 500,
        'CLOSE' for 500 <= x < 2000, 'FAR' for 2000 <= x < 6000,
        and 'VERY_FAR' for anything larger.
    """
    # NaN fails every comparison below, so without this guard a missing
    # distance would fall through to the final branch and be labelled
    # 'VERY_FAR', indistinguishable from a genuinely long trip.
    if pd.isna(x):
        return 'UNKNOWN'
    if x < 500:
        return 'VERY_CLOSE'
    if x < 2000:
        return 'CLOSE'
    if x < 6000:
        return 'FAR'
    return 'VERY_FAR'
# Swap the numeric distance for its category label in both frames.
train_setv1['orig_destination_distance'] = train_setv1['orig_destination_distance'].apply(tag_distance)
test_set['orig_destination_distance'] = test_set['orig_destination_distance'].apply(tag_distance)

# Keep only test rows that have both a check-out and a check-in value, and
# exclude the check-in value '2161-10-00' (day "00" is not a valid date).
has_usable_dates = (
    test_set['srch_co'].notna()
    & test_set['srch_ci'].notna()
    & (test_set['srch_ci'] != '2161-10-00')
)
test_set = test_set[has_usable_dates]

In [13]:
# Parse the three date-like columns into datetimes, training frame first, then test.
for frame in (train_setv1, test_set):
    for date_col in ('srch_ci', 'srch_co', 'date_time'):
        frame[date_col] = pd.to_datetime(frame[date_col])

### Feature Engineering
***
* 1. New columns introduced

In [14]:
def _add_date_features(frame):
    """Add stay-length, lead-time and calendar-part columns to ``frame`` in place.

    Expects 'srch_ci', 'srch_co' and 'date_time' to already be datetime columns.
    Returns the same frame so the call can be assigned back.
    """
    # .dt.days gives the whole-day part of each timedelta. It replaces
    # .astype('timedelta64[D]'), which pandas 2.x rejects because day
    # resolution is not a supported timedelta dtype. The float64 cast keeps
    # the column dtype the older call produced.
    frame['stay_dur'] = (frame['srch_co'] - frame['srch_ci']).dt.days.astype('float64')
    frame['no_of_days_before_booking'] = (frame['srch_ci'] - frame['date_time']).dt.days.astype('float64')
    # Calendar parts of the event timestamp
    frame['current_mon'] = frame['date_time'].dt.month
    frame['current_year'] = frame['date_time'].dt.year
    # Calendar parts of the check-in date
    frame['srch_ci_day'] = frame['srch_ci'].dt.day
    frame['srch_ci_mon'] = frame['srch_ci'].dt.month
    frame['srch_ci_year'] = frame['srch_ci'].dt.year
    # Calendar parts of the check-out date
    frame['srch_co_mon'] = frame['srch_co'].dt.month
    frame['srch_co_year'] = frame['srch_co'].dt.year
    return frame


# Same feature set for both frames, so train and test cannot drift apart.
train_setv1 = _add_date_features(train_setv1)
test_set = _add_date_features(test_set)

In [15]:
# Renumber both frames from zero, discarding the old index values.
train_setv1 = train_setv1.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

In [16]:
# Drop the raw date columns from both frames; the training frame also loses
# is_booking.
raw_date_cols = ['date_time', 'srch_ci', 'srch_co']
train_setv1 = train_setv1.drop(columns=raw_date_cols + ['is_booking'])
test_set = test_set.drop(columns=raw_date_cols)

In [18]:
# Encode the distance categories as integers.
# The encoder is fitted on the training data only and that same mapping is
# reused for the test data. Re-fitting on the test set would build a second,
# independent mapping, so the same category could receive a different integer
# in train and test whenever the two sets do not contain identical categories.
label_encoder = preprocessing.LabelEncoder()
train_setv1['orig_destination_distance'] = label_encoder.fit_transform(train_setv1['orig_destination_distance'])
test_set['orig_destination_distance'] = label_encoder.transform(test_set['orig_destination_distance'])

### 3. Merge Destination sheet
* Perform PCA

In [20]:
# Compress the 149 destination features (columns d1..d149) into 10 principal components.
# random_state is fixed so that re-running the notebook yields the same
# components even when scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=10, random_state=0)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)  # component columns are named 0..9
# Attach the ids by position (.values) so the result does not depend on the
# two frames happening to share the same index labels.
dest_small["srch_destination_id"] = destinations["srch_destination_id"].values

In [24]:
# Left-join the reduced destination features onto each frame by destination id.
# Rows whose destination has no match come back as NaN, which is replaced by 0.
train_setv1 = train_setv1.merge(dest_small, on='srch_destination_id', how='left').replace(np.nan, 0)
test_set = test_set.merge(dest_small, on='srch_destination_id', how='left').replace(np.nan, 0)

In [27]:
# Keep only rows where both the stay length and the booking lead time are
# strictly positive; zero values are removed along with negative ones.
train_keep = (train_setv1['stay_dur'] > 0) & (train_setv1['no_of_days_before_booking'] > 0)
test_keep = (test_set['stay_dur'] > 0) & (test_set['no_of_days_before_booking'] > 0)
train_setv1 = train_setv1[train_keep]
test_set = test_set[test_keep]

In [28]:
# Renumber both frames from zero, discarding the old index values.
test_set = test_set.reset_index(drop=True)
train_setv1 = train_setv1.reset_index(drop=True)

In [33]:
# Display the dtype of every column in the training frame.
train_setv1.dtypes

site_name                      int64
posa_continent                 int64
user_location_country          int64
user_location_region           int64
user_location_city             int64
orig_destination_distance      int32
user_id                        int64
is_mobile                      int64
is_package                     int64
channel                        int64
srch_adults_cnt                int64
srch_children_cnt              int64
srch_rm_cnt                    int64
srch_destination_id            int64
srch_destination_type_id       int64
cnt                            int64
hotel_continent                int64
hotel_country                  int64
hotel_market                   int64
hotel_cluster                  int64
stay_dur                     float64
no_of_days_before_booking    float64
current_mon                    int64
current_year                   int64
srch_ci_day                    int64
srch_ci_mon                    int64
srch_ci_year                   int64
s

In [None]:
# Write both cleaned frames as CSV without the index column.
# NOTE(review): 'Trian_set_cleaned' looks like a typo for 'Train' and neither
# name has a .csv extension; kept as-is in case other notebooks read these names.
for frame, out_name in ((train_setv1, 'Trian_set_cleaned'), (test_set, 'Test_set_cleaned')):
    frame.to_csv(out_name, index=False)