In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore

Important data features:
position - (Integer) - Hotel position on Expedia's search results page. This is only provided for the training data, but not the test data.
click_bool - (Boolean) - 1 if the user clicked on the property, 0 if not.
booking_bool - (Boolean) - 1 if the user booked the property, 0 if not.
gross_bookings_usd - (Float) - Total value of the transaction. This can differ from the price_usd due to taxes, fees, conventions on multiple day bookings and purchase of a room type other than the one shown in the search.

More info on: https://www.kaggle.com/c/expedia-personalized-sort/data

In [3]:
# Read the competition CSVs from the local data/ directory
# (test split first, then train split).
test_data = pd.read_csv(filepath_or_buffer='data/test.csv')
train_data = pd.read_csv(filepath_or_buffer='data/train.csv')

# Plain-text preview of the first five training rows.
print(train_data.head(n=5))

   srch_id            date_time  site_id  visitor_location_country_id  \
0        1  2013-04-04 08:32:15       12                          187   
1        1  2013-04-04 08:32:15       12                          187   
2        1  2013-04-04 08:32:15       12                          187   
3        1  2013-04-04 08:32:15       12                          187   
4        1  2013-04-04 08:32:15       12                          187   

   visitor_hist_starrating  visitor_hist_adr_usd  prop_country_id  prop_id  \
0                      NaN                   NaN              219      893   
1                      NaN                   NaN              219    10404   
2                      NaN                   NaN              219    21315   
3                      NaN                   NaN              219    27348   
4                      NaN                   NaN              219    29604   

   prop_starrating  prop_review_score  ...  comp6_rate_percent_diff  \
0                3   

In [11]:
# Dimensions of the training frame as a (rows, columns) tuple.
print(f"{train_data.shape}")

# Per-column summary (index range, column names, dtypes) written to stdout.
train_data.info()

(4958347, 54)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4958347 entries, 0 to 4958346
Data columns (total 54 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   srch_id                      int64  
 1   date_time                    object 
 2   site_id                      int64  
 3   visitor_location_country_id  int64  
 4   visitor_hist_starrating      float64
 5   visitor_hist_adr_usd         float64
 6   prop_country_id              int64  
 7   prop_id                      int64  
 8   prop_starrating              int64  
 9   prop_review_score            float64
 10  prop_brand_bool              int64  
 11  prop_location_score1         float64
 12  prop_location_score2         float64
 13  prop_log_historical_price    float64
 14  position                     int64  
 15  price_usd                    float64
 16  promotion_flag               int64  
 17  srch_destination_id          int64  
 18  srch_length_of_stay         

Non-ordinal integers that should become strings:
- srch_destination_id
- site_id
- visitor_location_country_id
- prop_country_id

Integers that should become boolean:
- prop_brand_bool
- srch_saturday_night_bool
- random_bool

To predict:
position: Hotel position on Expedia's search results page. This is only provided for the training data, but not the test data.

In [9]:
# Print, for every column, the fraction of rows whose value is missing.
# Note: the printed number is a fraction in [0, 1] (e.g. 0.95), not a
# percentage.
#
# The null counts for all columns are computed with a single vectorized
# call instead of one isnull() scan per loop iteration; the printed
# "<column> <fraction>" lines are unchanged.
missing_by_column = train_data.isnull().sum() / train_data.shape[0]
for col, fraction in missing_by_column.items():
    print(col, fraction)

(4958347, 54)
srch_id 0.0
date_time 0.0
site_id 0.0
visitor_location_country_id 0.0
visitor_hist_starrating 0.949203635808466
visitor_hist_adr_usd 0.9489773507178905
prop_country_id 0.0
prop_id 0.0
prop_starrating 0.0
prop_review_score 0.0014851723770038683
prop_brand_bool 0.0
prop_location_score1 0.0
prop_location_score2 0.2199015115319682
prop_log_historical_price 0.0
position 0.0
price_usd 0.0
promotion_flag 0.0
srch_destination_id 0.0
srch_length_of_stay 0.0
srch_booking_window 0.0
srch_adults_count 0.0
srch_children_count 0.0
srch_room_count 0.0
srch_saturday_night_bool 0.0
srch_query_affinity_score 0.935985520981085
orig_destination_distance 0.32425766086964064
random_bool 0.0
comp1_rate 0.9758125036428471
comp1_inv 0.9738705258022482
comp1_rate_percent_diff 0.9809535314894258
comp2_rate 0.5916639154137457
comp2_inv 0.5703671001646314
comp2_rate_percent_diff 0.8878178554264153
comp3_rate 0.6905646176034069
comp3_inv 0.6670281446619206
comp3_rate_percent_diff 0.9046462460170698
co

In [14]:
# Columns whose fraction of missing values is strictly greater than this
# threshold are reported (0.5 == more than 50% missing).
MISSING_THRESHOLD = 0.5

# Print every column that is mostly missing, together with its fraction of
# missing values (a value in [0, 1], not a percentage).
# The fraction is computed once per column and reused for both the
# comparison and the print, so each column is scanned only a single time.
n_rows = train_data.shape[0]
for col in train_data.columns:
    col_missing_fraction = train_data[col].isnull().sum() / n_rows
    if col_missing_fraction > MISSING_THRESHOLD:
        print(col, col_missing_fraction)

visitor_hist_starrating 0.949203635808466
visitor_hist_adr_usd 0.9489773507178905
srch_query_affinity_score 0.935985520981085
comp1_rate 0.9758125036428471
comp1_inv 0.9738705258022482
comp1_rate_percent_diff 0.9809535314894258
comp2_rate 0.5916639154137457
comp2_inv 0.5703671001646314
comp2_rate_percent_diff 0.8878178554264153
comp3_rate 0.6905646176034069
comp3_inv 0.6670281446619206
comp3_rate_percent_diff 0.9046462460170698
comp4_rate 0.9380079691881186
comp4_inv 0.9306900061653611
comp4_rate_percent_diff 0.9735625602645398
comp5_rate 0.5517915547257988
comp5_inv 0.5240308917467857
comp5_rate_percent_diff 0.8303670557950059
comp6_rate 0.9515651082911301
comp6_inv 0.9473663299482671
comp6_rate_percent_diff 0.9806036164875108
comp7_rate 0.9364005786605899
comp7_inv 0.9281167695605007
comp7_rate_percent_diff 0.9720642786799714
comp8_rate 0.61344899822461
comp8_inv 0.5991601636593809
comp8_rate_percent_diff 0.8760211820592629
gross_bookings_usd 0.9720894886945186
