In [128]:
import datetime as dt
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
# Apply the seaborn theme once. A preceding sns.set(style="white") call would
# be completely overridden by this one, so a single call yields the same state.
sns.set(
    style="whitegrid",
    color_codes=True,
)

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# SettingWithCopyWarning lives in pandas.errors since pandas 1.5 and is no
# longer importable from pandas.core.common; try the modern location first and
# fall back to the legacy one so the notebook starts on any pandas version.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # This pandas build does not expose the warning class at all,
        # so there is nothing to silence.
        SettingWithCopyWarning = None

# Silence noisy warnings for the rest of the session.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

In [129]:
# Directory that holds the assignment CSV files. Set the DMT_DATA_DIR
# environment variable to point at your own copy of the data; when it is not
# set, the original author's local path is used unchanged.
DEFAULT_DATA_DIR = "C:/Users/Bas/OneDrive/MSc. Artificial Intelligence VU/MSc. AI Year 1/Data Mining Techniques/Assignment 2/data/"

# os.path.join(..., "") guarantees a trailing separator, so code that builds
# file names with `data_file_path + "<file>.csv"` keeps working even when the
# override is given without one. The default value is left byte-identical.
data_file_path = os.path.join(os.environ.get("DMT_DATA_DIR", DEFAULT_DATA_DIR), "")

# Load the full training set (comma-separated).
train = pd.read_csv(data_file_path + "training_set_VU_DM.csv", sep=',')

In [135]:
# Summarise the raw training frame (index range and per-column dtypes);
# the recorded output reports 54 columns.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4958347 entries, 0 to 4958346
Data columns (total 54 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   srch_id                      int64  
 1   date_time                    object 
 2   site_id                      int64  
 3   visitor_location_country_id  int64  
 4   visitor_hist_starrating      float64
 5   visitor_hist_adr_usd         float64
 6   prop_country_id              int64  
 7   prop_id                      int64  
 8   prop_starrating              int64  
 9   prop_review_score            float64
 10  prop_brand_bool              int64  
 11  prop_location_score1         float64
 12  prop_location_score2         float64
 13  prop_log_historical_price    float64
 14  position                     int64  
 15  price_usd                    float64
 16  promotion_flag               int64  
 17  srch_destination_id          int64  
 18  srch_length_of_stay          int64  
 19  

In [142]:
# for comp in [1,2,3,4,5,6,7,8]:
#     print("comp{0}_inv".format(comp))
    
#     print(train["comp{0}_rate".format(comp)].isnull().sum())
#     print(train["comp{0}_rate".format(comp)].value_counts())
    
#     print(train["comp{0}_inv".format(comp)].isnull().sum())
#     print(train["comp{0}_inv".format(comp)].value_counts())

## Smaller subset of 500 searches for computational speed:

In [171]:
# Exclusive upper bound on srch_id for the development subset. With this data
# the filter keeps 500 distinct searches (the count is printed below).
SUBSET_SRCH_ID_LIMIT = 864

# .copy() makes the subset an independent frame: later column edits on
# train_subset then neither depend on nor write through to `train`, and do not
# trigger SettingWithCopyWarning.
train_subset = train[train['srch_id'] < SUBSET_SRCH_ID_LIMIT].copy()

# Number of distinct searches retained in the subset.
print(train_subset['srch_id'].nunique())
train_subset.head()

500


Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


## Drop columns with too much missing data, plus other columns not used as features

In [172]:
# Columns removed before modelling, kept in the three groups used by the
# original notebook and then concatenated into one list.
cols_drop_na = ['position','click_bool','gross_bookings_usd']
cols_drop_bs = ['date_time','prop_location_score2','prop_id']
cols_drop_nulls = ['visitor_hist_starrating','visitor_hist_adr_usd','srch_query_affinity_score']
cols_drop = cols_drop_na + cols_drop_nulls + cols_drop_bs

# Rebind instead of using inplace=True: the result is the same on the first
# run, but nothing is mutated through a possible view of `train`.
# errors="ignore" lets the cell be re-run after the columns are already gone
# (an in-place drop would raise KeyError on the second execution). Note that
# it also silently skips misspelled names, so check the info() output below.
train_subset = train_subset.drop(columns=cols_drop, errors="ignore")
train_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12098 entries, 0 to 12097
Data columns (total 45 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   srch_id                      12098 non-null  int64  
 1   site_id                      12098 non-null  int64  
 2   visitor_location_country_id  12098 non-null  int64  
 3   prop_country_id              12098 non-null  int64  
 4   prop_starrating              12098 non-null  int64  
 5   prop_review_score            12081 non-null  float64
 6   prop_brand_bool              12098 non-null  int64  
 7   prop_location_score1         12098 non-null  float64
 8   prop_log_historical_price    12098 non-null  float64
 9   price_usd                    12098 non-null  float64
 10  promotion_flag               12098 non-null  int64  
 11  srch_destination_id          12098 non-null  int64  
 12  srch_length_of_stay          12098 non-null  int64  
 13  srch_booking_win

## Aggregate competitive info

In [174]:
# Collapse the three columns of each competitor (1..8) into one feature:
#   comp{i} = comp{i}_rate * flipped(comp{i}_inv) * comp{i}_rate_percent_diff
# where flipped() maps 0 -> 1, 1 -> 0 and -1 -> NaN. Any NaN in one of the
# three factors makes the resulting feature NaN.
comp_features = {}
comp_source_cols = []
for i in range(1, 9):
    comp_rate = "comp{0}_rate".format(i)
    comp_inv = "comp{0}_inv".format(i)
    comp_perc = "comp{0}_rate_percent_diff".format(i)

    # Use the Series returned by replace() directly. Calling
    # replace(..., inplace=True) on train_subset[comp_inv] is chained in-place
    # mutation, which does not update the frame when pandas Copy-on-Write is
    # active, so the flip would be silently lost.
    inv_flipped = train_subset[comp_inv].replace({0: 1, 1: 0, -1: np.nan})

    comp_features["comp{0}".format(i)] = (
        train_subset[comp_rate] * inv_flipped * train_subset[comp_perc]
    )
    comp_source_cols += [comp_rate, comp_inv, comp_perc]

# Append comp1..comp8 (in that order) and remove the raw competitor columns in
# a single non-in-place step; the resulting column order matches appending and
# dropping one competitor at a time.
train_subset = train_subset.assign(**comp_features).drop(columns=comp_source_cols)

In [177]:
# Display the column index of train_subset to check which columns remain.
train_subset.columns

Index(['srch_id', 'site_id', 'visitor_location_country_id', 'prop_country_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_log_historical_price', 'price_usd',
       'promotion_flag', 'srch_destination_id', 'srch_length_of_stay',
       'srch_booking_window', 'srch_adults_count', 'srch_children_count',
       'srch_room_count', 'srch_saturday_night_bool',
       'orig_destination_distance', 'random_bool', 'booking_bool', 'comp1',
       'comp2', 'comp3', 'comp4', 'comp5', 'comp6', 'comp7', 'comp8'],
      dtype='object')

In [63]:
# Scratch NaN value. It was previously bound to the name `list`, which
# shadowed the built-in and made any later `list(...)` call in the kernel fail
# with "TypeError: 'float' object is not callable".
nan_placeholder = np.nan

nan