In [1]:
# iMPORT PACKAGES
import pandas as pd
import numpy as np

import datetime as dt
from datetime import timedelta

import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")



In [2]:
path = '/Users/cathytol/Documents/DMT/dmt-data/'

inputFile = path + "training_set_VU_DM.csv"
df = pd.read_csv(inputFile, sep = ',')
#df['date_time'] = pd.to_datetime(df['date_time'])
df = shuffle(df[['srch_id', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv', 'click_bool', 'gross_bookings_usd',
       'booking_bool']])
df = df.reset_index(drop = True)

trainOnlyColumns = ['position','click_bool','booking_bool','gross_bookings_usd']
X = df.drop(trainOnlyColumns, axis=1)
Y_click = df['click_bool']
Y_book = df['booking_bool']
len(df)

4958347

In [None]:
print("% clicks: ", sum(df['click_bool'])/4958347*100)
print("% bookings: ", sum(df['booking_bool'])/4958347*100)

In [3]:
inputFile = path + "test_set_VU_DM.csv"
df_evaluate = pd.read_csv(inputFile, sep = ',')
df_evaluate = df_evaluate[['srch_id', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv']]


# get training score

In [4]:
def getScore(df, y_click, y_book):
    df['gain'] = df[y_click]+5*df[y_book]
    df['ind'] = df.index+1

    df['g/i'] = df['gain']/df['ind']
    gi_sum = df['g/i'].sum()

    df['gain_sorted'] = list(df['gain'].sort_values(ascending = False))
    df['g/i_sorted'] = df['gain_sorted']/df['ind']
    gi_sorted_sum = df['g/i_sorted'].sum()
    
    score = gi_sum/gi_sorted_sum

    return score, gi_sum, gi_sorted_sum

# xg boost

In [None]:
# setup parameters for xgboost
param = {}
# use logistic regression loss, use raw prediction before logistic transformation
# since we only need the rank
param['objective'] = 'reg:logistic'
# scale weight of positive examples
param['scale_pos_weight'] = sum(y_train==0)/sum(y_train==1)
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 0
param['nthread'] = 10

In [5]:
# first predict click
X_train, X_test, y_train, y_test = train_test_split(X, Y_click, test_size=0.2, random_state=123)
xg_reg_click = xgb.XGBRegressor(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 6, alpha = 10, n_estimators = 50)

xg_reg_click.fit(X_train,y_train)
preds_click = xg_reg_click.predict(X_test)

X_fin = X_test
X_fin['preds_click'] = preds_click
X_fin['y_click'] = y_test

# ten predict book
X_train, X_test, y_train, y_test = train_test_split(X, Y_book, test_size=0.2, random_state=123)
xg_reg_book = xgb.XGBRegressor(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 6, alpha = 10, n_estimators = 50)

xg_reg_book.fit(X_train,y_train)
preds_book = xg_reg_book.predict(X_test)


X_fin['preds_book'] = preds_book
X_fin['y_book'] = y_test


score, gi_sum, gi_sorted_sum = getScore(X_fin.sort_values(by=['preds_book', 'preds_click'], ascending = False), 'y_click', 'y_book')
print(score)
print(gi_sum)
print(gi_sorted_sum)

0.7834314792567336
0.3202264084221033
0.4087484571412835


In [None]:
#https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters

In [7]:
score, gi_sum, gi_sorted_sum = getScore(X_fin.sort_values(by=['srch_id', 'preds_book', 'preds_click'], ascending = [True, False, False]), 'y_click', 'y_book')
print(score)
print(gi_sum)
print(gi_sorted_sum)

0.7732298447524493
0.3202264084221037
0.4141412939442666


In [6]:
preds_click_eval = xg_reg_click.predict(df_evaluate)
preds_book_eval = xg_reg_book.predict(df_evaluate)

df_evaluate['preds_click'] = preds_click_eval
df_evaluate['preds_book'] = preds_book_eval

df_evaluate = df_evaluate.sort_values(['srch_id','preds_book', 'preds_click'],ascending=[True, False, False])

dfSubmission = df_evaluate[['srch_id','prop_id']]
dfSubmission.to_csv("submission_cathy_2.csv",index=False)

In [None]:
data_dmatrix_click = xgb.DMatrix(data=X,label=Y_click)
data_dmatrix_book = xgb.DMatrix(data=X,label=Y_book)

In [None]:
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix_book, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

In [None]:
cv_results

In [None]:
params

In [None]:
#https://www.datacamp.com/community/tutorials/xgboost-in-python

In [None]:
#NOG NIET GEBRUIKT

## NORMALISE dict
def normaliseDF(df):
    maxDict = {}
    minDict = {}
    for column in df.columns:
        maxi = df[column].max()
        maxDict[column] = maxi
        mini = df[column].min()
        minDict[column] = mini
        if (column != "id") and (column != "date") and (column != "mood_mean_TARGET"):
            df[column] = ((df[column]-mini)/(maxi-mini))
    return df, maxDict, minDict

df, maxDict, minDict = normaliseDF(df)

In [None]:
## REVERSE - NORMALISE 
def reverseNormaliseDF(df, maxDict, minDict):
    for column in df.columns:
        maxi = maxDict[column]
        mini = minDict[column]
        if (column != "id") and (column != "date") and (column != "mood_mean"):
            df[column] = (df[column]*(maxi-mini)) + mini
    return df

#df = reverseNormaliseDF(df, maxDict, minDict)

In [None]:
#df.to_csv("out_with_nan_mood_normalised.csv", index = False)

In [None]:
#df.dropna(subset = ['mood_mean'] ).to_csv("out_without_nan_mood_normalised.csv", index = False)

In [None]:
#df.dropna(subset = ['mood_mean_TARGET'] ).to_csv("out_without_nan_mood_target_normalised.csv", index = False)

In [None]:
# SHOW CORRELATIONS
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
corr[['click_bool', 'booking_bool', 'gain']].style.background_gradient(cmap='coolwarm')

In [None]:
df.describe()

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
X = df.drop(['date_time', 'click_bool', 'booking_bool', 'gain', 'ind', 'g/i', 'gain_sorted', 'g/i_sorted'],axis = 1)

In [None]:
X = df[['visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_brand_bool', 'prop_location_score1', 
  'prop_log_historical_price', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay',
   'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
   'random_bool']]

In [None]:
y = df['click_bool']

In [None]:

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X, y) 

In [None]:
inputFile = path + "test_set_VU_DM.csv"
df_test = pd.read_csv(inputFile, sep = ',')
#df_test['date_time'] = pd.to_datetime(df_test['date_time'])
df_test.head(20)

In [None]:
X_test = df_test[['visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_brand_bool', 'prop_location_score1', 
  'prop_log_historical_price', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay',
   'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
   'random_bool']]

In [None]:
y_test = clf.predict(X_test)

In [None]:
sum(y_test)