In [1]:
import pandas as pd
import numpy as np
import sklearn
from xgboost import XGBClassifier
import xgboost
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle
from sklearn.utils import shuffle

In [2]:
# Only numeric columns are listed here. Datetime and categorical columns are
# left out because they would first have to be encoded (e.g. one-hot) before
# xgboost can use them.
# NOTE(review): srch_destination_id looks like an identifier rather than a
# quantity - confirm it is meant to be fed to the model as a number.
featuresToUse = [
    # visitor history
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    # property attributes
    'prop_starrating',
    'prop_review_score',
    'prop_brand_bool',
    'prop_location_score1',
    'prop_location_score2',
    'prop_log_historical_price',
    # price / promotion
    'price_usd',
    'promotion_flag',
    # search attributes
    'srch_destination_id',
    'srch_length_of_stay',
    'srch_booking_window',
    'srch_adults_count',
    'srch_children_count',
    'srch_room_count',
    'srch_saturday_night_bool',
    'srch_query_affinity_score',
    'orig_destination_distance',
    # competitor 1
    'comp1_rate',
    'comp1_inv',
    # "_mean" columns shipped in the *WithMean.csv files
    'prop_starrating_mean',
    'prop_review_score_mean',
    'prop_brand_bool_mean',
    'prop_location_score1_mean',
    'prop_location_score2_mean',
    'prop_log_historical_price_mean',
]

# Target column the classifier is trained on.
labelToPredict = 'click_bool'

# Extra columns kept next to the features: they are needed for ranking/scoring
# but are never passed to the model.
neededVars = ['srch_id', 'booking_bool']

# File the trained booster is saved to.
nameOfModel = "binaryLogistic1May.model"

In [3]:
# Load the 80% training split, reading only the model features, the label and
# the bookkeeping columns (neededVars).
df = pd.read_csv('../data/80PercentTrainWithMean.csv', delimiter=",",usecols=featuresToUse+[labelToPredict]+neededVars)
# Bookkeeping columns + label, kept aside as a separate frame.
dfNeeded= df[neededVars+[labelToPredict]]
# Training target.
Y = df[labelToPredict]
# Leave only the feature columns in df (new frame instead of an in-place drop).
df = df.drop(neededVars+[labelToPredict],axis=1)
# read_csv ignores the ORDER of `usecols`: columns come back in file order.
# The submission cell builds its matrix from dfTest[featuresToUse], i.e. in
# list order, so fail fast here if the two orders disagree instead of risking
# features being matched to the wrong columns later on.
assert list(df.columns) == featuresToUse, (
    "feature columns are not in featuresToUse order: %r" % (list(df.columns),))

Let's train the model to predict clicks (`click_bool`).

In [None]:
# Build the training matrix from the feature frame and the click labels.
# No `missing=` argument is passed, so xgboost's default missing-value handling
# applies (a -999.0 sentinel is NOT configured here).
xgmat = xgboost.DMatrix(df, label=Y)

# Class balance, counted with the vectorised Series.sum() rather than Python's
# builtin sum(), which would iterate over the Series element by element.
numNegative = int((Y == 0).sum())
numPositive = int((Y == 1).sum())
if numPositive == 0:
    raise ValueError("no positive examples in Y: cannot compute scale_pos_weight")

# setup parameters for xgboost
param = {}
# logistic loss; with 'binary:logistic' predict() returns the transformed
# score, which is enough here because only the ranking of the rows is used
param['objective'] = 'binary:logistic'
# scale weight of positive examples: negatives per positive
# (float() keeps this a true division on Python 2 as well)
param['scale_pos_weight'] = float(numNegative) / numPositive
param['eta'] = 0.01
param['max_depth'] = 6
param['eval_metric'] = 'auc'
# NOTE(review): 'silent' is the logging switch of older xgboost releases; newer
# releases use 'verbosity' instead - confirm against the installed version.
param['silent'] = 0

# you can directly throw param in, though we want to watch multiple metrics here
plst = list(param.items())

# NOTE(review): the only watched set is the training data itself, so early
# stopping below reacts to train-AUC, not to a held-out score. Consider adding
# a validation DMatrix to the watchlist.
watchlist = [(xgmat, 'train')]
# boost at most 120 rounds (early stopping may end training sooner)
num_round = 120
print ('loading data end, start to boost trees')
bst = xgboost.train(plst, xgmat, num_round, watchlist, early_stopping_rounds=5)
# save out model
bst.save_model(nameOfModel)

print ('finish training')


loading data end, start to boost trees
[0]	train-auc:0.662401
Will train until train-auc hasn't improved in 5 rounds.
[1]	train-auc:0.662515
[2]	train-auc:0.664193
[3]	train-auc:0.66462
[4]	train-auc:0.664669
[5]	train-auc:0.665517
[6]	train-auc:0.665451
[7]	train-auc:0.665701
[8]	train-auc:0.665719
[9]	train-auc:0.666456
[10]	train-auc:0.666495
[11]	train-auc:0.666807
[12]	train-auc:0.66688
[13]	train-auc:0.666981
[14]	train-auc:0.667554
[15]	train-auc:0.667691
[16]	train-auc:0.667807
[17]	train-auc:0.667935
[18]	train-auc:0.66807
[19]	train-auc:0.668147
[20]	train-auc:0.668305
[21]	train-auc:0.668462
[22]	train-auc:0.668593
[23]	train-auc:0.668731
[24]	train-auc:0.66888
[25]	train-auc:0.668949
[26]	train-auc:0.669033
[27]	train-auc:0.66914
[28]	train-auc:0.669243
[29]	train-auc:0.669317
[30]	train-auc:0.669443
[31]	train-auc:0.669487
[32]	train-auc:0.66969
[33]	train-auc:0.669769
[34]	train-auc:0.670012
[35]	train-auc:0.670195
[36]	train-auc:0.67029
[37]	train-auc:0.670431
[38]	train

In [None]:
# Load the 20% hold-out split with the same columns as the training split.
# This rebinds df / dfNeeded / Y, so from here on they refer to hold-out data.
df = pd.read_csv('../data/20PercentTestWithMean.csv', delimiter=",",usecols=featuresToUse+[labelToPredict]+neededVars)
# Bookkeeping columns + label, kept aside for the scoring step.
dfNeeded= df[neededVars+[labelToPredict]]
# Hold-out target.
Y = df[labelToPredict]
# Leave only the feature columns in df (new frame instead of an in-place drop).
df = df.drop(neededVars+[labelToPredict],axis=1)
# read_csv ignores the ORDER of `usecols`: columns come back in file order.
# Verify the features arrive in featuresToUse order so every matrix built in
# this notebook presents its columns to the model in the same order.
assert list(df.columns) == featuresToUse, (
    "feature columns are not in featuresToUse order: %r" % (list(df.columns),))

In [None]:
# Wrap the hold-out features (with their labels) for xgboost and score every
# row with the booster trained above.
xgmat_test = xgboost.DMatrix(data=df, label=Y)
y_pred = bst.predict(xgmat_test)

In [None]:
# Hard labels for the accuracy check: every score rounded to the nearest integer.
predictions = list(map(round, y_pred))
# Put srch_id / booking_bool / click_bool back next to the features and attach
# the raw score, so rows can be ranked per search.
# NOTE: this rebinds df; re-running the cell appends the columns a second time.
df = pd.concat([df, dfNeeded], axis=1)
df['y_pred'] = y_pred

In [None]:
def getScore(df):
    """Score the row order of `df` as a ranking, normalised by the best possible order.

    The current row order within each `srch_id` is taken as the predicted
    ranking (first row of a search = rank 1). Each row earns a gain of
    5 * booking_bool, or click_bool when that is 0, discounted by 1/rank.
    The same sum is computed for the ideal order (booked rows first, then
    clicked rows) and the ratio of the two is returned.

    NOTE(review): if this is meant to be NDCG, be aware that the discount used
    here is 1/rank and the ratio is taken over sums pooled across all searches
    rather than averaged per search - confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'srch_id', 'booking_bool' and 'click_bool'.
        The frame is modified in place: the helper columns 'rank_srch_id',
        'gain', 'g/rank', 'gain_sorted' and 'g/rank_sorted' are added.
        Rows of one search do not have to be contiguous or sorted by srch_id.

    Returns
    -------
    (score, gi_sum, gi_sorted_sum)
        gi_sum is the discounted gain of the given order, gi_sorted_sum that of
        the ideal order, score their ratio (NaN when gi_sorted_sum is 0, e.g.
        when no row was clicked or booked).
    """
    # Rank of every row inside its search, counted in current row order.
    df['rank_srch_id'] = df.groupby('srch_id').cumcount().add(1)
    # Gain: 5 for a booked row, otherwise the click flag.
    df['gain'] = 5*df['booking_bool']
    df['gain'] = np.where(df['gain'] == 0, df['click_bool'], df['gain'])

    df['g/rank'] = df['gain']/df['rank_srch_id']
    gi_sum = df['g/rank'].sum()

    # Gains in ideal order: searches ascending, best rows first inside a search.
    ideal_gain = df[['srch_id', 'booking_bool', 'click_bool', 'gain']].sort_values(
        by=['srch_id', 'booking_bool', 'click_bool'],
        ascending=[True, False, False])['gain'].values
    # Row positions ordered by (srch_id, rank). The k-th entry of ideal_gain
    # belongs to the k-th of these slots. Writing through this index (instead
    # of assigning the list positionally) keeps the result right even when the
    # rows are not grouped in ascending srch_id order; for input that already
    # is, slot_order is the identity and nothing changes.
    slot_order = np.lexsort((df['rank_srch_id'].values, df['srch_id'].values))
    gain_sorted = np.empty(len(df), dtype=ideal_gain.dtype)
    gain_sorted[slot_order] = ideal_gain
    df['gain_sorted'] = gain_sorted
    df['g/rank_sorted'] = df['gain_sorted']/df['rank_srch_id']
    gi_sorted_sum = df['g/rank_sorted'].sum()

    # Nothing clicked or booked: the ratio is undefined, report NaN explicitly.
    if gi_sorted_sum == 0:
        score = float('nan')
    else:
        score = gi_sum/gi_sorted_sum
    return score, gi_sum, gi_sorted_sum

In [None]:
# Rank the rows of every search by predicted score (highest first) and score
# that ordering against the ideal one.
rankedByPrediction = df.sort_values(by=['srch_id', 'y_pred'], ascending=[True, False]).reset_index(drop=True)
score, gi_sum, gi_sorted_sum = getScore(rankedByPrediction)
print(score)

# evaluate predictions
# The hold-out labels live in Y (loaded together with the hold-out features);
# no variable called Y_test is defined anywhere in this notebook.
accuracy = accuracy_score(Y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# distribution of the predicted classes
Counter(predictions)

Evaluation

In [10]:
# Load the test data that the submission file below is built from.
dfTest = pd.read_csv("../data/testWithMean.csv")

In [11]:
# Create an empty booster (12 threads) and load the trained click model into it.
clickingModel = xgboost.Booster({'nthread': 12})  # init model
# Load the file the training cell saved (nameOfModel) rather than a separately
# hard-coded name, so a fresh top-to-bottom run scores the test set with the
# model trained by this notebook.
# NOTE(review): this previously loaded 'clickingModel1may.model' - switch back
# if that older artifact was intended.
clickingModel.load_model(nameOfModel)  # load data


In [12]:
# Score every test row with the click model.
xgmat_test_submission = xgboost.DMatrix(dfTest.loc[:, featuresToUse])
y_pred_clicking = clickingModel.predict(xgmat_test_submission)
dfTest['predictedPos'] = y_pred_clicking

# Within each search, list the properties from highest to lowest score.
dfTest = dfTest.sort_values(by=['srch_id', 'predictedPos'], ascending=[True, False])

# The submission keeps only the search id and the property id, in ranked order.
dfSubmission = dfTest.loc[:, ['srch_id', 'prop_id']]
dfSubmission.to_csv("submissionTest1may.csv", index=False)