In [13]:
import pandas as pd
import numpy as np
import sklearn
from xgboost import XGBClassifier
import xgboost
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle
import imblearn
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE

In [8]:
# Load only the columns used for modelling from the training file.
df = pd.read_csv(
    '../data/train.csv',
    delimiter=",",
    usecols=[
        'srch_id', 'site_id', 'visitor_location_country_id',
        'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
        'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
        'prop_location_score1', 'prop_location_score2',
        'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
        'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
        'srch_adults_count', 'srch_children_count', 'srch_room_count',
        'srch_saturday_night_bool', 'srch_query_affinity_score',
        'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
        'click_bool', 'gross_bookings_usd', 'booking_bool',
    ],
)
# Shuffle the rows so that the positional train/test split further down is random.
# A fixed random_state makes the shuffle (and therefore the split and every
# metric computed from it) reproducible across kernel restarts.
df = shuffle(df, random_state=42)

In [9]:
# Columns that are read from train.csv but not from test.csv; they must not be
# handed to the model as features.
trainOnlyColumns = [
    'position',
    'click_bool',
    'booking_bool',
    'gross_bookings_usd',
]
# Keep the click indicator as the label before it is removed from the frame.
Y = df.loc[:, 'click_bool']
df.drop(columns=trainOnlyColumns, inplace=True)

In [10]:
# Percentage of the (already shuffled) rows held out for testing.
n = 20

# `df` is replaced by its training part at the end of this cell, so running the
# cell a second time would pair features with the wrong labels. Fail loudly
# instead of silently misaligning X and Y.
if len(df) != len(Y):
    raise RuntimeError(
        "df and Y have different lengths (%d vs %d); re-run the cells that "
        "load the data before splitting again" % (len(df), len(Y))
    )

numOfRowsTest = int(len(df) * (n / 100))
# One split position shared by features and labels. Slicing from the front with
# .iloc stays correct when numOfRowsTest is 0, whereas `Y[-0:]` would return
# every label.
splitAt = len(df) - numOfRowsTest

X_test = df.iloc[splitAt:]
Y_test = Y.iloc[splitAt:]
y_train = Y.iloc[:splitAt]
# From here on `df` holds the training features only.
df = df.iloc[:splitAt]

In [5]:
# Wrap the training features and labels in a DMatrix. No explicit `missing`
# value is passed, so XGBoost's default (NaN) marks missing entries.
xgmat = xgboost.DMatrix(df, label=y_train)

# setup parameters for xgboost
param = {}
# 'reg:logistic' applies the logistic transformation, so predict() returns
# values between 0 and 1 rather than raw margins
param['objective'] = 'reg:logistic'
# scale weight of positive examples by the negative/positive ratio;
# Series.sum() counts in vectorised code instead of iterating row by row
param['scale_pos_weight'] = (y_train == 0).sum() / (y_train == 1).sum()
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 0
param['nthread'] = 10

# A dict cannot hold two 'eval_metric' entries, so the parameters are passed as
# a list of pairs in order to watch a second metric next to AUC.
# NOTE(review): 'ams@0.15' looks carried over from the XGBoost Higgs demo -
# confirm it is meaningful for click prediction.
plst = list(param.items()) + [('eval_metric', 'ams@0.15')]

# Only the training matrix is watched, so the logged metrics are training
# metrics, not validation metrics.
watchlist = [(xgmat, 'train')]
# boost 120 trees
num_round = 120
print ('loading data end, start to boost trees')
bst = xgboost.train(plst, xgmat, num_round, watchlist)
# save out model
bst.save_model('paraTest.model')

print ('finish training')

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


loading data end, start to boost trees
[0]	train-auc:0.663513	train-ams@0.15:79.8359
[1]	train-auc:0.666726	train-ams@0.15:80.9007
[2]	train-auc:0.669043	train-ams@0.15:81.611
[3]	train-auc:0.670141	train-ams@0.15:81.6612
[4]	train-auc:0.671522	train-ams@0.15:82.3888
[5]	train-auc:0.672532	train-ams@0.15:82.6537
[6]	train-auc:0.673984	train-ams@0.15:83.2229
[7]	train-auc:0.675044	train-ams@0.15:83.4713
[8]	train-auc:0.67596	train-ams@0.15:83.7212
[9]	train-auc:0.676833	train-ams@0.15:83.9669
[10]	train-auc:0.677511	train-ams@0.15:84.1121
[11]	train-auc:0.678134	train-ams@0.15:84.2686
[12]	train-auc:0.678967	train-ams@0.15:84.3691
[13]	train-auc:0.679592	train-ams@0.15:84.5298
[14]	train-auc:0.680177	train-ams@0.15:84.5912
[15]	train-auc:0.680815	train-ams@0.15:84.7645
[16]	train-auc:0.681457	train-ams@0.15:84.8511
[17]	train-auc:0.682108	train-ams@0.15:85.0594
[18]	train-auc:0.682625	train-ams@0.15:85.1209
[19]	train-auc:0.683147	train-ams@0.15:85.2663
[20]	train-auc:0.683843	train-ams

In [6]:
# Score the held-out rows with the booster currently bound to `bst`.
xgmat_test = xgboost.DMatrix(X_test, label=Y_test)
y_pred = bst.predict(xgmat_test)
# Turn the scores into 0/1 labels. np.rint rounds the whole array at once
# (half-to-even, like the built-in round) instead of looping over every score
# in Python.
predictions = np.rint(y_pred)
# evaluate predictions
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Distribution of the predicted labels.
Counter(predictions.tolist())

Accuracy: 66.89%


Counter({1.0: 337924, 0.0: 653745})

In [18]:
# Create an empty booster, then restore a previously saved model into it.
# This rebinds `bst`, replacing any booster trained earlier in the session.
# NOTE(review): the training cell saves 'paraTest.model', but a different file
# is loaded here - confirm this is the intended model.
bst = xgboost.Booster(params={'nthread': 12})
bst.load_model(fname='xgboostBinary26April.model')

In [19]:
# Columns to read from the test file.
testColumns = [
    'srch_id', 'site_id', 'visitor_location_country_id',
    'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
    'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
    'prop_location_score1', 'prop_location_score2',
    'prop_log_historical_price', 'price_usd', 'promotion_flag',
    'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
    'srch_adults_count', 'srch_children_count', 'srch_room_count',
    'srch_saturday_night_bool', 'srch_query_affinity_score',
    'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
]
dfTest = pd.read_csv('../data/test.csv', delimiter=",", usecols=testColumns)
# Unlabelled DMatrix used to score the submission rows.
xgmat_test_submission = xgboost.DMatrix(dfTest)

In [23]:
# Score every row of the submission DMatrix with the booster currently bound to
# `bst`; this overwrites the `y_pred` computed for the hold-out rows.
y_pred = bst.predict(xgmat_test_submission)


Counter({0.46263275: 1,
         0.49010125: 1,
         0.46450725: 1,
         0.41917822: 2,
         0.6265459: 1,
         0.68176144: 1,
         0.5893881: 1,
         0.4515128: 1,
         0.6011479: 1,
         0.7952648: 1,
         0.4034876: 1,
         0.23046538: 2,
         0.74105674: 1,
         0.57702386: 1,
         0.51492953: 2,
         0.47802183: 2,
         0.56126285: 1,
         0.5598553: 3,
         0.57463586: 3,
         0.48755953: 2,
         0.5394395: 1,
         0.50966614: 2,
         0.5062034: 1,
         0.79127896: 1,
         0.5337822: 1,
         0.491406: 1,
         0.41289037: 2,
         0.3709644: 1,
         0.3987761: 1,
         0.31413037: 1,
         0.36510688: 1,
         0.2457689: 3,
         0.28797853: 1,
         0.378235: 1,
         0.2879226: 2,
         0.38002026: 2,
         0.2748225: 1,
         0.2036166: 1,
         0.48581052: 2,
         0.22336408: 1,
         0.6219239: 1,
         0.7780286: 1,
         0.234

In [None]:
# Attach the model score to each test row.
dfTest['predictedPos'] = y_pred
# Order the rows by search, highest-scored property first within each search.
# A single multi-key sort replaces groupby().apply(sort_values).reset_index(),
# which raises "cannot insert srch_id, already exists" because srch_id is both
# a group-key index level and a column. The result is stored so it can be
# reused instead of being discarded after display.
rankedTest = dfTest.sort_values(
    ['srch_id', 'predictedPos'], ascending=[True, False]
).reset_index(drop=True)
rankedTest.head()


In [None]:
# Preview the first rows of the test frame.
dfTest.head()

In [25]:
# Threshold the submission scores into 0/1 labels with one vectorised call
# (half-to-even rounding, like the built-in round) rather than a Python loop
# over every score, then count how many rows fall in each class.
predictions = np.rint(y_pred)
Counter(predictions.tolist())

Counter({0.0: 3266491, 1.0: 1692692})