In [1]:
import pandas as pd
import numpy as np
import sklearn
from xgboost import XGBClassifier
import xgboost
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle
from sklearn.utils import shuffle
import json
import os

In [2]:
def getScore(df):
    df['rank_srch_id'] = df.groupby('srch_id').cumcount().add(1)
    df['gain'] = 5*df['booking_bool']
    df['gain'] = np.where(df['gain'] == 0, df['click_bool'], df['gain'])

    df['g/rank'] = df['gain']/df['rank_srch_id']
    gi_sum = df['g/rank'].sum()
    
    df['gain_sorted'] = list(df[['srch_id', 'booking_bool', 'click_bool', 'gain']].sort_values(by = ['srch_id','booking_bool','click_bool'], ascending = [True, False, False])['gain'])
    df['g/rank_sorted'] = df['gain_sorted']/df['rank_srch_id']
    gi_sorted_sum = df['g/rank_sorted'].sum() 
    score = gi_sum/gi_sorted_sum
    return score, gi_sum, gi_sorted_sum

In [3]:
featuresToUse = ['visitor_hist_starrating', 'visitor_hist_adr_usd',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'comp1_rate', 'comp1_inv',
                 'mean_prop_starrating',
       'mean_prop_review_score', 'mean_prop_brand_bool',
       'mean_prop_location_score1', 'mean_prop_location_score2',
       'mean_prop_log_historical_price','cluster__0','cluster__1','cluster__2','cluster__3','cluster__4','cluster__5']
labelToPredict = 'click_bool'
neededVars = ['srch_id','booking_bool']
nameOfModel = "models/6mayEnsemble"

In [4]:
dfTrain = pd.read_csv('../data/80PercentTrainWithMean.csv', delimiter=",",usecols=featuresToUse+[labelToPredict]+neededVars)
dfTestAll = pd.read_csv('../data/20PercentTestWithMean.csv', delimiter=",",usecols=featuresToUse+[labelToPredict]+neededVars)
dfTestAll['y_click_pred'] = 0

In [5]:
clusterNames = ['cluster__0','cluster__1','cluster__2','cluster__3','cluster__4','cluster__5']
currentCluster = 0

In [6]:
for i in range(0,len(clusterNames)):
    currentCluster = i
    curDF = dfTrain[dfTrain[clusterNames[currentCluster]] == 1]
    dfNeeded= curDF[neededVars+[labelToPredict]]
    Y = curDF[labelToPredict]
    curDF.drop(neededVars+[labelToPredict],axis=1,inplace=True)
    # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
    #xgmat = xgboost.DMatrix( df, label=y_train)
    xgmat = xgboost.DMatrix( curDF, label=Y)
    # setup parameters for xgboost
    param = {}
    # use logistic regression loss, use raw prediction before logistic transformation
    # since we only need the rank
    param['objective'] = 'reg:logistic'
    # scale weight of positive examples
    param['scale_pos_weight'] = sum(Y==0)/sum(Y==1)
    param['eta'] = 0.1
    param['max_depth'] = 10
    param['eval_metric'] = 'auc'
    param['silent'] = 0


    # you can directly throw param in, though we want to watch multiple metrics here
    plst = list(param.items())

    watchlist = [ (xgmat,'train') ]

    num_round = 120
    print ('loading data end, start to boost trees')
    bst = xgboost.train( plst, xgmat, num_round, watchlist ,early_stopping_rounds=5);
    # save out model
    completeName = nameOfModel+"Cluster"+str(currentCluster)+".model"
    bst.save_model(completeName)

    print ('finish training')
    dfTest = dfTestAll[dfTestAll[clusterNames[currentCluster]] == 1]
    dfNeededTest= dfTest[neededVars+[labelToPredict]]
    Y = dfTest[labelToPredict]
    dfTest.drop(neededVars+[labelToPredict,'y_click_pred'],axis=1,inplace=True)
    xgmat_test = xgboost.DMatrix(dfTest)
    y_pred = bst.predict(xgmat_test)
    predictions = [round(value) for value in y_pred]
    
    dfTest = pd.concat([dfTest,dfNeededTest],axis=1)
    dfTest['y_pred'] = y_pred

    score, gi_sum, gi_sorted_sum = getScore(dfTest.sort_values(by=['srch_id', 'y_pred'], ascending = [True, False]).reset_index(drop=True))
    print("Cluster: "+str(currentCluster) + " current score: "+str(score))
    dfTestAll.loc[dfTestAll[clusterNames[currentCluster]] == 1, 'y_click_pred'] = y_pred
    


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


loading data end, start to boost trees
[0]	train-auc:0.669145
Will train until train-auc hasn't improved in 5 rounds.
[1]	train-auc:0.688159
[2]	train-auc:0.70152
[3]	train-auc:0.708897
[4]	train-auc:0.717044
[5]	train-auc:0.721825
[6]	train-auc:0.727535
[7]	train-auc:0.731705
[8]	train-auc:0.735322
[9]	train-auc:0.738907
[10]	train-auc:0.742614
[11]	train-auc:0.745578
[12]	train-auc:0.74879
[13]	train-auc:0.751955
[14]	train-auc:0.754795
[15]	train-auc:0.757618
[16]	train-auc:0.759805
[17]	train-auc:0.762577
[18]	train-auc:0.765372
[19]	train-auc:0.76775
[20]	train-auc:0.770102
[21]	train-auc:0.772269
[22]	train-auc:0.774934
[23]	train-auc:0.776312
[24]	train-auc:0.778872
[25]	train-auc:0.781245
[26]	train-auc:0.783159
[27]	train-auc:0.785276
[28]	train-auc:0.787099
[29]	train-auc:0.788261
[30]	train-auc:0.789986
[31]	train-auc:0.791558
[32]	train-auc:0.793044
[33]	train-auc:0.794408
[34]	train-auc:0.796357
[35]	train-auc:0.797259
[36]	train-auc:0.799258
[37]	train-auc:0.800609
[38]	t

[86]	train-auc:0.982782
[87]	train-auc:0.983099
[88]	train-auc:0.98336
[89]	train-auc:0.98367
[90]	train-auc:0.983796
[91]	train-auc:0.983839
[92]	train-auc:0.98388
[93]	train-auc:0.984037
[94]	train-auc:0.984528
[95]	train-auc:0.984565
[96]	train-auc:0.98483
[97]	train-auc:0.985053
[98]	train-auc:0.985089
[99]	train-auc:0.985126
[100]	train-auc:0.985842
[101]	train-auc:0.985956
[102]	train-auc:0.985979
[103]	train-auc:0.986187
[104]	train-auc:0.986225
[105]	train-auc:0.986314
[106]	train-auc:0.986463
[107]	train-auc:0.986924
[108]	train-auc:0.987643
[109]	train-auc:0.988278
[110]	train-auc:0.988427
[111]	train-auc:0.988565
[112]	train-auc:0.988583
[113]	train-auc:0.988612
[114]	train-auc:0.988687
[115]	train-auc:0.988738
[116]	train-auc:0.988832
[117]	train-auc:0.98914
[118]	train-auc:0.989229
[119]	train-auc:0.990059
finish training
Cluster: 2 current score: 0.8873459240379391
loading data end, start to boost trees
[0]	train-auc:0.693151
Will train until train-auc hasn't improved in 

[49]	train-auc:0.854982
[50]	train-auc:0.85529
[51]	train-auc:0.85682
[52]	train-auc:0.857769
[53]	train-auc:0.858334
[54]	train-auc:0.858485
[55]	train-auc:0.859001
[56]	train-auc:0.859703
[57]	train-auc:0.861823
[58]	train-auc:0.86301
[59]	train-auc:0.863767
[60]	train-auc:0.864422
[61]	train-auc:0.865112
[62]	train-auc:0.866639
[63]	train-auc:0.867118
[64]	train-auc:0.868197
[65]	train-auc:0.869971
[66]	train-auc:0.870357
[67]	train-auc:0.871287
[68]	train-auc:0.871368
[69]	train-auc:0.872184
[70]	train-auc:0.872889
[71]	train-auc:0.873637
[72]	train-auc:0.874382
[73]	train-auc:0.875903
[74]	train-auc:0.876079
[75]	train-auc:0.876954
[76]	train-auc:0.877875
[77]	train-auc:0.87895
[78]	train-auc:0.879667
[79]	train-auc:0.87977
[80]	train-auc:0.881632
[81]	train-auc:0.882245
[82]	train-auc:0.883598
[83]	train-auc:0.884308
[84]	train-auc:0.884521
[85]	train-auc:0.884663
[86]	train-auc:0.885319
[87]	train-auc:0.885464
[88]	train-auc:0.885864
[89]	train-auc:0.886025
[90]	train-auc:0.8865

Lets combine it all

In [8]:
score, gi_sum, gi_sorted_sum = getScore(dfTestAll.sort_values(by=['srch_id', 'y_click_pred'], ascending = [True, False]).reset_index(drop=True))
print("Total test score: "+str(score))

Total test score: 0.6589575279756446
