In [1]:
import os, sys
import pickle
sys.path.append("/home/stan/Documents/dev/webEcon/wegroup/")
import utility as u
import numpy as np
import pandas as pd
from sklearn import preprocessing as pre
#from hyperopt import hp
#from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import xgboost as xgb
from sklearn import metrics

Import data into a dataframe

In [2]:
train_file = "data_train.txt"
test_file = "shuffle_data_test.txt"
data_folder = 'data'
columns = ['click', 'weekday', 'hour', 'timestamp', 'log_type', 'user_id', 'user_agent', 'ip', 'region', 'city',
           'ad_exchange', 'domain', 'url', 'anon_url_id', 'ad_slot_id', 'width', 'height', 'visibility', 'format',
           'price', 'creative_id', 'key_page_url', 'advertiser_id', 'user_tags']

train, test = u.import_tr_te(train_file, test_file, columns, data_folder)
train['sample'] = 'train'
test['sample'] = 'test'
test['click'] = np.NaN
join = pd.concat([train, test], ignore_index=True)
join['id'] = join.index  # test real id is "iid"

print("Import of files finished.")

Import of files finished.


Based on EDA assign columns to groups

In [2]:
bad_features = ['log_type', 'anon_url_id', 'advertiser_id']  # no variance
id_features = ['user_id', 'ip', 'domain', 'url', "ad_slot_id", 'creative_id', 'key_page_url']
non_features = ['id', 'sample']  #auxilary vars
trans_features = ['user_agent','timestamp','user_tags']

# few levels, able to convert to numbers: (counts: 2,11,3,3)
# using a randomized classifier will make the random assignment of numbers to classes arbitrary
to_be_cat_feats = ['key_page_url', 'creative_id', 'format', 'ad_exchange']
cont = ['width', 'area', 'height', 'price', 'cost_per_area']

Drop not needed columns

In [None]:
join.drop(bad_features, axis=1, inplace=True)

In [8]:
join.drop([f for f in id_features if f not in to_be_cat_feats], axis=1, inplace=True)

Encode categorical features by numbers

In [9]:
def labelencoder(train_data):
    for f in to_be_cat_feats:
        if train_data[f].dtype == 'object':
            lbl = pre.LabelEncoder()
            lbl.fit(np.unique(list(train_data[f].values)))
            train_data[f] = lbl.transform(list(train_data[f].values))
    return train_data

In [10]:
join = labelencoder(join)

Apply data transform functions

In [11]:
simple_feat_funct = [u.tfidf, u.split_user_agent, u.split_timestamp, u.multiply, u.cost_per_area]
for funct in simple_feat_funct:
    df = funct(join)
    join = pd.concat([join, df], axis=1)
del df

Substitution of categorical encoding of sys and browser by ordinal values

In [12]:
np.unique(join['sys'])

array(['android', 'ios', 'linux', 'mac', 'other', 'windows'], dtype=object)

Numbers chosen in order of sophistication

In [12]:
ordinal_sys={'ios':1,'android':2,'windows':3, 'mac':4,'linux':5,'other':6}

In [8]:
np.unique(join['browser'])

array(['chrome', 'firefox', 'ie', 'maxthon', 'opera', 'other', 'safari',
       'sogou', 'theworld'], dtype=object)

Numbers chosen in order of sophistication

In [13]:
ordinal_browser={'ie':1,'safari':2,'chrome':3, 'opera':4,'firefox':5,'sogou':6,'theworld':6,'maxthon':6,'other':7}

In [14]:
join['sys']=join['sys'].map(lambda x: ordinal_sys[x])

In [15]:
join['browser']=join['browser'].map(lambda x: ordinal_browser[x])

In [16]:
join.drop(trans_features, axis=1, inplace=True)

In [17]:
join.dtypes

ad_exchange        int64
city               int64
click            float64
creative_id        int64
format             int64
height             int64
hour               int64
key_page_url       int64
price              int64
region             int64
sample            object
visibility         int64
weekday            int64
width              int64
id                 int64
user_tags_0      float64
user_tags_1      float64
user_tags_2      float64
user_tags_3      float64
user_tags_4      float64
sys                int64
browser            int64
ty               float64
tm               float64
td               float64
area               int64
cost_per_area    float64
dtype: object

In [18]:
index = pd.MultiIndex.from_arrays([join['sample'],join.index], names=['sample','id'])
join.index=index

In [19]:
pickle.dump(join, open("join_final.p", 'wb'))

Create train and test

In [20]:
train=join.loc['train']

In [21]:
test=join.loc['test']

In [4]:
train.drop(non_features,axis=1,inplace=True)

In [6]:
test.drop(non_features,axis=1,inplace=True)

In [22]:
print("Dumping features to file...")
pickle.dump(train, open("train_final.p", 'wb'))

Dumping features to file...


In [23]:
pickle.dump(test, open("test_final.p", 'wb'))

#### Load df from files

In [None]:
#join = pickle.load(open('join_final.p', "rb")) #if needed

In [3]:
train = pickle.load(open('train_final.p', 'rb'))

In [7]:
dataX = train[[f for f in train.columns if f!='click']]
dataY = train['click']

In [8]:
dataX.columns

Index(['ad_exchange', 'city', 'creative_id', 'format', 'height', 'hour',
       'key_page_url', 'price', 'region', 'visibility', 'weekday', 'width',
       'user_tags_0', 'user_tags_1', 'user_tags_2', 'user_tags_3',
       'user_tags_4', 'sys', 'browser', 'ty', 'tm', 'td', 'area',
       'cost_per_area'],
      dtype='object')

In [5]:
test = pickle.load(open('test_final.p', 'rb'))

In [9]:
test=test[[f for f in test.columns if f!='click']]

Normalize continuous variables

In [10]:
for c in cont:
    dataX[c] = pre.scale(dataX[c])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [11]:
for c in cont:
    test[c] = pre.scale(test[c])



Create validation sample from training data

In [12]:
sz=(dataX.values).shape

In [13]:
from sklearn.cross_validation import train_test_split

In [14]:
train_X, test_X, train_Y, test_Y = train_test_split(dataX, dataY, test_size=0.3, random_state=1234)

In [15]:
xg_train = xgb.DMatrix(train_X, label=train_Y, feature_names=dataX.columns) #
xg_test = xgb.DMatrix(test_X, label=test_Y, feature_names=dataX.columns) #
full_xg_test = xgb.DMatrix(test,feature_names=test.columns) #

Define different configs for xgboost

In [16]:
param = {}
#param['objective'] = 'binary:logistic'
param['objective'] = 'multi:softprob'

num_round = 5
#param['eta'] = 0.5
#param['lambda']=1.1
#param['max_depth'] = 8
#param['min_child_weight']=3
#param['max_delta_step']=10 # as recommended http://xgboost.readthedocs.org/en/latest/param_tuning.html
#param['subsample']=0.6

watchlist = [(xg_train,'train'), (xg_test, 'test')]
param['eval_metric']='mlogloss'
#param['eval_metric'] = 'auc'
param['nthread'] = 3
param['num_class'] = 2

In [44]:
bst = xgb.train(param, xg_train, num_round, evals=watchlist)

[0]	train-mlogloss:0.438207	test-mlogloss:0.438251
[1]	train-mlogloss:0.297442	test-mlogloss:0.297522
[2]	train-mlogloss:0.208807	test-mlogloss:0.208920
[3]	train-mlogloss:0.149569	test-mlogloss:0.149713


Default performance

In [45]:
xgbpred_sample_test = bst.predict(xg_test)
score=metrics.roc_auc_score(test_Y, xgbpred_sample_test.T[1])
score

0.57060347737966022

In [58]:
def hp_search(param_name, array_of_values):
    scores={}
    for v in array_of_values:
        #temporarely change parameter
        param[param_name]=v
        #train model, predict on validation set and calculate auc
        bst = xgb.train(param, xg_train, num_round, evals=watchlist)
        xgbpred_sample_test = bst.predict(xg_test)
        score=metrics.roc_auc_score(test_Y, xgbpred_sample_test.T[1])
        scores[v]=score
        print(str(v)+': '+str(score))
    #set param value to best value
    param[param_name]=max(scores, key=scores.get)

In [62]:
hp_search('eta',[0.5, 0.7, 1])

[0]	train-mlogloss:0.314340	test-mlogloss:0.314413
[1]	train-mlogloss:0.171912	test-mlogloss:0.172041
[2]	train-mlogloss:0.099746	test-mlogloss:0.099925
[3]	train-mlogloss:0.059881	test-mlogloss:0.060113
[4]	train-mlogloss:0.037005	test-mlogloss:0.037288
[0]	train-mlogloss:0.221786	test-mlogloss:0.221888
[1]	train-mlogloss:0.100084	test-mlogloss:0.100260
[2]	train-mlogloss:0.049042	test-mlogloss:0.049295
[3]	train-mlogloss:0.025611	test-mlogloss:0.025933
[4]	train-mlogloss:0.014541	test-mlogloss:0.014941


0.5: 0.725506274127
0.7: 0.725603221859

[0]	train-mlogloss:0.128660	test-mlogloss:0.128805
[1]	train-mlogloss:0.045093	test-mlogloss:0.045343
[2]	train-mlogloss:0.018390	test-mlogloss:0.018748
[3]	train-mlogloss:0.009305	test-mlogloss:0.009790



1: 0.728102245713


[4]	train-mlogloss:0.006308	test-mlogloss:0.006904


In [63]:
hp_search('lambda', [1, 1.05, 1.1])

[0]	train-mlogloss:0.128660	test-mlogloss:0.128805
[1]	train-mlogloss:0.045093	test-mlogloss:0.045343
[2]	train-mlogloss:0.018390	test-mlogloss:0.018748
[3]	train-mlogloss:0.009305	test-mlogloss:0.009790
[4]	train-mlogloss:0.006308	test-mlogloss:0.006904
[0]	train-mlogloss:0.128660	test-mlogloss:0.128806
[1]	train-mlogloss:0.045093	test-mlogloss:0.045343
[2]	train-mlogloss:0.018390	test-mlogloss:0.018748
[3]	train-mlogloss:0.009306	test-mlogloss:0.009783
[4]	train-mlogloss:0.006308	test-mlogloss:0.006893


1: 0.728102245713
1.05: 0.72810958286

[0]	train-mlogloss:0.128660	test-mlogloss:0.128806
[1]	train-mlogloss:0.045093	test-mlogloss:0.045342
[2]	train-mlogloss:0.018391	test-mlogloss:0.018750
[3]	train-mlogloss:0.009309	test-mlogloss:0.009783



1.1: 0.727945288264


[4]	train-mlogloss:0.006310	test-mlogloss:0.006898


In [64]:
hp_search('max_depth',[5,7,10])

[0]	train-mlogloss:0.128663	test-mlogloss:0.128803
[1]	train-mlogloss:0.045104	test-mlogloss:0.045339
[2]	train-mlogloss:0.018412	test-mlogloss:0.018742
[3]	train-mlogloss:0.009348	test-mlogloss:0.009773
[4]	train-mlogloss:0.006376	test-mlogloss:0.006898
[0]	train-mlogloss:0.128658	test-mlogloss:0.128803
[1]	train-mlogloss:0.045085	test-mlogloss:0.045341
[2]	train-mlogloss:0.018366	test-mlogloss:0.018767
[3]	train-mlogloss:0.009257	test-mlogloss:0.009829
[4]	train-mlogloss:0.006240	test-mlogloss:0.006933


5: 0.689974494071
7: 0.728555303556

[0]	train-mlogloss:0.128653	test-mlogloss:0.128808
[1]	train-mlogloss:0.045051	test-mlogloss:0.045366
[2]	train-mlogloss:0.018289	test-mlogloss:0.018794
[3]	train-mlogloss:0.009105	test-mlogloss:0.009898



10: 0.728756504411


[4]	train-mlogloss:0.005964	test-mlogloss:0.007026


In [68]:
hp_search('min_child_weight',[60,100])

[0]	train-mlogloss:0.128664	test-mlogloss:0.128801
[1]	train-mlogloss:0.045111	test-mlogloss:0.045329
[2]	train-mlogloss:0.018428	test-mlogloss:0.018726
[3]	train-mlogloss:0.009364	test-mlogloss:0.009743
[4]	train-mlogloss:0.006373	test-mlogloss:0.006844
[0]	train-mlogloss:0.128664	test-mlogloss:0.128802
[1]	train-mlogloss:0.045116	test-mlogloss:0.045332
[2]	train-mlogloss:0.018437	test-mlogloss:0.018724
[3]	train-mlogloss:0.009378	test-mlogloss:0.009742
[4]	train-mlogloss:0.006394	test-mlogloss:0.006844


60: 0.741761011201
100: 0.741141129836


In [69]:
for num_round in [6, 8, 10]:
    scores={}
    bst = xgb.train(param, xg_train, num_round, evals=watchlist)
    xgbpred_sample_test = bst.predict(xg_test)
    score=metrics.roc_auc_score(test_Y, xgbpred_sample_test.T[1])
    scores[num_round]=score
    print(str(num_round)+': '+str(score))
num_round=max(scores, key=scores.get)

[0]	train-mlogloss:0.128664	test-mlogloss:0.128801
[1]	train-mlogloss:0.045111	test-mlogloss:0.045329
[2]	train-mlogloss:0.018428	test-mlogloss:0.018726
[3]	train-mlogloss:0.009364	test-mlogloss:0.009743
[4]	train-mlogloss:0.006373	test-mlogloss:0.006844
[5]	train-mlogloss:0.005475	test-mlogloss:0.006030
[0]	train-mlogloss:0.128664	test-mlogloss:0.128801
[1]	train-mlogloss:0.045111	test-mlogloss:0.045329
[2]	train-mlogloss:0.018428	test-mlogloss:0.018726
[3]	train-mlogloss:0.009364	test-mlogloss:0.009743
[4]	train-mlogloss:0.006373	test-mlogloss:0.006844
[5]	train-mlogloss:0.005475	test-mlogloss:0.006030
[6]	train-mlogloss:0.005241	test-mlogloss:0.005909
[7]	train-mlogloss:0.005159	test-mlogloss:0.005881


6: 0.740584498072
8: 0.729416090876

[0]	train-mlogloss:0.128664	test-mlogloss:0.128801
[1]	train-mlogloss:0.045111	test-mlogloss:0.045329
[2]	train-mlogloss:0.018428	test-mlogloss:0.018726
[3]	train-mlogloss:0.009364	test-mlogloss:0.009743
[4]	train-mlogloss:0.006373	test-mlogloss:0.006844
[5]	train-mlogloss:0.005475	test-mlogloss:0.006030
[6]	train-mlogloss:0.005241	test-mlogloss:0.005909
[7]	train-mlogloss:0.005159	test-mlogloss:0.005881
[8]	train-mlogloss:0.005121	test-mlogloss:0.005905
[9]	train-mlogloss:0.005098	test-mlogloss:0.005929



10: 0.722389062504


In [73]:
param2 = {
    'booster': 'gblinear',
    'objective': 'binary:logistic',
    'nthread': 3,
    'lambda': 1.1,
}
num_round=1
bst = xgb.train(param2, xg_train, num_round, evals=watchlist)
xgbpred_sample_test = bst.predict(xg_test)


[0]	train-error:0.000708	test-error:0.000777


In [75]:
score=metrics.roc_auc_score(test_Y, xgbpred_sample_test)
score

0.62523794833888735

Final model

In [19]:
num_round = 6 # apply the best num_round value
param['eta'] = 1
param['lambda'] =1.05
param['max_depth'] =10
param['min_child_weight']=60

Model performance on validation set

In [26]:
bst = xgb.train(param, xg_train, num_round, evals=watchlist)
xgbpred_sample_test = bst.predict(xg_test)
score=metrics.roc_auc_score(test_Y, xgbpred_sample_test.T[1])
score

[0]	train-mlogloss:0.128664	test-mlogloss:0.128801
[1]	train-mlogloss:0.045111	test-mlogloss:0.045329
[2]	train-mlogloss:0.018429	test-mlogloss:0.018725
[3]	train-mlogloss:0.009367	test-mlogloss:0.009741
[4]	train-mlogloss:0.006376	test-mlogloss:0.006843
[5]	train-mlogloss:0.005481	test-mlogloss:0.006029
[6]	train-mlogloss:0.005251	test-mlogloss:0.005910
[7]	train-mlogloss:0.005171	test-mlogloss:0.005867


0.73647313116409496

Train on all of the data

In [17]:
full_xg_train = xgb.DMatrix(dataX, label=dataY, feature_names=dataX.columns) #

In [27]:
bst = xgb.train(param, full_xg_train, num_round, evals=watchlist)

[0]	train-mlogloss:0.128672	test-mlogloss:0.128809
[1]	train-mlogloss:0.045119	test-mlogloss:0.045330
[2]	train-mlogloss:0.018437	test-mlogloss:0.018712
[3]	train-mlogloss:0.009374	test-mlogloss:0.009713
[4]	train-mlogloss:0.006385	test-mlogloss:0.006774
[5]	train-mlogloss:0.005487	test-mlogloss:0.005917
[6]	train-mlogloss:0.005246	test-mlogloss:0.005693
[7]	train-mlogloss:0.005170	test-mlogloss:0.005616


In [21]:
bst.get_fscore()

{'ad_exchange': 4,
 'area': 28,
 'browser': 34,
 'city': 92,
 'cost_per_area': 32,
 'creative_id': 16,
 'format': 16,
 'height': 6,
 'hour': 76,
 'price': 56,
 'sys': 40,
 'td': 52,
 'user_tags_0': 98,
 'user_tags_1': 68,
 'user_tags_2': 90,
 'user_tags_3': 56,
 'user_tags_4': 40,
 'visibility': 36,
 'weekday': 32,
 'width': 10}

In [28]:
xgbpred_train = bst.predict(full_xg_train)

In [23]:
metrics.roc_auc_score(dataY, xgbpred_train.T[1]) #auc on train

0.81313437018811985

In [29]:
bst.save_model("xgb.model")

In [30]:
xgbpred_test = bst.predict(full_xg_test)

In [31]:
dummy_pred = pd.DataFrame({'Id':np.arange(1,545422),'Prediction':xgbpred_test.T[1]})
dummy_pred.to_csv('pred_final.csv',index=False)