In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn import cross_validation 
import warnings
warnings.filterwarnings("ignore")
import operator

In [2]:
#Read data into dataframe and combine data with header
df_header = pd.read_csv('header.csv')
df_header['name'].tolist()
df_train = pd.read_csv('training.tsv', sep = '\t', names = df_header['name'].tolist())
df_test = pd.read_csv('test.tsv', sep = '\t', names = df_header['name'].tolist())

In [3]:

y = df_train['sales']  #Use sales as label
y[y>0] = 1 #Change sales to binary variable since we are trying to get probability as prediction

#Drop deal_id since it should be inrevelant to sales and others since they have id associated with
#The reason why I dropped clicks is that there is no clicks in the test set as clicks also is after-the-fact metrics
x = df_train.drop(['deal_id','name', 'description', 'manufacturer_name', 'coupon_description',\
                  'clicks','sales'], axis = 1)

#Split data to training set as well as CV set
x_train, x_valid, y_train, y_valid = cross_validation.train_test_split(x, y, test_size=0.3)

x_test = df_test.drop(['deal_id','name', 'description', 'manufacturer_name', 'coupon_description',\
                     'clicks','sales'], axis = 1)

#Clean those missing values as 0
x_train.fillna(0, inplace = True)
x_test.fillna(0, inplace = True)
x_valid.fillna(0, inplace = True)

In [4]:
d_train = xgb.DMatrix(x_train, y_train)
d_valid = xgb.DMatrix(x_valid, y_valid)

In [5]:
param = {'bst:max_depth':10, 'bst:eta':0.155, 'silent':1, 'objective':'binary:logistic','lambda':1.03,\
         'subsample':0.5,'min_samples_split': 800, 'min_samples_leaf':50}
param['nthread'] = 4
param['eval_metric'] = 'auc'
evallist  = [(d_valid,'eval'), (d_train,'train')]

In [6]:
num_boost_round = 200
bst = xgb.train(param, d_train, num_boost_round, evals=evallist, early_stopping_rounds=1)

Will train until train error hasn't decreased in 1 rounds.
[0]	eval-auc:0.499933	train-auc:0.500558
[1]	eval-auc:0.501314	train-auc:0.501095
[2]	eval-auc:0.588340	train-auc:0.597689
[3]	eval-auc:0.597894	train-auc:0.611990
[4]	eval-auc:0.601975	train-auc:0.620387
[5]	eval-auc:0.618271	train-auc:0.644497
[6]	eval-auc:0.625681	train-auc:0.653925
[7]	eval-auc:0.627205	train-auc:0.658941
[8]	eval-auc:0.629057	train-auc:0.666658
[9]	eval-auc:0.632399	train-auc:0.680526
[10]	eval-auc:0.635295	train-auc:0.701973
[11]	eval-auc:0.640178	train-auc:0.713566
[12]	eval-auc:0.644375	train-auc:0.733233
[13]	eval-auc:0.641443	train-auc:0.743742
[14]	eval-auc:0.640334	train-auc:0.765297
[15]	eval-auc:0.642873	train-auc:0.780092
[16]	eval-auc:0.645660	train-auc:0.796645
[17]	eval-auc:0.650579	train-auc:0.805368
[18]	eval-auc:0.664078	train-auc:0.827495
[19]	eval-auc:0.671259	train-auc:0.841405
[20]	eval-auc:0.673446	train-auc:0.849384
[21]	eval-auc:0.677365	train-auc:0.861658
[22]	eval-auc:0.674319	trai

In [7]:
from matplotlib import pylab as plt
features = list(x_train.columns)
outfile = open('xgb.fmap', 'w')
i = 0
for feat in features:
  outfile.write('{0}\t{1}\tq\n'.format(i, feat))
  i = i + 1
outfile.close()
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

# Plotitup
plt.figure()
df.plot()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('Feature_Importance_xgb.png')

In [8]:
score = bst.get_fscore()
sorted_x = sorted(score.items(), key=operator.itemgetter(1), reverse = True)

In [9]:
train_probs = bst.predict(xgb.DMatrix(x_test))
#indices = train_probs < 0
#train_probs[indices] = 0
submission = pd.DataFrame({"deal_id": df_test["deal_id"], "prediction": train_probs})
submission.to_csv("xgboost_script_submission.csv", index = False)