In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn import cross_validation 
import warnings
warnings.filterwarnings("ignore")
import operator

In [2]:
df_header = pd.read_csv('header.csv')
df_header['name'].tolist()
df_train = pd.read_csv('training.tsv', sep = '\t', names = df_header['name'].tolist())
df_test = pd.read_csv('test.tsv', sep = '\t', names = df_header['name'].tolist())
#df_test

In [3]:
y = df_train['sales']
y[y>0] = 1
x = df_train.drop(['name', 'description', 'manufacturer_name', 'coupon_description', 'orig_price', \
                   'clicks', 'sales'], axis = 1, inplace = False)
x_train, x_valid, y_train, y_valid = cross_validation.train_test_split(x, y, test_size=0.2)

x_test = df_test.drop(['name', 'description', 'manufacturer_name', 'coupon_description', 'orig_price',\
                       'clicks', 'sales'], axis = 1, inplace = False)
x_train.fillna(0, inplace = True)
x_test.fillna(0, inplace = True)

In [4]:
x_train.fillna(0, inplace = True)
x_valid.fillna(0, inplace = True)
d_train = xgb.DMatrix(x_train, y_train)
d_valid = xgb.DMatrix(x_valid, y_valid)

In [17]:
param = {'bst:max_depth':5, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic', 'subsample':0.5, \
        'min_samples_split': 800, 'min_samples_leaf':50, 'lambda': 1, 'gamma': 0}
param['nthread'] = 4
param['eval_metric'] = 'auc'
evallist  = [(d_valid,'eval'), (d_train,'train')]

In [18]:
num_round = 500
bst = xgb.train(param, d_train, num_round, evals=evallist, early_stopping_rounds=1)

Will train until train error hasn't decreased in 1 rounds.
[0]	eval-auc:0.500000	train-auc:0.500000
[1]	eval-auc:0.637242	train-auc:0.624645
[2]	eval-auc:0.635538	train-auc:0.658780
[3]	eval-auc:0.645062	train-auc:0.682736
[4]	eval-auc:0.639535	train-auc:0.690439
[5]	eval-auc:0.630884	train-auc:0.700765
[6]	eval-auc:0.589791	train-auc:0.702856
[7]	eval-auc:0.597200	train-auc:0.711389
[8]	eval-auc:0.608839	train-auc:0.723990
[9]	eval-auc:0.612870	train-auc:0.729863
[10]	eval-auc:0.615007	train-auc:0.742738
[11]	eval-auc:0.602546	train-auc:0.750211
[12]	eval-auc:0.591806	train-auc:0.759750
[13]	eval-auc:0.587759	train-auc:0.765246
[14]	eval-auc:0.581603	train-auc:0.774038
[15]	eval-auc:0.588853	train-auc:0.778280
[16]	eval-auc:0.599013	train-auc:0.786474
[17]	eval-auc:0.598216	train-auc:0.791006
[18]	eval-auc:0.591534	train-auc:0.796452
[19]	eval-auc:0.598361	train-auc:0.801831
[20]	eval-auc:0.589423	train-auc:0.804884
[21]	eval-auc:0.595993	train-auc:0.813824
[22]	eval-auc:0.599855	trai

In [7]:
from matplotlib import pylab as plt
features = list(x_train.columns)
outfile = open('xgb.fmap', 'w')
i = 0
for feat in features:
  outfile.write('{0}\t{1}\tq\n'.format(i, feat))
  i = i + 1
outfile.close()
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

# Plotitup
plt.figure()
df.plot()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('Feature_Importance_xgb.png')

In [8]:
score = bst.get_fscore()
sorted_x = sorted(score.items(), key=operator.itemgetter(1), reverse = True)

In [9]:
train_probs = bst.predict(xgb.DMatrix(x_test))
indices = train_probs < 0
train_probs[indices] = 0
submission = pd.DataFrame({"deal_id": df_test["deal_id"], "prediction": train_probs})
submission.to_csv("xgboost_script_submission.csv", index = False)

In [10]:
train_probs

array([ 0.49853516,  0.49853516,  0.49853516, ...,  0.49853516,
        0.49853516,  0.49853516], dtype=float32)