In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn import cross_validation 
import warnings
warnings.filterwarnings("ignore")
import operator

In [2]:
df_header = pd.read_csv('header.csv')
df_header['name'].tolist()
df_train = pd.read_csv('training.tsv', sep = '\t', names = df_header['name'].tolist())
df_test = pd.read_csv('test.tsv', sep = '\t', names = df_header['name'].tolist())
#df_test

In [3]:
y = df_train['sales']
y[y>0] = 1
x = df_train.drop(['deal_id','name', 'description', 'sku', 'manufacturer_name', 'coupon_description', 'clicks', 'sales'],\
                        axis = 1, inplace = False)

x_train, x_valid, y_train, y_valid = cross_validation.train_test_split(x, y, test_size=0.2)

x_test = df_test.drop(['deal_id','name', 'description', 'sku', 'manufacturer_name', 'coupon_description', 'clicks', 'sales'],\
                        axis = 1, inplace = False)

In [4]:
d_train = xgb.DMatrix(x_train, y_train)
d_valid = xgb.DMatrix(x_valid, y_valid)

In [5]:
param = {'bst:max_depth':6, 'bst:eta':0.0001, 'silent':1, 'objective':'binary:logistic', 'subsample':0.5, \
        'min_samples_split': 800, 'min_samples_leaf':50, 'lambda': 0.8, 'gamma': 0.2}
param['nthread'] = 4
param['eval_metric'] = 'auc'
evallist  = [(d_valid,'eval'), (d_train,'train')]

In [6]:
num_round = 100
bst = xgb.train(param, d_train, num_round, evals=evallist, early_stopping_rounds=1)

Will train until train error hasn't decreased in 1 rounds.
[0]	eval-auc:0.501956	train-auc:0.500984
[1]	eval-auc:0.501956	train-auc:0.500984
Stopping. Best iteration:
[0]	eval-auc:0.501956	train-auc:0.500984



In [7]:
from matplotlib import pylab as plt
features = list(x_train.columns)
outfile = open('xgb.fmap', 'w')
i = 0
for feat in features:
  outfile.write('{0}\t{1}\tq\n'.format(i, feat))
  i = i + 1
outfile.close()
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

# Plotitup
plt.figure()
df.plot()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('Feature_Importance_xgb.png')

In [8]:
score = bst.get_fscore()
sorted_x = sorted(score.items(), key=operator.itemgetter(1), reverse = True)

In [9]:
train_probs = bst.predict(xgb.DMatrix(x_test))
indices = train_probs < 0
train_probs[indices] = 0
submission = pd.DataFrame({"deal_id": df_test["deal_id"], "prediction": train_probs})
submission.to_csv("xgboost_script_submission.csv", index = False)

In [10]:
train_probs

array([ 0.2612623,  0.2612623,  0.2612623, ...,  0.2612623,  0.2612623,
        0.2612623], dtype=float32)