In [1]:
import glob
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
# get datadir
import platform
uname = platform.uname()[0]
if uname == 'Linux':
    datadir = '/home/kcavagnolo/ml_fun/santander_cs/data/'
elif uname == 'Darwin':
    datadir = '/Users/cavagnolo/ml_fun/santander_cs/data/'
else:
    raise OSError("Unknown system: " + str(uname))
    
files = sorted(glob.glob(datadir + '*.csv'))
hdf_file = datadir + 'features.h5'

In [3]:
# reopen hdf store
hdf = pd.HDFStore(hdf_file)
df_all = hdf['df_all']
hdf.close()

In [4]:
df_all.shape

(151838, 326)

In [5]:
cond = df_all['is_train'] == True
train = df_all[cond].copy()
labels = train['TARGET']

cond = df_all['is_train'] == False
test = df_all[cond].copy()

dcol = ['is_train', 'ID', 'TARGET']
train.drop(dcol, 1, inplace=True)
test.drop(dcol, 1, inplace=True)

sample = pd.read_csv(datadir + 'sample_submission.csv')

In [6]:
print train.shape
print test.shape
print sample.shape
print labels.shape

(76020, 323)
(75818, 323)
(75818, 2)
(76020,)


In [7]:
params = {'objective': "binary:logistic",
          'booster': "gbtree",
          'eval_metric': "auc",
          'eta': 0.02,
          'max_depth': 6,
          'subsample': 1,
          'colsample_bytree': 0.85
         }
num_round = 500

In [8]:
xgtrain = xgb.DMatrix(train, label=labels)
watchlist = [(xgtrain, 'train')]
clf = xgb.train(params, xgtrain, num_round, watchlist, early_stopping_rounds=50)

Will train until train error hasn't decreased in 50 rounds.
[0]	train-auc:0.928975
[1]	train-auc:0.930782
[2]	train-auc:0.931418
[3]	train-auc:0.945000
[4]	train-auc:0.945489
[5]	train-auc:0.940710
[6]	train-auc:0.941428
[7]	train-auc:0.942572
[8]	train-auc:0.944136
[9]	train-auc:0.944991
[10]	train-auc:0.946584
[11]	train-auc:0.947591
[12]	train-auc:0.948646
[13]	train-auc:0.948834
[14]	train-auc:0.948658
[15]	train-auc:0.946795
[16]	train-auc:0.945776
[17]	train-auc:0.943926
[18]	train-auc:0.944542
[19]	train-auc:0.945406
[20]	train-auc:0.945576
[21]	train-auc:0.946170
[22]	train-auc:0.946225
[23]	train-auc:0.946244
[24]	train-auc:0.945341
[25]	train-auc:0.948697
[26]	train-auc:0.949197
[27]	train-auc:0.949221
[28]	train-auc:0.949166
[29]	train-auc:0.949731
[30]	train-auc:0.949758
[31]	train-auc:0.948314
[32]	train-auc:0.947555
[33]	train-auc:0.947878
[34]	train-auc:0.948039
[35]	train-auc:0.949062
[36]	train-auc:0.949144
[37]	train-auc:0.949812
[38]	train-auc:0.949618
[39]	train-auc

In [9]:
xgtest = xgb.DMatrix(test)

In [10]:
prob = clf.predict(xgtest)

In [11]:
prob

array([ 0.15395899,  0.02063746,  0.00217654, ...,  0.42630044,
        0.87195325,  0.18467094], dtype=float32)

In [12]:
print len(sample['ID'])
print len(prob)

75818
75818


In [13]:
xgbc_prob = pd.DataFrame({
    'ID': sample['ID'].values, 
    'TARGET': prob})

In [14]:
xgbc_prob.to_csv('xgbc.csv', index=False)