In [1]:
import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
import xgboost as xgb

import pickle
import gzip

In [2]:
# Load the preprocessed dataset bundle from a gzip-compressed pickle.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open archives produced by a trusted preprocessing step.
pickle_file = 'processedData.pgz'

with gzip.open(pickle_file, 'rb') as f:
    bundle = pickle.load(f)

# Pull out the four entries this notebook works with, then release the
# container so only the unpacked names stay bound.
train = bundle['train']
test = bundle['test']
features = bundle['features']
target = bundle['target']
del bundle

In [None]:
# Candidate values for the tree-shape parameters.
# NOTE(review): presumably meant for a GridSearchCV run; not used by any
# cell visible in this notebook -- confirm before removing.
tree_params = dict(
    max_depth=[3, 5, 6],
    min_child_weight=[1, 3, 5],
)

# Fixed estimator settings in scikit-learn keyword style.
ind_params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
)

# Booster parameter dict; a later cell passes the name `params` to xgb.train.
params = dict(
    objective="binary:logistic",
    booster="gbtree",
    eval_metric="auc",
    eta=0.1,
    tree_method='exact',
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    silent=1,
    seed=0,
)

# Training-loop settings read by the split/train cell.
num_boost_round = 115        # upper bound on boosting rounds
early_stopping_rounds = 10   # early-stopping patience handed to xgb.train
test_size = 0.1              # fraction of `train` held out for validation

In [10]:
# Cross-validate an XGBClassifier configuration with xgboost's built-in CV.
#
# The classifier settings are kept under their own name (`clf_params`)
# instead of rebinding `params`: the booster-parameter dict called `params`
# (with eval_metric='auc' and seed=0) is what the later xgb.train call reads,
# and overwriting it here silently changed that training run whenever the
# notebook was executed top to bottom.
clf_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1
}

xgbm1 = xgb.XGBClassifier(**clf_params)
# Full training set as a DMatrix; `dtrain` is rebuilt from the train split
# by the later split/train cell.
dtrain = xgb.DMatrix(train[features], train[target])
# 5-fold CV scored by AUC, running up to n_estimators rounds with a patience
# of 50 rounds. This is expensive on the full data (the saved run of this
# cell was interrupted).
cv_result = xgb.cv(xgbm1.get_xgb_params(), dtrain,
                   num_boost_round=xgbm1.get_params()['n_estimators'],
                   nfold=5, metrics='auc', early_stopping_rounds=50, verbose_eval=False)

KeyboardInterrupt: 

In [None]:
# Removed leftover development aid: the IPython-only `xgb.train?` docstring
# lookup is a syntax error outside IPython and adds nothing to a finished
# notebook. Use help(xgb.train) interactively if the signature is needed.

In [None]:
# Hold out a validation split of `train`, then fit a booster on the rest
# with early stopping.
#
# random_state is fixed (0, the same value as the booster's 'seed') so the
# split -- and therefore the early-stopping round and the final predictions --
# is reproducible across kernel restarts. Previously the split was unseeded.
training, validation = train_test_split(train, test_size=test_size, random_state=0)
X_train = training[features]
X_valid = validation[features]
y_train = training[target]
y_valid = validation[target]

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)

# Order matters: per the xgboost docs, early stopping monitors the LAST entry
# in `evals`, so the held-out 'eval' set must come after 'train'.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

xgbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                 early_stopping_rounds=early_stopping_rounds,
                 verbose_eval=True)

In [None]:
# Display the trained booster's `best_iteration` attribute; the prediction
# cell uses this value (+1) as its tree limit.
xgbm.best_iteration

In [None]:
# Score the test set, threshold the scores into 0/1 labels, and write the
# submission file.
#
# ntree_limit is best_iteration + 1.
# NOTE(review): this assumes best_iteration is a 0-based round index -- confirm
# against the installed xgboost version.
result_prob = xgbm.predict(xgb.DMatrix(test[features]), ntree_limit=xgbm.best_iteration+1)

# Fixed: the threshold used to be applied to `result`, which is not defined
# at this point (NameError on a fresh kernel, or a stale array from an earlier
# run on a dirty one). It must be applied to the scores just computed.
result = np.array(result_prob > .5).astype('int')

submission = pd.DataFrame({'activity_id': test['activity_id'].values,
                           'outcome': result})

# index=False keeps the pandas row index out of the CSV, leaving only the
# two submission columns.
submission.to_csv('result.csv', index=False)