In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import KFold, LabelKFold
from sklearn.cross_validation import LabelShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [40]:
random_state = 42

In [41]:
df = pd.read_csv("data_encoded.csv")
data_x = df.drop(["status", "filename"], 1).values
data_y = df.status.values
labels = df["filename"].values
# train_x, train_y, test_x, test_y, train_labels, test_labels
train_labels, test_labels = list(LabelShuffleSplit(labels, n_iter=1, train_size=0.75, random_state=random_state))[0]
train_x = data_x[train_labels]
train_y = data_y[train_labels]
test_x = data_x[test_labels]
test_y = data_y[test_labels]
cv_train = LabelKFold(train_labels, n_folds=10)

In [42]:
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio
    return (dtrain, dtest, param)

In [1]:
estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=42,
    missing=np.nan,
    nthread=1,
    reg_alpha=0.5,
    reg_lambda=5
)


param_grid = {
    'max_depth':range(6,12,2),
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [300, 700, 1000],
    "colsample_bytree": [0.5, 0.8, 1],
    "subsample": [0.5, 0.8, 1],
}

gsearch = GridSearchCV(estimator=estimator,
                       param_grid=param_grid,
                       scoring='roc_auc',
                       n_jobs=48,
                       iid=False,
                       cv=cv_train,
                       verbose=1
                      )

NameError: name 'xgb' is not defined

In [44]:
result_train = gsearch.fit(train_x, train_y)

Fitting 10 folds for each of 243 candidates, totalling 2430 fits


[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    6.3s
[Parallel(n_jobs=48)]: Done 354 tasks      | elapsed:   17.9s
[Parallel(n_jobs=48)]: Done 704 tasks      | elapsed:   29.9s
[Parallel(n_jobs=48)]: Done 1154 tasks      | elapsed:   58.5s
[Parallel(n_jobs=48)]: Done 1704 tasks      | elapsed:  1.5min
[Parallel(n_jobs=48)]: Done 2430 out of 2430 | elapsed:  2.4min finished


In [45]:
print gsearch.best_params_
print gsearch.best_score_

{'n_estimators': 300, 'subsample': 1, 'learning_rate': 0.05, 'colsample_bytree': 0.5, 'max_depth': 6}
0.923497650174


In [46]:
from sklearn.metrics import accuracy_score
train_params = gsearch.best_params_
est_train = gsearch.best_estimator_
y_pred = est_train.predict(test_x)
roc_auc = roc_auc_score(test_y, y_pred)
accuracy = accuracy_score(test_y, y_pred)
print("Accuracy: " + str(accuracy))
print("ROC score: " + str(roc_auc))

Accuracy: 0.804347826087
ROC score: 0.6236481615


In [47]:
cv_data = LabelKFold(labels, n_folds=10)
gsearch.cv = cv_data
result_all = gsearch.fit(data_x, data_y)


Fitting 10 folds for each of 243 candidates, totalling 2430 fits


[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:    8.0s
[Parallel(n_jobs=48)]: Done 354 tasks      | elapsed:   24.3s
[Parallel(n_jobs=48)]: Done 704 tasks      | elapsed:   40.7s
[Parallel(n_jobs=48)]: Done 1154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=48)]: Done 1704 tasks      | elapsed:  2.0min
[Parallel(n_jobs=48)]: Done 2430 out of 2430 | elapsed:  3.2min finished


In [48]:
print gsearch.best_params_
print gsearch.best_score_

{'n_estimators': 300, 'subsample': 0.8, 'learning_rate': 0.2, 'colsample_bytree': 0.8, 'max_depth': 6}
0.804133855075
