In [40]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

In [41]:
X_train_s = np.loadtxt("select_X_train_plus.csv", delimiter = ";")
y_train = np.loadtxt("y_train.csv")
X_test_s = np.loadtxt("select_X_test_plus.csv", delimiter = ";")

In [42]:
def cross_val_pimped(clf, X_train, y_train, n):
    r = np.zeros(n)
    for i in range(n):
        idx_train_1 = (np.where(y_train==1))[0]
        idx_train_0 = (np.where(y_train==0))[0]
        
        idx_test_1 = np.random.choice(idx_train_1, size=278, replace=False)
        idx_test_2 = np.random.choice(idx_train_0, size=1112, replace=False)
        
        idx_train_train_1 = np.setdiff1d(idx_train_1,idx_test_1)
        idx_train_train_2 = np.random.choice(np.setdiff1d(idx_train_0,idx_test_2), size=27994, replace=False)
        
        idx_test = np.r_[idx_test_1, idx_test_2]
        idx_train = np.r_[idx_train_train_1, idx_train_train_2]
        
        X_train_train = X_train[idx_train]
        X_train_test = X_train[idx_test]
        y_train_train = y_train[idx_train]
        y_train_test = y_train[idx_test]
        
        clf.fit(X_train_train, y_train_train)
        y_pred = clf.predict(X_train_test)
        r[i] = (accuracy_score(y_pred, y_train_test))
    print(np.mean(r),np.std(r))
    return r

In [44]:
def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.4f}".format(params['gamma']),
        'colsample_bytree': '{:.4f}'.format(params['colsample_bytree']),
        'scale_pos_weight': int(params['scale_pos_weight']),
        'n_estimators': int(params['n_estimators']),
        'learning_rate': '{:.4f}'.format(params['learning_rate']),
        'eta': '{:.4f}'.format(params['eta']),
        'subsample': '{:.4f}'.format(params['subsample']),
        'num_boost_round': int(params['num_boost_round'])
    }    
    clf = XGBClassifier(n_jobs=4, eval_metric="auc", **params)
    score = cross_val_pimped(clf, X_train_s, y_train, 8).mean()
    print("Score {:.4f} params {}".format(score, params))
    return -score

space = {
    'max_depth': hp.quniform('max_depth', 3, 12, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.6),
    'scale_pos_weight': hp.quniform('scale_pose_weight', 10, 100, 1),
    'n_estimators':  hp.quniform('n_estimators', 200, 400, 20),
    'learning_rate':  hp.uniform('learning_rate', 0.04, 0.15),
    'eta': hp.uniform('eta', 0.01, 0.1),
    'subsample': hp.uniform('subsample', 0.7, 1.0),
    'num_boost_round': hp.quniform('num_boost_round', 10, 220, 20)
    
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=150)

0.917715827338 0.00804290726217
Score 0.9177 params {'max_depth': 4, 'gamma': '0.4720', 'colsample_bytree': '0.7436', 'scale_pos_weight': 39, 'n_estimators': 260, 'learning_rate': '0.0712', 'eta': '0.0884', 'subsample': '0.9401', 'num_boost_round': 140}
0.907823741007 0.00297578994744
Score 0.9078 params {'max_depth': 11, 'gamma': '0.2100', 'colsample_bytree': '0.9091', 'scale_pos_weight': 71, 'n_estimators': 280, 'learning_rate': '0.0485', 'eta': '0.0317', 'subsample': '0.8130', 'num_boost_round': 180}
0.914298561151 0.00540092670903
Score 0.9143 params {'max_depth': 8, 'gamma': '0.1011', 'colsample_bytree': '0.9209', 'scale_pos_weight': 67, 'n_estimators': 240, 'learning_rate': '0.0829', 'eta': '0.0441', 'subsample': '0.7959', 'num_boost_round': 200}
0.923471223022 0.00540691277538
Score 0.9235 params {'max_depth': 4, 'gamma': '0.0979', 'colsample_bytree': '0.7966', 'scale_pos_weight': 83, 'n_estimators': 300, 'learning_rate': '0.1487', 'eta': '0.0400', 'subsample': '0.7100', 'num_bo

0.916366906475 0.00614413340019
Score 0.9164 params {'max_depth': 8, 'gamma': '0.2326', 'colsample_bytree': '0.9905', 'scale_pos_weight': 72, 'n_estimators': 360, 'learning_rate': '0.1057', 'eta': '0.0804', 'subsample': '0.8676', 'num_boost_round': 200}
0.920503597122 0.00610451897427
Score 0.9205 params {'max_depth': 4, 'gamma': '0.0493', 'colsample_bytree': '0.8839', 'scale_pos_weight': 61, 'n_estimators': 380, 'learning_rate': '0.1208', 'eta': '0.0996', 'subsample': '0.8278', 'num_boost_round': 180}
0.92059352518 0.00508630422388
Score 0.9206 params {'max_depth': 5, 'gamma': '0.2698', 'colsample_bytree': '0.9057', 'scale_pos_weight': 50, 'n_estimators': 260, 'learning_rate': '0.1396', 'eta': '0.0707', 'subsample': '0.9006', 'num_boost_round': 160}
0.915287769784 0.00323241020784
Score 0.9153 params {'max_depth': 3, 'gamma': '0.0832', 'colsample_bytree': '0.9450', 'scale_pos_weight': 40, 'n_estimators': 300, 'learning_rate': '0.1490', 'eta': '0.0836', 'subsample': '0.8689', 'num_boos

0.922212230216 0.00768503523578
Score 0.9222 params {'max_depth': 6, 'gamma': '0.2911', 'colsample_bytree': '0.7501', 'scale_pos_weight': 53, 'n_estimators': 240, 'learning_rate': '0.0406', 'eta': '0.0595', 'subsample': '0.7041', 'num_boost_round': 20}
0.919424460432 0.0052866685096
Score 0.9194 params {'max_depth': 5, 'gamma': '0.3936', 'colsample_bytree': '0.7335', 'scale_pos_weight': 32, 'n_estimators': 260, 'learning_rate': '0.0516', 'eta': '0.0808', 'subsample': '0.7346', 'num_boost_round': 40}
0.922661870504 0.00801924345669
Score 0.9227 params {'max_depth': 4, 'gamma': '0.5535', 'colsample_bytree': '0.8935', 'scale_pos_weight': 15, 'n_estimators': 240, 'learning_rate': '0.0789', 'eta': '0.0532', 'subsample': '0.8955', 'num_boost_round': 140}
0.927428057554 0.00615136837218
Score 0.9274 params {'max_depth': 5, 'gamma': '0.4620', 'colsample_bytree': '0.8020', 'scale_pos_weight': 47, 'n_estimators': 280, 'learning_rate': '0.0605', 'eta': '0.0622', 'subsample': '0.8415', 'num_boost_

0.918075539568 0.00611973481326
Score 0.9181 params {'max_depth': 4, 'gamma': '0.4920', 'colsample_bytree': '0.9045', 'scale_pos_weight': 77, 'n_estimators': 280, 'learning_rate': '0.0922', 'eta': '0.0379', 'subsample': '0.9057', 'num_boost_round': 180}
0.915917266187 0.00670491921811
Score 0.9159 params {'max_depth': 3, 'gamma': '0.3739', 'colsample_bytree': '0.9651', 'scale_pos_weight': 26, 'n_estimators': 340, 'learning_rate': '0.0879', 'eta': '0.0559', 'subsample': '0.8825', 'num_boost_round': 120}
0.920053956835 0.00493786111989
Score 0.9201 params {'max_depth': 5, 'gamma': '0.4754', 'colsample_bytree': '0.9502', 'scale_pos_weight': 56, 'n_estimators': 340, 'learning_rate': '0.1260', 'eta': '0.0630', 'subsample': '0.9264', 'num_boost_round': 220}
0.915287769784 0.00824007999205
Score 0.9153 params {'max_depth': 6, 'gamma': '0.3047', 'colsample_bytree': '0.9159', 'scale_pos_weight': 69, 'n_estimators': 260, 'learning_rate': '0.1152', 'eta': '0.0185', 'subsample': '0.8097', 'num_boo

KeyboardInterrupt: 

In [54]:
params = {'max_depth': 4, 'gamma': '0.3197', 'colsample_bytree': '0.9559', 'scale_pos_weight': 48, 'n_estimators': 400, 'learning_rate': '0.1147', 'eta': '0.0289', 'subsample': '0.9040', 'num_boost_round': 200}
clf = XGBClassifier(n_jobs=4, eval_metric="auc", **params)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)

In [55]:
np.sum(y_pred)/len(y_pred)

0.1614196911971125

In [52]:
params = {'max_depth': 4, 
          'gamma': 0.3197, 
          'colsample_bytree': 0.9559, 
          'scale_pos_weight': 48, 
          'n_estimators': 400, 
          'learning_rate': 0.1147, 
          'eta': 0.0289, 
          'subsample': 0.9040, 
          'num_boost_round': 200}
clf = XGBClassifier(n_jobs=4, eval_metric="auc", **params)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)

In [53]:
np.sum(y_pred)/len(y_pred)

0.1614196911971125

In [47]:
y_pred = np.array(y_pred, dtype='bool')
StackingSubmission = pd.DataFrame(y_pred)
random = pd.read_csv("random.csv")
StackingSubmission = pd.DataFrame({'Id': random.Id,'Prediction': y_pred})
StackingSubmission.to_csv("submission.csv", index=False)