In [1]:
from dataset import *
from helpers.ipython import *
from helpers.model import *

%matplotlib inline

# Data

In [2]:
def get_y():
    """Return the 'TARGET' column of the 'train' dataset (used as labels `y`)."""
    target = data.get('train', col='TARGET')
    return target

def get_X(dset='train'):
    """Return the feature matrix for `dset` with the 'TARGET' column dropped.

    dset -- name of the dataset to extract from; defaults to 'train'.
    """
    # Single extraction entry: the 'data' source with no extra argument.
    # NOTE(review): the meaning of the second tuple element (None) is defined
    # by data.extract in the dataset module -- confirm there.
    sources = [('data', None)]
    return data.extract(dset, sources, drop_cols=['TARGET'])

# Training features and labels used by every cell below.
X = get_X()
y = get_y()

# Tuning

In [3]:
# Stratified 5-fold split over the labels; shuffled with a fixed seed so the
# fold assignment is the same on every run.
cv = skl.cross_validation.StratifiedKFold(
    y,
    n_folds=5,
    shuffle=True,
    random_state=1234
)
tuner = TuneXGB(LogisticXGB, X, y, metric='auc', cv=cv)

# Pinned hyper-parameters. Each value is the top row of the corresponding
# score table produced by the tuning cells further down.
# NOTE(review): presumably the tuner holds these fixed while another
# parameter is being searched -- confirm in helpers.model.
tuner.best_max_depth = 5            # from the max_depth / min_child_weight search
tuner.best_min_child_weight = 1
tuner.best_gamma = 0.3              # from the gamma search
tuner.best_subsample = 0.90         # from the subsample / colsample_bytree search
tuner.best_colsample_bytree = 0.55

## max_depth, min_child_weight

In [4]:
# Search over max_depth / min_child_weight combinations. The resulting table
# (displayed in the next cell) has one row per combination with its score and
# tree count.
# NOTE(review): fractional `ntree` values suggest an average over the CV
# folds -- confirm in helpers.model.
scores = tuner.tune_depth()

{'max_depth': 3, 'min_child_weight': 1}
Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.503254
[10]	validation_0-auc:0.801163
[20]	validation_0-auc:0.829140
[30]	validation_0-auc:0.833471
[40]	validation_0-auc:0.836021
[50]	validation_0-auc:0.836986
[60]	validation_0-auc:0.837622
[70]	validation_0-auc:0.837481
[80]	validation_0-auc:0.837712
[90]	validation_0-auc:0.837654
Stopping. Best iteration:
[66]	validation_0-auc:0.837938

Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.500728
[10]	validation_0-auc:0.804382
[20]	validation_0-auc:0.826746
[30]	validation_0-auc:0.834038
[40]	validation_0-auc:0.835907
[50]	validation_0-auc:0.836424
[60]	validation_0-auc:0.836833
[70]	validation_0-auc:0.836965
[80]	validation_0-auc:0.837130
[90]	validation_0-auc:0.836388
Stopping. Best iteration:
[65]	validation_0-auc:0.837521

Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.50

In [5]:
# Show the max_depth / min_child_weight results (columns: score, ntree,
# max_depth, min_child_weight).
scores

Unnamed: 0,score,ntree,max_depth,min_child_weight
0,0.839799,45.8,5,1
13,0.839795,58.0,4,1
1,0.839739,69.4,3,5
2,0.839716,46.2,5,3
14,0.839603,43.2,5,2
15,0.839547,60.6,4,2
16,0.839269,49.0,6,2
3,0.839128,68.4,3,3
4,0.838847,40.8,5,5
5,0.838735,67.4,3,1


## gamma

In [4]:
# Search over gamma values; the resulting table (displayed in the next cell)
# has one row per gamma with its score and tree count.
scores = tuner.tune_gamma()

{'gamma': 0.0}
Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.503322
[10]	validation_0-auc:0.806668
[20]	validation_0-auc:0.835763
[30]	validation_0-auc:0.837860
[40]	validation_0-auc:0.838490
[50]	validation_0-auc:0.838659
[60]	validation_0-auc:0.838712
[70]	validation_0-auc:0.837967
Stopping. Best iteration:
[53]	validation_0-auc:0.839149

Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.500796
[10]	validation_0-auc:0.811281
[20]	validation_0-auc:0.832192
[30]	validation_0-auc:0.835353
[40]	validation_0-auc:0.836735
[50]	validation_0-auc:0.835523
[60]	validation_0-auc:0.834453
Stopping. Best iteration:
[41]	validation_0-auc:0.836987

Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.503220
[10]	validation_0-auc:0.812592
[20]	validation_0-auc:0.826878
[30]	validation_0-auc:0.830156
[40]	validation_0-auc:0.830120
[50]	validation_0-auc:0.830155
Stopping. Best itera

In [5]:
# Show the gamma results (columns: score, ntree, gamma).
scores

Unnamed: 0,score,ntree,gamma
0,0.840055,47.4,0.3
6,0.840001,48.0,0.35
7,0.839996,41.4,0.25
1,0.839878,42.8,0.1
2,0.839799,45.8,0.0
3,0.839702,46.8,0.2
4,0.839627,43.2,0.4


## subsample, colsample_bytree

In [7]:
# Search over subsample / colsample_bytree combinations; the resulting table
# (displayed in the next cell) has one row per combination with its score and
# tree count.
scores = tuner.tune_sample()

{'subsample': 1.0, 'colsample_bytree': 1.0}
Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.503322
[10]	validation_0-auc:0.806685
[20]	validation_0-auc:0.835692
[30]	validation_0-auc:0.838338
[40]	validation_0-auc:0.837720
[50]	validation_0-auc:0.838215
Stopping. Best iteration:
[31]	validation_0-auc:0.838736

Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.500796
[10]	validation_0-auc:0.811349
[20]	validation_0-auc:0.832183
[30]	validation_0-auc:0.835358
[40]	validation_0-auc:0.836851
[50]	validation_0-auc:0.836509
[60]	validation_0-auc:0.835905
Stopping. Best iteration:
[43]	validation_0-auc:0.837467

Will train until validation_0 error hasn't decreased in 25 rounds.
[0]	validation_0-auc:0.503220
[10]	validation_0-auc:0.811253
[20]	validation_0-auc:0.825278
[30]	validation_0-auc:0.829863
[40]	validation_0-auc:0.830536
[50]	validation_0-auc:0.829974
[60]	validation_0-auc:0.830459
[70]	validation_0-auc:

In [8]:
# Show the subsample / colsample_bytree results (columns: score, ntree,
# subsample, colsample_bytree).
scores

Unnamed: 0,score,ntree,subsample,colsample_bytree
25,0.840711,48.6,0.9,0.55
26,0.840517,39.6,0.85,0.65
0,0.840481,44.8,0.9,0.6
28,0.840332,36.6,0.9,0.65
29,0.840279,40.8,0.85,0.55
1,0.840078,39.2,0.8,0.8
2,0.840055,47.4,1.0,1.0
30,0.839998,37.4,0.95,0.6
3,0.839967,41.2,0.7,0.6
4,0.839956,38.4,0.7,0.8


# Training

In [6]:
# Re-run cross-validation at a low learning rate (0.01) with a longer
# early-stopping patience of 100 rounds.
tuner.early_stopping_rounds = 100

# mean=False: the result displayed below has one row per trained model
# (model, ntree, score) rather than a single aggregate.
# progress=True: NOTE(review) presumably enables the per-round log output.
scores = tuner.train_models_cv_score(
    {'learning_rate': 0.01},
    mean=False,
    progress=True
)

Will train until validation_0 error hasn't decreased in 100 rounds.
[0]	validation_0-auc:0.502492
[10]	validation_0-auc:0.503084
[20]	validation_0-auc:0.503609
[30]	validation_0-auc:0.503201
[40]	validation_0-auc:0.503201
[50]	validation_0-auc:0.502929
[60]	validation_0-auc:0.502895
[70]	validation_0-auc:0.502895
[80]	validation_0-auc:0.508068
[90]	validation_0-auc:0.683048
[100]	validation_0-auc:0.727962
[110]	validation_0-auc:0.723640
[120]	validation_0-auc:0.724846
[130]	validation_0-auc:0.725695
[140]	validation_0-auc:0.729974
[150]	validation_0-auc:0.730379
[160]	validation_0-auc:0.735818
[170]	validation_0-auc:0.783658
[180]	validation_0-auc:0.796629
[190]	validation_0-auc:0.805714
[200]	validation_0-auc:0.811451
[210]	validation_0-auc:0.813625
[220]	validation_0-auc:0.817244
[230]	validation_0-auc:0.814915
[240]	validation_0-auc:0.815071
[250]	validation_0-auc:0.817454
[260]	validation_0-auc:0.817115
[270]	validation_0-auc:0.817684
[280]	validation_0-auc:0.818371
[290]	validatio

In [5]:
# Show the per-model results of the low-learning-rate CV run (columns: model,
# ntree, score).
scores

Unnamed: 0,model,ntree,score
4,"LogisticXGB(base_score=0.5, colsample_bylevel=...",980,0.852418
3,"LogisticXGB(base_score=0.5, colsample_bylevel=...",939,0.842085
0,"LogisticXGB(base_score=0.5, colsample_bylevel=...",818,0.839926
1,"LogisticXGB(base_score=0.5, colsample_bylevel=...",924,0.839051
2,"LogisticXGB(base_score=0.5, colsample_bylevel=...",702,0.834107


In [11]:
# Summary of the low-learning-rate CV run. print(...) with a single argument
# behaves the same on Python 2 and Python 3, unlike the `print x` statement.
print(scores['ntree'].mean())   # mean tree count over the trained models
print(scores['score'].mean())   # mean score over the trained models

# Estimate n_estimators for a fit on the full training set: take the largest
# tree count seen in CV and scale it by (full data size / CV training size).
print(scores['ntree'].max() * (float(len(y)) / tuner.get_cv_train_size()))

872.6
0.8415174
1225.02014306


In [5]:
# Final model at learning_rate=0.01. n_estimators=1225 is the estimate printed
# by the preceding summary cell (1225.02...), truncated to an integer.
model = tuner.build_model({
    'n_estimators': 1225,
    'learning_rate': 0.01
})
tuner.early_stopping_rounds = 100
# NOTE(review): presumably attaches CV fold index 1 to the model as its
# evaluation set -- confirm in helpers.model.
tuner.apply_cv(model, 1)
model.eval_verbose = 50   # log the evaluation metric every 50 rounds
# Fit on the full training data.
# NOTE(review): the logged validation AUC rises steadily to ~0.884, well above
# the CV mean of ~0.8415. Presumably the evaluation fold is part of X here, so
# this AUC is not a held-out estimate and early stopping never triggers --
# confirm.
model.fit(X, y)

# print(...) with a single argument works on both Python 2 and Python 3.
print(model.get_params())

Will train until validation_0 error hasn't decreased in 100 rounds.
[0]	validation_0-auc:0.500796
[50]	validation_0-auc:0.504372
[100]	validation_0-auc:0.745246
[150]	validation_0-auc:0.763980
[200]	validation_0-auc:0.829779
[250]	validation_0-auc:0.837147
[300]	validation_0-auc:0.845202
[350]	validation_0-auc:0.850341
[400]	validation_0-auc:0.853805
[450]	validation_0-auc:0.857901
[500]	validation_0-auc:0.861891
[550]	validation_0-auc:0.864476
[600]	validation_0-auc:0.867325
[650]	validation_0-auc:0.869493
[700]	validation_0-auc:0.871171
[750]	validation_0-auc:0.873024
[800]	validation_0-auc:0.874344
[850]	validation_0-auc:0.875850
[900]	validation_0-auc:0.877080
[950]	validation_0-auc:0.878282
[1000]	validation_0-auc:0.879482
[1050]	validation_0-auc:0.880542
[1100]	validation_0-auc:0.881578
[1150]	validation_0-auc:0.882706
[1200]	validation_0-auc:0.883702


{'reg_alpha': 0, 'colsample_bytree': 0.55, 'silent': True, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.01, 'missing': None, 'max_delta_step': 1, 'nthread': -1, 'base_score': 0.5, 'n_estimators': 1225, 'subsample': 0.9, 'reg_lambda': 1, 'seed': 1234, 'min_child_weight': 1, 'objective': 'binary:logistic', 'max_depth': 5, 'gamma': 0.3}


[1224]	validation_0-auc:0.884177


# Submit

In [6]:
# Positive-class probability (column 1 of predict_proba) for every test row,
# indexed like the test dataset.
proba = pd.DataFrame(
    {'TARGET': model.predict_proba(get_X('test'))[:, 1]},
    index=data.get('test').index
)

# Write the submission file; the ./output directory must already exist.
proba.to_csv("./output/xgb_model_tune_v1.csv")