In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
seed = 2
from xgboost import XGBClassifier
import pickle

In [2]:
#train = pd.read_csv('./data/new_train.csv')
train = pd.read_csv('./data/train_sample.csv')

In [3]:
sample_size = int(1e2)
try :
    sample = train[train.target == 1].sample(n=sample_size/2, random_state=seed)
    sample = sample.append( train[train.target == 0].sample(n=sample_size/2, random_state=seed))
except ValueError :
    sample = train[train.target == 1]
    sample = sample.append( train[train.target == 0].sample(n=sample_size-21694, random_state=seed) )

sample = sample.sample(frac=1, random_state=seed).reset_index(drop=True)
#sample.to_csv('./data/train_sample.csv', index=False)
train = sample

In [4]:
target = train.target
train.drop(['target','id'], inplace=True, axis=1)
train.drop([ col for col in train.columns if col.startswith('ps_cont') ],axis=1, inplace=True)

In [5]:
with open('./data/OneHotEncoder.clf', 'rb') as f:
    encoders = pickle.load(f)

In [6]:
enc_train = None

for feature,encoder in zip(train.columns,encoders) :
    encoded = encoder.transform(train[feature].values.reshape(-1,1))
    if enc_train is None :
        enc_train = encoded
    else :
        enc_train = np.concatenate((enc_train, encoded), axis=1)

In [7]:
train.shape

(100, 52)

In [8]:
#with open('./data/model.pkl','rb') as f :
#    model = pickle.load(f)

In [9]:
#with open('./data/model.pkl', 'wb') as f:
#    pickle.dump(file=f, obj=model)

In [10]:
#def split_data(X, Y, ratio=0.1):
#    test_ids = np.random.randint(0, X.shape[0], int(X.shape[0] * ratio))
#    x_test = X[test_ids]
#    y_test = Y[test_ids]
#    x_train = np.delete(X, test_ids, axis=0)
#    y_train = Y.drop(test_ids)
#    return x_train, x_test, y_train, y_test

#x_train, x_test, y_train, y_test = split_data(enc_train, target, .1)

In [11]:
#model.fit(x_train,y_train)

In [12]:
#model = XGBClassifier(**params)

In [13]:
#model.fit(enc_train, target)

In [14]:
#from sklearn.metrics import classification_report
#pred = model.predict(enc_train)
#print classification_report(y_true=target, y_pred=pred)

In [15]:
#import xgboost_optimizer

In [16]:
#xgboost_optimizer.xgboost_optimizer(enc_train, target)

In [17]:
params = {

    #-----------------------------------------------------------------------
    # dealing with imblanced data
    #-------------------------------------------------------------------------------
    'max_delta_step':0,
    #Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0,
    #it means there is no constraint. If it is set to a positive value, it can help making the update step more 
    #conservative. Usually this parameter is not needed, but it might help in logistic regression when class is
    #extremely imbalanced. Set it to value of 1-10 might help control the update.
    #default:0, range[0-inf]
    'scale_pos_weight' : 1,  # control balance between +ve and -ve weights, default:1
    
    #--------------------------------------------------------------------------------------
    #  Regularization
    #-------------------------------------------------------------------------------
    'alpha' : 0,
    #L1 regularization term on weights, increase this value will make model more conservative, default:0
    'lambda' : 1,
    #L2 regularization term on weights, increase this value will make model more conservative, default1.
    
    #------------------------------------------------------
    # add randomness to make training robust to noise
    #----------------------------------------------------------------
    'subsample' : 1,
    #ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data 
    #instances to grow trees and this will prevent overfitting. default:1, range[0-1]
    'colsample_bytree' : 1, #subsample ratio of columns when constructing each tree.default:1, range[0-1]
    'colsample_bylevel' : 1, #subsample ratio of columns for each split, in each level. default:1, range[0-1]
    
    #-------------------------------------------------------------
    # Tree parameters
    #--------------------------------------------------
    'min_child_weight': 4, # minimum sum of instance weight (hessian) needed in a child. 
    # the more conservative the algorithm will be. default:1, range[0-inf]
    'max_depth' : 12,
    # maximum depth of a tree, increase this value will make the model more complex / likely to be overfitting
    # default : 6, range[0-inf]
    
    #-----------------------------------------------------------------------
    # LOSS reduction
    #------------------------------------------
    'gamma':0, #minimum loss reduction required to make a further partition on a leaf node of the tree.
    #The larger, the more conservative the algorithm will be. default : 0, range[0-inf]
    'eta':0.3, #step size shrinkage used in update to prevents overfitting. default:0.3, range[0-1]
    
    #---------------------------------------------------------------------------------------------------
    #Learning Task Parameters
    #------------------------------------------------------------------------------
    'objective': 'reg:logistic', # reg:linear, reg:logistic, binary:logistic, multi:softmax
    'eval_metric' : 'logloss', # error for binary class., merror for multiclass classification,
    # "map" Mean Average Precesion.
    'random_seed' : 0,
    'n_jobs' : 4
}

In [18]:
cv_params1 = {
    'max_delta_step': range(0,3,2)
}
cv_params2 = {
    'scale_pos_weight': range(1,10,2)
}
cv_params3 = {
    'min_child_weight': range(3,10,2),
    'max_depth' : range(3,10,2)
}
cv_params4 = {
    'min_child_weight': [4,3],
    'max_depth' : [10,11,12]
}
cv_params5 = {
    'gamma': np.arange(0, 0.5, 0.1)
}
cv_params6 = {
    'subsample': np.arange(0.1, 0.6, 0.1),
    'colsample_bylevel': np.arange(0.1, 0.6, 0.1)
}
cv_params7 = {
 'subsample':[.35,.4,.45],
 'colsample_bylevel':[.35,.4,.45]
}
cv_params8 = {
 'subsample':[.35,.4,.45],
 'colsample_bylevel':[.35,.4,.45]
}
cv_params9 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
}


In [23]:
model = XGBClassifier(**params)
gsearch = GridSearchCV(model, param_grid = cv_params3, cv=2, verbose=50, scoring='f1' )


In [24]:
gsearch.fit(enc_train, target)
print gsearch.best_params_,'\n' ,gsearch.best_score_


Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] max_depth=3, min_child_weight=3 .................................
[CV] .. max_depth=3, min_child_weight=3, score=0.500000, total=   0.1s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[CV] max_depth=3, min_child_weight=3 .................................
[CV] .. max_depth=3, min_child_weight=3, score=0.490566, total=   0.1s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[CV] max_depth=3, min_child_weight=5 .................................
[CV] .. max_depth=3, min_child_weight=5, score=0.576923, total=   0.1s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[CV] max_depth=3, min_child_weight=5 .................................
[CV] .. max_depth=3, min_child_weight=5, score=0.571429, total=   0.1s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[CV] max_depth=3, min_child_weight=7 ..............

  'precision', 'predicted', average, warn_for)


[CV] .. max_depth=3, min_child_weight=7, score=0.000000, total=   0.1s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[CV] max_depth=3, min_child_weight=7 .................................
[CV] .. max_depth=3, min_child_weight=7, score=0.000000, total=   0.1s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s
[CV] max_depth=3, min_child_weight=9 .................................
[CV] .. max_depth=3, min_child_weight=9, score=0.000000, total=   0.1s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s
[CV] max_depth=3, min_child_weight=9 .................................
[CV] .. max_depth=3, min_child_weight=9, score=0.000000, total=   0.1s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s
[CV] max_depth=5, min_child_weight=3 .................................
[CV] .. max_depth=5, min_child_weight=3, score=0.500000, total=   0.1s
[Parallel(n_jobs=1)]: Done   9 out of   9

GridSearchCV(cv=2, error_score='raise',
       estimator=XGBClassifier(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eta=0.3, eval_metric='logloss', gamma=0,
       lambda=1, learning_rate=0.1, max_delta_step=0, max_depth=12,
       min_child_weight=4, missing=None, n_estimators=100, n_jobs=4,
       nthread=None, objective='reg:logistic', random_seed=0,
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5, 7, 9], 'min_child_weight': [3, 5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=50)

{'max_depth': 3, 'min_child_weight': 5} 
0.574175824176


In [None]:
n_jobs=4,pre_dispatch=4,

In [None]:
import sys
#importlib.reload(GridSearchCV)

In [None]:
reload(sys.modules['sklearn.model_selection']).GridSearchCV