In [1]:
# https://www.kaggle.com/aharless/xgboost-cv-lb-284?scriptVersionId=1683906

In [2]:
import sys
sys.path.insert(0, '/home/watts/Software/xgboost')

In [50]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from hccEncoding.EncoderForClassification import BayesEncoding,BayesEncodingKfold,LOOEncoding,LOOEncodingKfold
from sklearn.model_selection import train_test_split

In [4]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [5]:
# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


In [6]:
def gini1(solution, submission):
    df = zip(solution, submission, range(len(solution)))
    df = sorted(df, key=lambda x: (x[1],-x[2]), reverse=True)
    rand = [float(i+1)/float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    cumPosFound = [df[0][0]]
    for i in range(1,len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound)-1] + df[i][0])
    Lorentz = [float(x)/totalPos for x in cumPosFound]
    Gini = [Lorentz[i]-rand[i] for i in range(len(df))]
    return sum(Gini)

# def normalized_gini(solution, submission):
#     normalized_gini = gini1(solution, submission)/gini1(solution, solution)
#     return normalized_gini

In [7]:
def normalized_gini(solution, submission):
    normalized_gini = eval_gini(solution, submission)/eval_gini(solution, solution)
    return normalized_gini

In [8]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
     assert( len(actual) == len(pred) )
     all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
     all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
     totalLosses = all[:,0].sum()
     giniSum = all[:,0].cumsum().sum() / totalLosses
 
     giniSum -= (len(actual) + 1) / 2.
     return giniSum / len(actual)
 
def gini_normalized(a, p):
    return eval_gini(a, p) / eval_gini(a, a)

In [9]:
# Normalized Gini Scorer
# gini_scorer = make_scorer(normalized_gini, greater_is_better = True)
def normalized_gini(solution, submission):
    normalized_gini = eval_gini(solution, submission)/eval_gini(solution, solution)
    return normalized_gini
gini_scorer = make_scorer(gini_normalized, greater_is_better = True)

In [10]:
# from olivier
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]

In [11]:
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [12]:
# Read data
df_train = pd.read_csv('../cache/train_X.csv') 
df_test = pd.read_csv('../cache/test_Y.csv')

In [13]:
df_train.isnull().sum()

ps_car_13                            0
ps_reg_03                       107772
ps_ind_05_cat                     5809
ps_ind_03                            0
ps_ind_15                            0
ps_reg_02                            0
ps_car_14                        42620
ps_car_12                            1
ps_car_01_cat                      107
ps_car_07_cat                    11489
ps_ind_17_bin                        0
ps_car_03_cat                   411231
ps_reg_01                            0
ps_car_15                            0
ps_ind_01                            0
ps_ind_16_bin                        0
ps_ind_07_bin                        0
ps_car_06_cat                        0
ps_car_04_cat                        0
ps_ind_06_bin                        0
ps_car_09_cat                      569
ps_car_02_cat                        5
ps_ind_02_cat                      216
ps_car_11                            5
ps_car_05_cat                   266551
ps_calc_09               

In [14]:
df_test.isnull().sum()

ps_car_13                            0
ps_reg_03                       161684
ps_ind_05_cat                     8710
ps_ind_03                            0
ps_ind_15                            0
ps_reg_02                            0
ps_car_14                        63805
ps_car_12                            0
ps_car_01_cat                      160
ps_car_07_cat                    17331
ps_ind_17_bin                        0
ps_car_03_cat                   616911
ps_reg_01                            0
ps_car_15                            0
ps_ind_01                            0
ps_ind_16_bin                        0
ps_ind_07_bin                        0
ps_car_06_cat                        0
ps_car_04_cat                        0
ps_ind_06_bin                        0
ps_car_09_cat                      877
ps_car_02_cat                        5
ps_ind_02_cat                      307
ps_car_11                            1
ps_car_05_cat                   400359
ps_calc_09               

In [15]:
df_train.fillna(-1, inplace=True)

In [16]:
df_test.fillna(-1, inplace=True)

In [17]:
df_train.head()

Unnamed: 0,ps_car_13,ps_reg_03,ps_ind_05_cat,ps_ind_03,ps_ind_15,ps_reg_02,ps_car_14,ps_car_12,ps_car_01_cat,ps_car_07_cat,...,ps_car_08_cat,ps_ind_09_bin,ps_ind_04_cat,ps_ind_18_bin,ps_ind_12_bin,ps_ind_14,ps_reg_01_plus_ps_car_02_cat,ps_reg_01_plus_ps_car_04_cat,id,target
0,0.883679,0.71807,0.0,5,11,0.2,0.37081,0.4,10.0,1.0,...,0,0,1.0,0,0,0,19,70,7,0
1,0.618817,0.766078,0.0,7,3,0.4,0.388716,0.316228,11.0,1.0,...,1,0,0.0,1,0,0,21,80,9,0
2,0.641586,-1.0,0.0,9,12,0.0,0.347275,0.316228,7.0,1.0,...,1,0,1.0,0,0,0,1,0,13,0
3,0.542949,0.580948,0.0,2,8,0.2,0.294958,0.374166,7.0,1.0,...,1,0,0.0,0,0,0,23,90,16,0
4,0.565832,0.840759,0.0,0,9,0.6,0.365103,0.31607,11.0,1.0,...,1,0,1.0,0,0,0,19,70,17,0


In [18]:
df_test.head()

Unnamed: 0,ps_car_13,ps_reg_03,ps_ind_05_cat,ps_ind_03,ps_ind_15,ps_reg_02,ps_car_14,ps_car_12,ps_car_01_cat,ps_car_07_cat,...,ps_calc_05,ps_ind_08_bin,ps_car_08_cat,ps_ind_09_bin,ps_ind_04_cat,ps_ind_18_bin,ps_ind_12_bin,ps_ind_14,ps_reg_01_plus_ps_car_02_cat,ps_reg_01_plus_ps_car_04_cat
0,0.669556,0.610328,0.0,8,12,0.3,0.352136,0.316228,7.0,1.0,...,1,0,1,0,1.0,0,0,0,14,50
1,0.60632,0.771362,0.0,5,5,0.5,0.358329,0.316228,4.0,1.0,...,3,0,1,1,1.0,0,0,0,23,90
2,0.896239,0.916174,0.0,3,10,0.0,0.398497,0.4,11.0,1.0,...,3,0,1,1,0.0,0,0,0,11,40
3,0.65211,-1.0,0.0,6,4,0.2,0.381445,0.374166,7.0,1.0,...,1,0,1,0,0.0,0,0,0,3,10
4,0.812914,0.817771,0.0,7,4,0.4,0.385097,0.374166,11.0,1.0,...,4,0,1,1,0.0,0,0,0,23,90


In [19]:
f_cats = [f for f in df_train.columns if "_cat" in f]

In [20]:
df = pd.read_csv('../cache/test_id.csv')

In [21]:
df.head()

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4


In [22]:
id_test = df['id'].values
id_train = df_train['id'].values
y = df_train['target'].values

In [23]:
sum(y == 0)

573518

In [24]:
sum(y==1)

21694

In [25]:
sp = sum(y==0)/sum(y==1)

In [26]:
sp

26.436710611228911

In [27]:
f_cats

['ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_07_cat',
 'ps_car_03_cat',
 'ps_car_06_cat',
 'ps_car_04_cat',
 'ps_car_09_cat',
 'ps_car_02_cat',
 'ps_ind_02_cat',
 'ps_car_05_cat',
 'ps_car_08_cat',
 'ps_ind_04_cat',
 'ps_reg_01_plus_ps_car_02_cat',
 'ps_reg_01_plus_ps_car_04_cat']

In [28]:
# import category_encoders as ce

# # encoder = ce.BackwardDifferenceEncoder(cols=f_cats)
# encoder = ce.BinaryEncoder(cols=f_cats)
# encoder = ce.HashingEncoder(cols=f_cats)
# encoder = ce.HelmertEncoder(cols=f_cats)
# encoder = ce.OneHotEncoder(cols=f_cats)
# encoder = ce.OrdinalEncoder(cols=f_cats)
# encoder = ce.SumEncoder(cols=f_cats)
# encoder = ce.PolynomialEncoder(cols=f_cats)

# encoder.fit(df_train, y)
# df_train = encoder.transform(df_train)


In [29]:
def target_encode(trn_series=None,    # Revised to encode validation series
#                   val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
#     ft_val_series = pd.merge(
#         val_series.to_frame(val_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#         on=val_series.name,
#         how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
#     # pd.merge does not keep the index so restore it
#     ft_val_series.index = val_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [30]:
for f_cat in f_cats:    
#     df_train,df_test = BayesEncoding(df_train,df_test,'target',f_cat,f=1,noise=0.01)
    df_train,df_test = LOOEncoding(df_train,df_test,'target',f_cat, noise=0.01)
#     df_train,df_test = LOOEncodingKfold(df_train,df_test,'target',f_cat, noise=0.01)

In [31]:
# for f_cat in f_cats:
#     df_train[f_cat + "_avg"], df_test[f_cat + "_avg"] = target_encode(trn_series=df_train[f_cat],
#                                                               tst_series=df_test[f_cat],
#                                                               target=df_train['target'],
#                                                               min_samples_leaf=200,
#                                                               smoothing=10,
#                                                               noise_level=0
#                                                              )

In [32]:
df_train.shape

(595212, 52)

In [33]:
df_test.shape

(892816, 50)

In [34]:
df_train.head()

Unnamed: 0,ps_car_13,ps_reg_03,ps_ind_05_cat,ps_ind_03,ps_ind_15,ps_reg_02,ps_car_14,ps_car_12,ps_car_01_cat,ps_car_07_cat,...,loo_ps_car_06_cat,loo_ps_car_04_cat,loo_ps_car_09_cat,loo_ps_car_02_cat,loo_ps_ind_02_cat,loo_ps_car_05_cat,loo_ps_car_08_cat,loo_ps_ind_04_cat,loo_ps_reg_01_plus_ps_car_02_cat,loo_ps_reg_01_plus_ps_car_04_cat
0,0.883679,0.71807,0.0,5,11,0.2,0.37081,0.4,10.0,1.0,...,0.033972,0.033696,0.033066,0.034048,0.038213,0.040614,0.045007,0.038885,0.036315,0.035762
1,0.618817,0.766078,0.0,7,3,0.4,0.388716,0.316228,11.0,1.0,...,0.031783,0.033352,0.03628,0.033508,0.035814,0.031668,0.034426,0.034653,0.036053,0.036438
2,0.641586,-1.0,0.0,9,12,0.0,0.347275,0.316228,7.0,1.0,...,0.034444,0.033766,0.036341,0.033675,0.041034,0.031927,0.034484,0.038634,0.037381,0.035752
3,0.542949,0.580948,0.0,2,8,0.2,0.294958,0.374166,7.0,1.0,...,0.031731,0.033546,0.035089,0.033557,0.03582,0.040215,0.035005,0.034646,0.037951,0.038162
4,0.565832,0.840759,0.0,0,9,0.6,0.365103,0.31607,11.0,1.0,...,0.0346,0.033779,0.036214,0.033858,0.038014,0.031402,0.035025,0.038389,0.035658,0.035647


In [35]:
for f in f_cats:
    df_train = df_train.drop(f, axis=1)
    df_test = df_test.drop(f, axis=1)

In [36]:
df_train.shape

(595212, 38)

In [37]:
df_test.shape

(892816, 36)

In [38]:
df_train = df_train.drop('id', axis=1)
df_train = df_train.drop('target', axis=1)

In [39]:
df_train.shape

(595212, 36)

In [40]:
X = df_train

In [41]:
X.isnull().sum()

ps_car_13                           0
ps_reg_03                           0
ps_ind_03                           0
ps_ind_15                           0
ps_reg_02                           0
ps_car_14                           0
ps_car_12                           0
ps_ind_17_bin                       0
ps_reg_01                           0
ps_car_15                           0
ps_ind_01                           0
ps_ind_16_bin                       0
ps_ind_07_bin                       0
ps_ind_06_bin                       0
ps_car_11                           0
ps_calc_09                          0
ps_calc_05                          0
ps_ind_08_bin                       0
ps_ind_09_bin                       0
ps_ind_18_bin                       0
ps_ind_12_bin                       0
ps_ind_14                           0
loo_ps_ind_05_cat                   0
loo_ps_car_01_cat                   0
loo_ps_car_07_cat                   0
loo_ps_car_03_cat                   0
loo_ps_car_0

In [42]:
df_test.isnull().sum()

ps_car_13                           0
ps_reg_03                           0
ps_ind_03                           0
ps_ind_15                           0
ps_reg_02                           0
ps_car_14                           0
ps_car_12                           0
ps_ind_17_bin                       0
ps_reg_01                           0
ps_car_15                           0
ps_ind_01                           0
ps_ind_16_bin                       0
ps_ind_07_bin                       0
ps_ind_06_bin                       0
ps_car_11                           0
ps_calc_09                          0
ps_calc_05                          0
ps_ind_08_bin                       0
ps_ind_09_bin                       0
ps_ind_18_bin                       0
ps_ind_12_bin                       0
ps_ind_14                           0
loo_ps_ind_05_cat                   0
loo_ps_car_01_cat                   0
loo_ps_car_07_cat                   0
loo_ps_car_03_cat                   0
loo_ps_car_0

In [43]:
sp

26.436710611228911

In [44]:
wts = []
wts = [26.43671 if i == 1 else 1 for i in y ]

In [45]:
sum(pd.Series(wts) == 26.43671)

21694

In [59]:
# model = XGBClassifier(    
#                         n_estimators=MAX_ROUNDS,
#                         max_depth=4,
#                         objective="binary:logistic",
#                         learning_rate=LEARNING_RATE, 
#                         subsample=.8,
#                         min_child_weight=6,
#                         colsample_bytree=.8,
#                         scale_pos_weight=1.6,
#                         gamma=10,
#                         reg_alpha=8,
#                         reg_lambda=1.3,
#                      )

xgb_model = XGBClassifier(objective='binary:logistic', scale_pos_weight=26.43671, missing=-1)
parameters = {
    'n_estimators': [100,200],
    'learning_rate': [0.03, 0.05, 0.08, 0.1], #so called `eta` value
    'max_depth': sp_randint(1,11),
    'subsample': [0.8, 0.7],
    'colsample_bytree': [0.8, 0.7],
    'gamma': sp_randint(1,10),
    'reg_alpha': sp_randint(0,10),
    'reg_lambda': [1,1.3],
    'seed': [1337]
}


clf = RandomizedSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(y, 5),
                   scoring = gini_scorer,
                   verbose=3, refit=True)


In [60]:
# clf = XGBClassifier(n_estimators=200,
#     max_depth=4,
#     objective="binary:logistic",
#     learning_rate=.1, 
#     subsample=.8, 
#     colsample_bytree=.8,
#     scale_pos_weight=26.43671,
#     gamma=1,
#     reg_alpha=0,
#     reg_lambda=1,
#     missing=-1,
#     nthread=2)

In [61]:
# x1, x2, y1, y2 = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True, stratify=y)

In [62]:
clf.fit(X, y)
# clf.fit(x1, y1, 
#     eval_set=[(x1, y1), (x2, y2)],
#     eval_metric=gini_xgb,
#     early_stopping_rounds=None,
#     verbose=True)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] reg_alpha=8, seed=1337, gamma=9, max_depth=2, n_estimators=100, learning_rate=0.1, subsample=0.7, reg_lambda=1, colsample_bytree=0.8 
[CV] reg_alpha=8, seed=1337, gamma=9, max_depth=2, n_estimators=100, learning_rate=0.1, subsample=0.7, reg_lambda=1, colsample_bytree=0.8 
[CV] reg_alpha=8, seed=1337, gamma=9, max_depth=2, n_estimators=100, learning_rate=0.1, subsample=0.7, reg_lambda=1, colsample_bytree=0.8 
[CV] reg_alpha=8, seed=1337, gamma=9, max_depth=2, n_estimators=100, learning_rate=0.1, subsample=0.7, reg_lambda=1, colsample_bytree=0.8 
[CV] reg_alpha=8, seed=1337, gamma=9, max_depth=2, n_estimators=100, learning_rate=0.1, subsample=0.7, reg_lambda=1, colsample_bytree=0.8 
[CV]  reg_alpha=8, seed=1337, gamma=9, max_depth=2, n_estimators=100, learning_rate=0.1, subsample=0.7, reg_lambda=1, colsample_bytree=0.8, score=0.215782 - 1.6min
[CV] reg_alpha=5, seed=1337, gamma=7, max_depth=4, n_estimators=100, learning_ra

[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed: 21.1min


[CV]  reg_alpha=1, seed=1337, gamma=4, max_depth=7, n_estimators=200, learning_rate=0.1, subsample=0.7, reg_lambda=1.3, colsample_bytree=0.8, score=0.280380 -11.5min
[CV] reg_alpha=0, seed=1337, gamma=2, max_depth=3, n_estimators=200, learning_rate=0.05, subsample=0.8, reg_lambda=1, colsample_bytree=0.7 
[CV]  reg_alpha=1, seed=1337, gamma=4, max_depth=7, n_estimators=200, learning_rate=0.1, subsample=0.7, reg_lambda=1.3, colsample_bytree=0.8, score=0.295245 -11.3min
[CV] reg_alpha=0, seed=1337, gamma=2, max_depth=3, n_estimators=200, learning_rate=0.05, subsample=0.8, reg_lambda=1, colsample_bytree=0.7 
[CV]  reg_alpha=1, seed=1337, gamma=4, max_depth=7, n_estimators=200, learning_rate=0.1, subsample=0.7, reg_lambda=1.3, colsample_bytree=0.8, score=0.270838 -12.0min
[CV] reg_alpha=0, seed=1337, gamma=2, max_depth=3, n_estimators=200, learning_rate=0.05, subsample=0.8, reg_lambda=1, colsample_bytree=0.7 
[CV]  reg_alpha=0, seed=1337, gamma=2, max_depth=3, n_estimators=200, learning_rat

[Parallel(n_jobs=5)]: Done  50 out of  50 | elapsed: 46.7min finished


RandomizedSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=5, shuffle=False, random_state=None),
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=-1, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=26.43671, seed=None,
       silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=10, n_jobs=5,
          param_distributions={'reg_alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb2cced8160>, 'seed': [1337], 'reg_lambda': [1, 1.3], 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb2cced8320>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb2cced8f28>, 'subsample': [0.8, 0.7], 'n_estimators': [100

In [63]:
print(clf.best_score_)
print(clf.best_params_)

# 0.24184412440294284
# {'reg_lambda': 1, 'reg_alpha': 9, 'seed': 1337, 'gamma': 13, 'max_depth': 8, 'silent': 1, 
#  'subsample': 0.7, 'min_child_weight': 4, 'learning_rate': 0.08, 'colsample_bytree': 0.8, 'missing': -999}
# 0.254 on public LB,

# 0.2382713543302345
# {'seed': 1337, 'max_depth': 5, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 5, 
#  'learning_rate': 0.08, 'colsample_bytree': 0.7, 'missing': -999}
# when gamma, reg_alpha, reg_lamda is filetered out, using new gini scorer

# 0.23755666055779237
# {'seed': 1337, 'gamma': 6, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 6, 
#  'learning_rate': 0.1, 'colsample_bytree': 0.7, 'missing': -999, 'reg_alpha': 8, 'max_depth': 9, 
#  'scale_pos_weight': 26.4367, 'reg_lambda': 3.0}
# putting gamma back, shifting scale_pos_weightin parameters, using new gini scorer

# using f =10 and noise=0 in the encoding functions
# 1.0
# {'seed': 1337, 'gamma': 2, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 5, 
#  'max_depth': 4, 'scale_pos_weight': 26.4367, 'reg_lambda': 0}

# using f =10 and noise=0 in the encoding functions, dropped LOO
# 0.203635367561805
# {'seed': 1337, 'gamma': 5, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'missing': -999, 
#  'reg_alpha': 7, 'max_depth': 4, 'scale_pos_weight': 26.4367, 'reg_lambda': 2.0}

# using f = 20, noise=0.0, no LOO
# 0.1982897319930031
# {'seed': 1337, 'gamma': 1, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 1, 'learning_rate': 0.06, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 6, 
#  'max_depth': 6, 'scale_pos_weight': 26.4367, 'reg_lambda': 1.3}

# f = 10, noise=0.01 no LOO
# 0.1921171790111911
# {'seed': 1337, 'gamma': 2, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.7, 'missing': -999, 
#  'reg_alpha': 9, 'max_depth': 6, 'scale_pos_weight': 26.4367, 'reg_lambda': 0}

# f = 1, noise=0.01, no LOO
#0.19488331448594218
# {'seed': 1337, 'gamma': 5, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 3, 'learning_rate': 0.06, 'colsample_bytree': 0.7, 'missing': -999, 'reg_alpha': 4, 'max_depth': 4, 'scale_pos_weight': 26.4367, 'reg_lambda': 0}
# In [45]:

# LOO only, noise=0
# 1.0
# {'seed': 1337, 'gamma': 0, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 8, 'learning_rate': 0.08, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 7, 
#  'max_depth': 8, 'scale_pos_weight': 26.4367, 'reg_lambda': 0}

# LOO only , noise=0.01, last gini
# 0.2572973729916049
# {'seed': 1337, 'gamma': 8, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'missing': -999, 'reg_alpha': 0, 
#  'max_depth': 6, 'scale_pos_weight': 26.4367, 'reg_lambda': 1.5}
# scored 0.264 on pubic LB

# LOO only, noise=0.01, with eval_gini
# 0.2572973729916049
# {'seed': 1337, 'gamma': 8, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'missing': -999, 
#  'reg_alpha': 0, 'max_depth': 6, 'scale_pos_weight': 26.4367, 'reg_lambda': 1.5}

# LOO only, noise=0.02, with eval_gini
# 0.212160199299836
# {'seed': 1337, 'gamma': 11, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 5, 'learning_rate': 0.07, 'colsample_bytree': 0.7, 'missing': -999, 'reg_alpha': 2, 
#  'max_depth': 5, 'scale_pos_weight': 26.4367, 'reg_lambda': 3.0}

# LOOKFold, noise=0.01, eval gini
# 0.15067528137467348
# {'seed': 1337, 'gamma': 14, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 8, 'learning_rate': 0.04, 'colsample_bytree': 0.8, 'missing': -999, 
#  'reg_alpha': 2, 'max_depth': 4, 'scale_pos_weight': 26.4367, 'reg_lambda': 1}

# Backward Diff encoder
# 1.0
# {'seed': 1337, 'gamma': 10, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 9, 
#  'learning_rate': 0.05, 'colsample_bytree': 0.7, 'missing': -999, 'reg_alpha': 7, 'max_depth': 9, 'scale_pos_weight': 26.4367, 'reg_lambda': 2.0}
# In [42]:

# Hashing Encoder
# 0.18163349897449646
# {'seed': 1337, 'gamma': 1, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.7, 'missing': -999, 'reg_alpha': 7, 
#  'max_depth': 4, 'scale_pos_weight': 26.4367, 'reg_lambda': 1.5}

# helmert
# 0.1998236311643095
# {'seed': 1337, 'gamma': 4, 'silent': 1, 'subsample': 0.8, 'min_child_weight': 8, 'learning_rate': 0.07, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 6, 
#  'max_depth': 5, 'scale_pos_weight': 26.4367, 'reg_lambda': 0}

# OneHot
# 0.1974928419377269
# {'seed': 1337, 'gamma': 10, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 1, 'learning_rate': 0.06, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 7, 
#  'max_depth': 6, 'scale_pos_weight': 26.4367, 'reg_lambda': 1.5}

#ordinal
# 1.0
#stopped

# Polynomial encoder
# 0.19832716986326504
# {'seed': 1337, 'gamma': 7, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 5, 'learning_rate': 0.06, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 1,
#  'max_depth': 4, 'scale_pos_weight': 26.4367, 'reg_lambda': 1}

# Sum encoder
# 0.1981555069566366
# {'seed': 1337, 'gamma': 7, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 1, 'learning_rate': 0.05, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 9, 
#  'max_depth': 7, 'scale_pos_weight': 26.4367, 'reg_lambda': 1.3}

# kaggle taget encoding
#0.1972640463910713
# {'seed': 1337, 'gamma': 13, 'silent': 1, 'subsample': 0.7, 'min_child_weight': 9, 
#  'learning_rate': 0.07, 'colsample_bytree': 0.8, 'missing': -999, 'reg_alpha': 0, 'max_depth': 5, 
#  'scale_pos_weight': 26.4367, 'reg_lambda': 1.5}

# kaggle target encoding, n_estimators = 400
# 0.20004995842026038
# {'seed': 1337, 'gamma': 10, 'silent': 1, 'n_estimators': 400, 'min_child_weight': 6, 'learning_rate': 0.08, 'scale_pos_weight': 26.4367, 
#  'missing': -999, 'reg_alpha': 8, 'max_depth': 4, 'colsample_bytree': 0.8, 'reg_lambda': 1.3}

# kaggle target encoding, n_estimators = 400, scale_pos_weight when declaring model
# 0.005075658237357413
# {'reg_lambda': 1.3, 'reg_alpha': 8, 'seed': 1337, 'gamma': 10, 'max_depth': 6, 'silent': 1, 'n_estimators': 400, 'min_child_weight': 6, 
#  'learning_rate': 0.06, 'colsample_bytree': 0.8, 'missing': -999}

# finally some sanity, after moved scale_pos_weight to the model line, removed sample_weight
# 0.2561991118839686
# {'reg_lambda': 1.3, 'reg_alpha': 0, 'seed': 1337, 'gamma': 1, 'max_depth': 4, 'subsample': 0.8, 'silent': 1, 
#  'n_estimators': 200, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

# same as above but removed min_child_weight
# 0.2566225813070569
# {'reg_lambda': 1, 'reg_alpha': 0, 'seed': 1337, 'gamma': 1, 'max_depth': 4, 
#  'subsample': 0.8, 'silent': 1, 'n_estimators': 200, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

# same as bove but replaced missing=-999 with missing=-1
# 0.25776746983546517
# {'reg_lambda': 1.3, 'reg_alpha': 0, 'seed': 1337, 'gamma': 1, 'max_depth': 4, 'subsample': 0.8, 'silent': 1, '
#  n_estimators': 200, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

# with separate classfier and using train_test_split,
# validation_0-error:0.324294	validation_1-error:0.326174
# validation_0-gini:-0.433448	validation_1-gini:-0.384038

# with randomized CV, LOO
# 0.2862247407834804
# {'reg_lambda': 1.3, 'reg_alpha': 4, 'seed': 1337, 'gamma': 7, 
# 'max_depth': 5, 'subsample': 0.8, 'n_estimators': 200, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

0.2862247407834804
{'reg_lambda': 1.3, 'reg_alpha': 4, 'seed': 1337, 'gamma': 7, 'max_depth': 5, 'subsample': 0.8, 'n_estimators': 200, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [42]:
df_test.fillna(-1, inplace=True)

In [43]:
test_probs = clf.predict_proba(df_test)[:,1]

In [44]:
test_probs

array([ 0.37710699,  0.42687458,  0.43030238, ...,  0.45527622,
        0.42130691,  0.452133  ], dtype=float32)

In [45]:
test_probs.shape

(892816,)

In [46]:
np.min(test_probs)

0.054419678

In [47]:
np.max(test_probs)

0.98862666

In [48]:
test_probs = np.clip(test_probs, a_min=0.05, a_max=0.95)

In [49]:
submission = pd.DataFrame()
submission['id'] = id_test
submission['target'] = test_probs

submission.to_csv('../submissions/sub_xgb_2_0.csv', index=False)
