In [1]:
!which python

/opt/conda/bin/python


In [2]:
# !python

In [3]:
# from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

"""
This simple scripts demonstrates the use of xgboost eval results to get the best round
for the current fold and accross folds. 
It also shows an upsampling method that limits cross-validation overfitting.
"""

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import gc
from numba import jit
from sklearn.preprocessing import LabelEncoder
import time 
from datetime import datetime
from tqdm import tqdm

In [5]:
@jit
def eval_gini(y_true, y_prob):
    """
    Original author CPMP : https://www.kaggle.com/cpmpml
    In kernel : https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))



In [6]:
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [7]:
gc.enable()

trn_df = pd.read_csv("../data/train.csv", index_col=0)
sub_df = pd.read_csv("../data/test.csv", index_col=0)

target = trn_df["target"]
del trn_df["target"]

In [8]:
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
    "ps_car_11_cat" # Very nice spot from Tilii : https://www.kaggle.com/tilii7
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [9]:
f_cats = [f for f in train_features if "_cat" in f]

In [10]:
f_cars = [f for f in train_features if "_car" in f and '_cat' not in f]

In [11]:
f_cars

['ps_car_13', 'ps_car_14', 'ps_car_12', 'ps_car_15', 'ps_car_11']

In [12]:
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
    sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
    trn_df[name1] = lbl.transform(list(trn_df[name1].values))
    sub_df[name1] = lbl.transform(list(sub_df[name1].values))

    train_features.append(name1)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [13]:
f_cats

['ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_07_cat',
 'ps_car_03_cat',
 'ps_car_06_cat',
 'ps_car_04_cat',
 'ps_car_09_cat',
 'ps_car_02_cat',
 'ps_ind_02_cat',
 'ps_car_05_cat',
 'ps_car_08_cat',
 'ps_ind_04_cat',
 'ps_car_11_cat']

In [12]:
col_calc_mean = ['ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15']

In [14]:
for f in f_cats:
    for col in f_cars:

        new_col1 = '{}_{}_mean'.format(col, f) 
        new_col2 = '{}_{}_median'.format(col, f) 
        new_col3 = '{}_{}_skew'.format(col, f) 
        new_col4 = '{}_{}_kurtosis'.format(col, f) 
        trn_df[new_col1] = 0
        trn_df[new_col2] = 0
        trn_df[new_col3] = 0
        trn_df[new_col4] = 0
        
        sub_df[new_col1] = 0
        sub_df[new_col2] = 0
        sub_df[new_col3] = 0
        sub_df[new_col4] = 0

In [17]:
-

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 5/5 [01:27<00:00, 18.26s/it]


In [21]:
trn_df = trn_df[train_features]
sub_df = sub_df[train_features]


In [22]:
pd.set_option('display.max_columns', None)

In [23]:
trn_df.head()

Unnamed: 0_level_0,ps_car_13,ps_reg_03,ps_ind_05_cat,ps_ind_03,ps_ind_15,ps_reg_02,ps_car_14,ps_car_12,ps_car_01_cat,ps_car_07_cat,ps_ind_17_bin,ps_car_03_cat,ps_reg_01,ps_car_15,ps_ind_01,ps_ind_16_bin,ps_ind_07_bin,ps_car_06_cat,ps_car_04_cat,ps_ind_06_bin,ps_car_09_cat,ps_car_02_cat,ps_ind_02_cat,ps_car_11,ps_car_05_cat,ps_calc_09,ps_calc_05,ps_ind_08_bin,ps_car_08_cat,ps_ind_09_bin,ps_ind_04_cat,ps_ind_18_bin,ps_ind_12_bin,ps_ind_14,ps_car_11_cat,ps_reg_01_plus_ps_car_02_cat,ps_reg_01_plus_ps_car_04_cat,ps_car_13_ps_ind_05_cat_mean,ps_car_13_ps_ind_05_cat_median,ps_car_13_ps_car_01_cat_mean,ps_car_13_ps_car_01_cat_median,ps_car_13_ps_car_07_cat_mean,ps_car_13_ps_car_07_cat_median,ps_car_13_ps_car_03_cat_mean,ps_car_13_ps_car_03_cat_median,ps_car_13_ps_car_06_cat_mean,ps_car_13_ps_car_06_cat_median,ps_car_13_ps_car_04_cat_mean,ps_car_13_ps_car_04_cat_median,ps_car_13_ps_car_09_cat_mean,ps_car_13_ps_car_09_cat_median,ps_car_13_ps_car_02_cat_mean,ps_car_13_ps_car_02_cat_median,ps_car_13_ps_ind_02_cat_mean,ps_car_13_ps_ind_02_cat_median,ps_car_13_ps_car_05_cat_mean,ps_car_13_ps_car_05_cat_median,ps_car_13_ps_car_08_cat_mean,ps_car_13_ps_car_08_cat_median,ps_car_13_ps_ind_04_cat_mean,ps_car_13_ps_ind_04_cat_median,ps_car_13_ps_car_11_cat_mean,ps_car_13_ps_car_11_cat_median,ps_car_14_ps_ind_05_cat_mean,ps_car_14_ps_ind_05_cat_median,ps_car_14_ps_car_01_cat_mean,ps_car_14_ps_car_01_cat_median,ps_car_14_ps_car_07_cat_mean,ps_car_14_ps_car_07_cat_median,ps_car_14_ps_car_03_cat_mean,ps_car_14_ps_car_03_cat_median,ps_car_14_ps_car_06_cat_mean,ps_car_14_ps_car_06_cat_median,ps_car_14_ps_car_04_cat_mean,ps_car_14_ps_car_04_cat_median,ps_car_14_ps_car_09_cat_mean,ps_car_14_ps_car_09_cat_median,ps_car_14_ps_car_02_cat_mean,ps_car_14_ps_car_02_cat_median,ps_car_14_ps_ind_02_cat_mean,ps_car_14_ps_ind_02_cat_median,ps_car_14_ps_car_05_cat_mean,ps_car_14_ps_car_05_cat_median,ps_car_14_ps_car_08_cat_mean,ps_car_14_ps_car_08_cat_median,ps_car_14_ps_ind_04_cat_mean,ps_car_14_ps_ind_04_cat_median,ps_car_14_ps_car_11_cat_mean,ps_car_14_ps_car_11_cat_median,ps_car_12_ps_ind_05_cat_mean,ps_car_12_ps_ind_05_cat_median,ps_car_12_ps_car_01_cat_mean,ps_car_12_ps_car_01_cat_median,ps_car_12_ps_car_07_cat_mean,ps_car_12_ps_car_07_cat_median,ps_car_12_ps_car_03_cat_mean,ps_car_12_ps_car_03_cat_median,ps_car_12_ps_car_06_cat_mean,ps_car_12_ps_car_06_cat_median,ps_car_12_ps_car_04_cat_mean,ps_car_12_ps_car_04_cat_median,ps_car_12_ps_car_09_cat_mean,ps_car_12_ps_car_09_cat_median,ps_car_12_ps_car_02_cat_mean,ps_car_12_ps_car_02_cat_median,ps_car_12_ps_ind_02_cat_mean,ps_car_12_ps_ind_02_cat_median,ps_car_12_ps_car_05_cat_mean,ps_car_12_ps_car_05_cat_median,ps_car_12_ps_car_08_cat_mean,ps_car_12_ps_car_08_cat_median,ps_car_12_ps_ind_04_cat_mean,ps_car_12_ps_ind_04_cat_median,ps_car_12_ps_car_11_cat_mean,ps_car_12_ps_car_11_cat_median,ps_car_15_ps_ind_05_cat_mean,ps_car_15_ps_ind_05_cat_median,ps_car_15_ps_car_01_cat_mean,ps_car_15_ps_car_01_cat_median,ps_car_15_ps_car_07_cat_mean,ps_car_15_ps_car_07_cat_median,ps_car_15_ps_car_03_cat_mean,ps_car_15_ps_car_03_cat_median,ps_car_15_ps_car_06_cat_mean,ps_car_15_ps_car_06_cat_median,ps_car_15_ps_car_04_cat_mean,ps_car_15_ps_car_04_cat_median,ps_car_15_ps_car_09_cat_mean,ps_car_15_ps_car_09_cat_median,ps_car_15_ps_car_02_cat_mean,ps_car_15_ps_car_02_cat_median,ps_car_15_ps_ind_02_cat_mean,ps_car_15_ps_ind_02_cat_median,ps_car_15_ps_car_05_cat_mean,ps_car_15_ps_car_05_cat_median,ps_car_15_ps_car_08_cat_mean,ps_car_15_ps_car_08_cat_median,ps_car_15_ps_ind_04_cat_mean,ps_car_15_ps_ind_04_cat_median,ps_car_15_ps_car_11_cat_mean,ps_car_15_ps_car_11_cat_median,ps_car_11_ps_ind_05_cat_mean,ps_car_11_ps_ind_05_cat_median,ps_car_11_ps_car_01_cat_mean,ps_car_11_ps_car_01_cat_median,ps_car_11_ps_car_07_cat_mean,ps_car_11_ps_car_07_cat_median,ps_car_11_ps_car_03_cat_mean,ps_car_11_ps_car_03_cat_median,ps_car_11_ps_car_06_cat_mean,ps_car_11_ps_car_06_cat_median,ps_car_11_ps_car_04_cat_mean,ps_car_11_ps_car_04_cat_median,ps_car_11_ps_car_09_cat_mean,ps_car_11_ps_car_09_cat_median,ps_car_11_ps_car_02_cat_mean,ps_car_11_ps_car_02_cat_median,ps_car_11_ps_ind_02_cat_mean,ps_car_11_ps_ind_02_cat_median,ps_car_11_ps_car_05_cat_mean,ps_car_11_ps_car_05_cat_median,ps_car_11_ps_car_08_cat_mean,ps_car_11_ps_car_08_cat_median,ps_car_11_ps_ind_04_cat_mean,ps_car_11_ps_ind_04_cat_median,ps_car_11_ps_car_11_cat_mean,ps_car_11_ps_car_11_cat_median
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1
7,0.883679,0.71807,0,5,11,0.2,0.37081,0.4,10,1,1,-1,0.7,3.605551,2,0,1,4,0,0,0,1,2,2,1,1,1,0,0,0,1,0,0,0,12,19,70,0.814889,0,0.865479,0,0.803957,0,0.0,0,0.773211,0,0.761576,0,0.814952,0,0.764163,0,0.773175,0,0.832608,0,0.998247,0,0.804762,0,0.825397,0,0.27753,0,0.310299,0,0.277064,0,0.0,0,0.343222,0,0.270192,0,0.30914,0,0.269678,0,0.266021,0,0.253556,0,0.149042,0,0.273869,0,0.359531,0,0.380469,0,0.391929,0,0.378428,0,0.0,0,0.367296,0,0.367082,0,0.380256,0,0.367553,0,0.365796,0,0.390513,0,0.384054,0,0.373392,0,0.4,0,3.067873,0,3.135044,0,3.060768,0,0.0,0,3.182322,0,3.024636,0,3.064638,0,3.040633,0,3.06971,0,2.930494,0,3.67522,0,3.101007,0,3.414482,0,2.342579,0,2.19562,0,2.366016,0,0.0,0,2.5044,0,2.44538,0,2.179474,0,2.312095,0,2.39049,0,2.271372,0,2.417467,0,2.418377,0,2.85454,0
9,0.618817,0.766078,0,7,3,0.4,0.388716,0.316228,11,1,0,-1,0.8,2.44949,1,0,0,11,0,0,2,1,1,3,-1,1,1,1,1,0,0,1,0,0,19,21,80,0.814889,0,0.858722,0,0.803957,0,0.0,0,0.756987,0,0.761576,0,0.807165,0,0.764163,0,0.826747,0,0.0,0,0.775934,0,0.819302,0,0.55472,0,0.27753,0,0.263036,0,0.277064,0,0.0,0,0.308645,0,0.270192,0,0.263259,0,0.269678,0,0.279591,0,0.0,0,0.301929,0,0.27798,0,0.212453,0,0.380469,0,0.388844,0,0.378428,0,0.0,0,0.374233,0,0.367082,0,0.377333,0,0.367553,0,0.384766,0,0.0,0,0.379116,0,0.384622,0,0.31595,0,3.067873,0,3.155021,0,3.060768,0,0.0,0,2.962629,0,3.024636,0,3.090635,0,3.040633,0,3.067131,0,0.0,0,2.942934,0,3.040733,0,1.618905,0,2.342579,0,2.478882,0,2.366016,0,0.0,0,2.185148,0,2.44538,0,2.437507,0,2.312095,0,2.332402,0,0.0,0,2.331664,0,2.294416,0,2.502452,0
13,0.641586,-1.0,0,9,12,0.0,0.347275,0.316228,7,1,0,-1,0.0,3.316625,5,1,0,14,0,0,2,1,4,1,-1,2,2,1,1,0,1,0,0,0,60,1,0,0.814889,0,0.756952,0,0.803957,0,0.0,0,0.785353,0,0.761576,0,0.807165,0,0.764163,0,0.777829,0,0.0,0,0.775934,0,0.804762,0,0.645717,0,0.27753,0,0.26568,0,0.277064,0,0.0,0,0.250851,0,0.270192,0,0.263259,0,0.269678,0,0.265928,0,0.0,0,0.301929,0,0.273869,0,0.336433,0,0.380469,0,0.36729,0,0.378428,0,0.0,0,0.382101,0,0.367082,0,0.377333,0,0.367553,0,0.369465,0,0.0,0,0.379116,0,0.373392,0,0.316228,0,3.067873,0,2.994601,0,3.060768,0,0.0,0,3.001509,0,3.024636,0,3.090635,0,3.040633,0,2.994064,0,0.0,0,2.942934,0,3.101007,0,3.277474,0,2.342579,0,2.39758,0,2.366016,0,0.0,0,2.31649,0,2.44538,0,2.437507,0,2.312095,0,2.352522,0,0.0,0,2.331664,0,2.418377,0,1.0,0
16,0.542949,0.580948,0,2,8,0.2,0.294958,0.374166,7,1,0,0,0.9,2.0,0,1,0,11,0,1,3,1,1,1,1,4,4,0,1,0,0,0,0,0,104,24,90,0.814889,0,0.756952,0,0.803957,0,0.792103,0,0.756987,0,0.761576,0,0.632474,0,0.764163,0,0.826747,0,0.832608,0,0.775934,0,0.819302,0,0.977018,0,0.27753,0,0.26568,0,0.277064,0,0.173881,0,0.308645,0,0.270192,0,0.197483,0,0.269678,0,0.279591,0,0.253556,0,0.301929,0,0.27798,0,0.249136,0,0.380469,0,0.36729,0,0.378428,0,0.388782,0,0.374233,0,0.367082,0,0.367884,0,0.367553,0,0.384766,0,0.390513,0,0.379116,0,0.384622,0,0.429322,0,3.067873,0,2.994601,0,3.060768,0,2.78537,0,2.962629,0,3.024636,0,2.09738,0,3.040633,0,3.067131,0,2.930494,0,2.942934,0,3.040733,0,2.958673,0,2.342579,0,2.39758,0,2.366016,0,2.365037,0,2.185148,0,2.44538,0,2.321496,0,2.312095,0,2.332402,0,2.271372,0,2.331664,0,2.294416,0,2.250144,0
17,0.565832,0.840759,0,0,9,0.6,0.365103,0.31607,11,1,0,-1,0.7,2.0,0,1,0,14,0,1,2,1,2,3,-1,2,2,0,1,0,1,0,0,0,82,19,70,0.814889,0,0.858722,0,0.803957,0,0.0,0,0.785353,0,0.761576,0,0.807165,0,0.764163,0,0.773175,0,0.0,0,0.775934,0,0.804762,0,0.650229,0,0.27753,0,0.263036,0,0.277064,0,0.0,0,0.250851,0,0.270192,0,0.263259,0,0.269678,0,0.266021,0,0.0,0,0.301929,0,0.273869,0,0.229346,0,0.380469,0,0.388844,0,0.378428,0,0.0,0,0.382101,0,0.367082,0,0.377333,0,0.367553,0,0.365796,0,0.0,0,0.379116,0,0.373392,0,0.316183,0,3.067873,0,3.155021,0,3.060768,0,0.0,0,3.001509,0,3.024636,0,3.090635,0,3.040633,0,3.06971,0,0.0,0,2.942934,0,3.101007,0,2.834002,0,2.342579,0,2.478882,0,2.366016,0,0.0,0,2.31649,0,2.44538,0,2.437507,0,2.312095,0,2.39049,0,0.0,0,2.331664,0,2.418377,0,2.941452,0


In [24]:
f_cats = [f for f in trn_df.columns if "_cat" in f]

In [25]:
for f in f_cats:
    trn_df[f + "_avg"], sub_df[f + "_avg"] = target_encode(trn_series=trn_df[f],
                                         tst_series=sub_df[f],
                                         target=target,
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)

In [26]:
n_splits = 5
n_estimators = 200
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
xgb_evals = np.zeros((n_estimators, n_splits))
oof = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
increase = True
np.random.seed(0)


In [27]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], target.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], target.iloc[val_idx]

    print(sum(trn_tgt==0)/sum(trn_tgt==1))
    
    sp = sum(trn_tgt==0)/sum(trn_tgt==1)
    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        scale_pos_weight=sp,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        nthread=6)
    # Upsample during cross validation to avoid having the same samples
    # in both train and validation sets
    # Validation set is not up-sampled to monitor overfitting
    if increase:
        # Get positive examples
        pos = pd.Series(trn_tgt == 1)
        # Add positive examples
        trn_dat = pd.concat([trn_dat, trn_dat.loc[pos]], axis=0)
        trn_tgt = pd.concat([trn_tgt, trn_tgt.loc[pos]], axis=0)
        # Shuffle data
        idx = np.arange(len(trn_dat))
        np.random.shuffle(idx)
        trn_dat = trn_dat.iloc[idx]
        trn_tgt = trn_tgt.iloc[idx]
        
    clf.fit(trn_dat, trn_tgt, 
            eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)
            
    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]
    print(best_round)
    
    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_dat, ntree_limit=int(best_round))[:, 1]
    # Update submission
    sub_preds += clf.predict_proba(sub_df, ntree_limit=int(best_round))[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tgt, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))
          


26.4369922213
91
Fold  1 : 0.271929 @ 200 / best score is 0.272182 @  91
26.4369922213
111
Fold  2 : 0.278302 @ 200 / best score is 0.278628 @ 111
26.4369922213
81
Fold  3 : 0.301982 @ 200 / best score is 0.302092 @  81
26.4370498415
101
Fold  4 : 0.272883 @ 200 / best score is 0.273043 @ 101
26.435526619
67
Fold  5 : 0.271686 @ 200 / best score is 0.271827 @  67


In [28]:
print("Full OOF score : %.6f" % eval_gini(target, oof))
# org with clipping, Full OOF score : 0.284952, LB: 0.275
# org no clipping, Full OOF score : 0.284952, LB: 0.275
# org, removed ntree_limit, kaggle/python: Full OOF score : 0.283630, LB: 0.274
# org, with ntree_limit, kaggle/python: Full OOF score : 0.284745, LB: 0.282
# org + counts for 4, Full OOF score : 0.279249
# org + counts for all cars, only mean: 0.279233

Full OOF score : 0.279233


In [None]:
# Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))
    


In [None]:
importances = sorted([(trn_df.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for f, imp in importances[::-1]:
    print("%-34s : %10.4f" % (f, imp))

In [None]:
sub_df["target"] = sub_preds
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.xgb.{}GMT'.format(now)
sub_df[["target"]].to_csv(fn, index=True, float_format="%.9f")

In [None]:
print(now)

In [None]:
sub_df.tail()