In [1]:
!which python

/opt/conda/bin/python


In [2]:
# !python

In [3]:
# from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

"""
This simple scripts demonstrates the use of xgboost eval results to get the best round
for the current fold and accross folds. 
It also shows an upsampling method that limits cross-validation overfitting.
"""

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import gc
from numba import jit
from sklearn.preprocessing import LabelEncoder
import time 
from datetime import datetime
from tqdm import tqdm
import patsy

In [5]:
@jit
def eval_gini(y_true, y_prob):
    """
    Original author CPMP : https://www.kaggle.com/cpmpml
    In kernel : https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))



In [6]:
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [7]:
gc.enable()

trn_df = pd.read_csv("../data/train.csv", index_col=0)
sub_df = pd.read_csv("../data/test.csv", index_col=0)

# trn_df = pd.read_csv("../data/train.csv")
# sub_df = pd.read_csv("../data/test.csv")

target = trn_df["target"]
del trn_df["target"]

In [8]:
trn_df.shape

(595212, 57)

In [9]:
trn_df.head()

Unnamed: 0_level_0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,2,2,5,1,0,0,1,0,0,0,...,9,1,5,8,0,1,1,0,0,1
9,1,1,7,0,0,0,0,1,0,0,...,3,1,1,9,0,1,1,0,1,0
13,5,4,9,1,0,0,0,1,0,0,...,4,2,7,7,0,1,1,0,1,0
16,0,1,2,0,0,1,0,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,2,0,1,0,1,0,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [10]:
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
    "ps_car_11_cat" # Very nice spot from Tilii : https://www.kaggle.com/tilii7
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [11]:
# ps_car_12 = cc,
# ps_car_13 = vehicle value,
# ps_car_14 = vehicle weight Kg
# ps_car_15 = manufacture year
# ps_car_06_cat = Car Makers

# ps_car_11_cat = Individual Car Models

In [12]:
# ps_reg_03                          :     0.1284
# ps_car_13                          :     0.1179
# ps_car_14                          :     0.0718
# ps_ind_03                          :     0.0594
# ps_ind_15                          :     0.0441
# ps_ind_01                          :     0.0345
# ps_car_11_cat_avg                  :     0.0340
# ps_reg_02                          :     0.0300
# ps_reg_01_plus_ps_car_04_cat_avg   :     0.0281
# ps_car_15                          :     0.0270
# ps_car_11_cat                      :     0.0265

In [13]:
def transform_df(df):
    d_median = df.median(axis=0)
    d_mean = df.mean(axis=0)
    dcol = [c for c in df.columns if c not in ['id','target']]
    df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
#     df['ps_car_13_d_ps_reg_03'] = df['ps_car_13'] / df['ps_reg_03']
#     df['ps_car_13_od_ps_reg_03'] = df['ps_reg_03'] / df['ps_car_13']
#     df['ps_car_13_plus_ps_reg_03'] = df['ps_car_13'] + df['ps_reg_03']
#     df['ps_car_13_ps_sub_reg_03'] = df['ps_car_13'] - df['ps_reg_03']
#     df['negative_one_vals'] = np.sum((df[dcol]==-1).values, axis=1)
    for c in dcol:
        if '_bin' not in c: #standard arithmetic
            df[c+str('_median_range')] = (df[c].values > d_median[c]).astype(np.int)
            df[c+str('_mean_range')] = (df[c].values > d_mean[c]).astype(np.int)
            #df[c+str('_sq')] = np.power(df[c].values,2).astype(np.float32)
            #df[c+str('_sqr')] = np.square(df[c].values).astype(np.float32)
            #df[c+str('_log')] = np.log(np.abs(df[c].values) + 1)
            #df[c+str('_exp')] = np.exp(df[c].values) - 1
        train_features.append(c+str('_median_range'))
        train_features.append(c+str('_mean_range'))
    train_features.append('ps_car_13_x_ps_reg_03')
    return df

In [14]:
trn_df=transform_df(trn_df)
sub_df=transform_df(sub_df)

# train_features.append('ps_car_13_x_ps_reg_03')
# train_features.append('ps_car_13_d_ps_reg_03')
# train_features.append('ps_car_13_od_ps_reg_03')
# train_features.append('ps_car_13_plus_ps_reg_03')
# train_features.append('ps_car_13_ps_sub_reg_03')
# train_features.append('negative_one_vals')

In [15]:
# f_ind_reg = [
#     'ps_car_13',
#     'ps_reg_03',
#     'ps_ind_03',
#     'ps_car_14'
# ]

f_ind_reg = [
    'ps_car_13',
    'ps_reg_03'
]

# transformations = ['sq', 'sqrt', 'exp', 'div_sqrt', 'cbrt', 'pow_3', 'pow_5', 'sin', 'log']
transformations = ['sin']

In [16]:
def trans(t, x):
    if (x == -1) | (float(x) == -1.0):
        return -1
    if float(x) == 0.0:
        x = float(x) + 0.001 # increment x by delta
    if t == 'sq':
        return x * x
    elif t == 'sqrt':
        return math.sqrt(x)
    elif t == 'exp':
        return math.exp(x)
    elif t == 'div_sqrt':
        return 1./math.sqrt(x)
    elif t == 'cbrt':
        return x ** (1./3)
    elif t == 'pow_3':
        return 3 ** x
    elif t == 'pow_5':
        return 5 ** x
    elif t == 'pow_5':
        return 5 ** x
    elif t == 'sin':
        return math.sin(x)
    elif t == 'log':
        return math.log(x)

In [17]:
import math

In [18]:
# # transforms
# for i in tqdm(range(len(f_ind_reg))):
#     col = f_ind_reg[i]
#     for t in transformations:
#         new_col = col + '_t_' + str(t)
# #         print(new_col)
#         trn_df[new_col] = trn_df[col].map(lambda x: trans(t,x))
#         sub_df[new_col] = sub_df[col].map(lambda x: trans(t,x))
        
#         train_features.append(new_col)

In [19]:
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
    sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
    trn_df[name1] = lbl.transform(list(trn_df[name1].values))
    sub_df[name1] = lbl.transform(list(sub_df[name1].values))

    train_features.append(name1)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

In [20]:
# f_calc_counts = ['ps_reg_03','ps_car_13']
# f_calc_counts = ['ps_car_12','ps_car_13','ps_car_14','ps_car_15', 'ps_reg_03']
f_calc_counts = ['ps_car_14','ps_car_13','ps_reg_03', 'ps_ind_03']
# f_calc_counts = ['ps_car_13']
# f_calc_counts = [
# 'ps_calc_10',    #       :  309.57 / shadow  296.18
# 'ps_calc_01',    #       :  205.10 / shadow  189.43
# 'ps_calc_02',    #       :  201.08 / shadow  192.53
# 'ps_calc_03',    #       :  190.70 / shadow  188.47
# 'ps_calc_13',    #       :  188.75 / shadow  181.13
# 'ps_calc_08',    #       :  172.73 / shadow  169.42
# 'ps_calc_07',    #       :  170.48 / shadow  162.17
# 'ps_calc_12',    #       :  135.05 / shadow  133.40
# 'ps_calc_04'
# ]
# f_calc_cats = ['ps_ind_05_cat']
f_calc_cats = ['ps_car_01_cat', 'ps_reg_01_plus_ps_car_04_cat']
# f_calc_cats = ['ps_car_11_cat', 'ps_car_06_cat', 'ps_reg_01_plus_ps_car_04_cat']
# f_calc_cats = ['ps_car_01_cat', 'ps_ind_05_cat']

# ps_reg_03                          :     0.1117
# ps_car_13                          :     0.1115
# ps_car_14                          :     0.0619
# ps_ind_03                          :     0.0569
# ps_ind_15                          :     0.0423
# ps_ind_01                          :     0.0334
# ps_car_11_cat_avg                  :     0.0313
# ps_reg_02                          :     0.0300
# ps_reg_01_plus_ps_car_04_cat_avg   :     0.0291

In [21]:
for col in f_calc_counts:
    for f in f_calc_cats:

        new_col1 = '{}_{}_mean'.format(col, f) 
        new_col2 = '{}_{}_median'.format(col, f) 
        new_col3 = '{}_{}_skew'.format(col, f) 
        new_col4 = '{}_{}_kurtosis'.format(col, f) 
        trn_df[new_col1] = 0
        trn_df[new_col2] = 0
        trn_df[new_col3] = 0
        trn_df[new_col4] = 0
        
        sub_df[new_col1] = 0
        sub_df[new_col2] = 0
        sub_df[new_col3] = 0
        sub_df[new_col4] = 0

In [22]:
for col in tqdm(f_calc_counts):
    for f in f_calc_cats:
        new_col1 = '{}_{}_mean'.format(col, f) 
        new_col2 = '{}_{}_median'.format(col, f) 
        new_col3 = '{}_{}_skew'.format(col, f) 
        new_col4 = '{}_{}_kurtosis'.format(col, f) 
        unique_f = np.unique(trn_df[f].values)
        for val in unique_f:
            if val == -1:
                continue
            data1 = trn_df[col][trn_df[f] == val]
            mean1 = data1.mean()
            median1 = data1.median()
            skew1 = data1.skew()
            kurtosis1 = data1.kurtosis()
            trn_df[new_col1][trn_df[f] == val] = mean1
            trn_df[new_col2][trn_df[f] == val] = median1
            trn_df[new_col3][trn_df[f] == val] = skew1
            trn_df[new_col4][trn_df[f] == val] = kurtosis1
            
            data1 = sub_df[col][sub_df[f] == val]
            mean1 = data1.mean()
            median1 = data1.median()
            skew1 = data1.skew()
            kurtosis1 = data1.kurtosis()
            sub_df[new_col1][sub_df[f] == val] = mean1
            sub_df[new_col2][sub_df[f] == val] = median1
            sub_df[new_col3][sub_df[f] == val] = skew1
            sub_df[new_col4][sub_df[f] == val] = kurtosis1
            
            data2 = trn_df[col][trn_df[f] == val]
            mean2 = data2.mean()
            median2 = data2.median()
            skew2 = data2.skew()
            kurtosis2 = data1.kurtosis()
            trn_df[new_col1][trn_df[f] == val] = mean2
            trn_df[new_col2][trn_df[f] == val] = median2
            trn_df[new_col3][trn_df[f] == val] = skew2
            trn_df[new_col4][trn_df[f] == val] = kurtosis2
            
            data2 = sub_df[col][sub_df[f] == val]
            mean2 = data2.mean()
            median2 = data2.median()
            skew2 = data2.skew()
            kurtosis2 = data1.kurtosis()
            sub_df[new_col1][sub_df[f] == val] = mean2
            sub_df[new_col2][sub_df[f] == val] = median2
            sub_df[new_col3][sub_df[f] == val] = skew2
            sub_df[new_col4][sub_df[f] == val] = kurtosis2
        
        train_features.append(new_col1)
        train_features.append(new_col2)
        train_features.append(new_col3)
        train_features.append(new_col4)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [23]:
#Suspicious features (11)
f_calc_cols = [
'ps_calc_10',    #       :  309.57 / shadow  296.18
'ps_calc_01',    #       :  205.10 / shadow  189.43
'ps_calc_02',    #       :  201.08 / shadow  192.53
'ps_calc_03',    #       :  190.70 / shadow  188.47
'ps_calc_13',    #       :  188.75 / shadow  181.13
'ps_calc_08',    #       :  172.73 / shadow  169.42
'ps_calc_07',    #       :  170.48 / shadow  162.17
'ps_calc_12',    #       :  135.05 / shadow  133.40
'ps_calc_04'
]
#       :  130.43 / shadow  126.17
# ps_calc_17_bin #      :   40.23 / shadow   37.10
# ps_car_10_cat   

In [24]:
# f_calc_counts = ['ps_reg_03','ps_car_13']

In [25]:
trn_df = trn_df[train_features]
sub_df = sub_df[train_features]


KeyError: "['ps_ind_06_bin_median_range' 'ps_ind_06_bin_mean_range'\n 'ps_ind_07_bin_median_range' 'ps_ind_07_bin_mean_range'\n 'ps_ind_08_bin_median_range' 'ps_ind_08_bin_mean_range'\n 'ps_ind_09_bin_median_range' 'ps_ind_09_bin_mean_range'\n 'ps_ind_10_bin_median_range' 'ps_ind_10_bin_mean_range'\n 'ps_ind_11_bin_median_range' 'ps_ind_11_bin_mean_range'\n 'ps_ind_12_bin_median_range' 'ps_ind_12_bin_mean_range'\n 'ps_ind_13_bin_median_range' 'ps_ind_13_bin_mean_range'\n 'ps_ind_16_bin_median_range' 'ps_ind_16_bin_mean_range'\n 'ps_ind_17_bin_median_range' 'ps_ind_17_bin_mean_range'\n 'ps_ind_18_bin_median_range' 'ps_ind_18_bin_mean_range'\n 'ps_calc_15_bin_median_range' 'ps_calc_15_bin_mean_range'\n 'ps_calc_16_bin_median_range' 'ps_calc_16_bin_mean_range'\n 'ps_calc_17_bin_median_range' 'ps_calc_17_bin_mean_range'\n 'ps_calc_18_bin_median_range' 'ps_calc_18_bin_mean_range'\n 'ps_calc_19_bin_median_range' 'ps_calc_19_bin_mean_range'\n 'ps_calc_20_bin_median_range' 'ps_calc_20_bin_mean_range'\n 'ps_ind_06_bin_median_range' 'ps_ind_06_bin_mean_range'\n 'ps_ind_07_bin_median_range' 'ps_ind_07_bin_mean_range'\n 'ps_ind_08_bin_median_range' 'ps_ind_08_bin_mean_range'\n 'ps_ind_09_bin_median_range' 'ps_ind_09_bin_mean_range'\n 'ps_ind_10_bin_median_range' 'ps_ind_10_bin_mean_range'\n 'ps_ind_11_bin_median_range' 'ps_ind_11_bin_mean_range'\n 'ps_ind_12_bin_median_range' 'ps_ind_12_bin_mean_range'\n 'ps_ind_13_bin_median_range' 'ps_ind_13_bin_mean_range'\n 'ps_ind_16_bin_median_range' 'ps_ind_16_bin_mean_range'\n 'ps_ind_17_bin_median_range' 'ps_ind_17_bin_mean_range'\n 'ps_ind_18_bin_median_range' 'ps_ind_18_bin_mean_range'\n 'ps_calc_15_bin_median_range' 'ps_calc_15_bin_mean_range'\n 'ps_calc_16_bin_median_range' 'ps_calc_16_bin_mean_range'\n 'ps_calc_17_bin_median_range' 'ps_calc_17_bin_mean_range'\n 'ps_calc_18_bin_median_range' 'ps_calc_18_bin_mean_range'\n 'ps_calc_19_bin_median_range' 'ps_calc_19_bin_mean_range'\n 'ps_calc_20_bin_median_range' 'ps_calc_20_bin_mean_range'] not in index"

In [None]:
trn_df.head()

In [None]:
trn_df1 = trn_df.drop_duplicates(trn_df.columns.difference(['id']))

In [None]:
trn_df1.shape

In [None]:
f_cats = [f for f in trn_df.columns if "_cat" in f]

In [None]:
for f in f_cats:
    trn_df[f + "_avg"], sub_df[f + "_avg"] = target_encode(trn_series=trn_df[f],
                                         tst_series=sub_df[f],
                                         target=target,
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)

In [None]:
# sub_df = sub_df.drop('target',axis=1)

In [None]:
n_splits = 5
n_estimators = 200
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
xgb_evals = np.zeros((n_estimators, n_splits))
oof = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
increase = True
# np.random.seed(0)


In [None]:
# for sp in np.linspace(1.0, 3.0, num=20):
#     sp = 2.0
#     Full OOF score : 0.284916
#     Best mean score : 0.284269 + 0.011259 @ 113
#     sp = 2.9
#     Full OOF score : 0.284905
#     Best mean score : 0.284473 + 0.010661 @ 123
# 1.55555555556
# Full OOF score : 0.285119
# Best mean score : 0.284699 + 0.010672 @ 171
# --------------------------------------
np.random.seed(0)
for fold_, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], target.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], target.iloc[val_idx]


    # Upsample during cross validation to avoid having the same samples
    # in both train and validation sets
    # Validation set is not up-sampled to monitor overfitting
    if increase:
        # Get positive examples
        pos = pd.Series(trn_tgt == 1)
        # Add positive examples
        trn_dat = pd.concat([trn_dat, trn_dat.loc[pos]], axis=0)
        trn_tgt = pd.concat([trn_tgt, trn_tgt.loc[pos]], axis=0)
        # Shuffle data
        idx = np.arange(len(trn_dat))
        np.random.shuffle(idx)
        trn_dat = trn_dat.iloc[idx]
        trn_tgt = trn_tgt.iloc[idx]

#     print(sum(trn_tgt==0)/sum(trn_tgt==1))

#     sp = sum(trn_tgt==0)/sum(trn_tgt==1)
#         sp = 1.55556
    sp = 1.52631578947
    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        scale_pos_weight=sp,
                        missing=-1,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        nthread=5)

    clf.fit(trn_dat, trn_tgt, 
            eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)

    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]
    print(best_round)

    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_dat, ntree_limit=int(best_round))[:, 1]
    # Update submission
    sub_preds += clf.predict_proba(sub_df, ntree_limit=int(best_round))[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tgt, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))

print(sp)
full_oof_score = eval_gini(target, oof)
print("Full OOF score : %.6f" % eval_gini(target, oof))
# Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))
print('--------------------------------------')
print('\n')

In [None]:
full_oof_score = eval_gini(target, oof)
print("Full OOF score : %.6f" % eval_gini(target, oof))
# org with clipping, Full OOF score : 0.284952, LB: 0.275
# org no clipping, Full OOF score : 0.284952, LB: 0.275
# org, removed ntree_limit, kaggle/python: Full OOF score : 0.283630, LB: 0.274
# org, with ntree_limit, kaggle/python: Full OOF score : 0.284745, LB: 0.282
# org, with missing=-1, Full OOF score, no sp : 0.285726, LB:0.281
# # org, with missing=-1, Full OOF score, sp : 0.283507, LB:
# org with top 5 values means, med, skew, Kurt of 3 vas: 0.282344
# org with top 2 values means, med, skew, Kurt of 3 vas: 0.283757, sub: 2017_11_27_20_27_58GMT, LB: 0.281
# org with 1 value mmsk: 0.282906, LB: won't do
# above with max_delta_step=1.1, removed sp: 0.0.285664, LB: won't do
# above with calc features: 0.282900
# org with 2 values mmsk: 0.286022, sub:2017_11_28_06_36_39, LB: 0.281
# org with 4 car values mmsk, delta_step: 0.284071, sub:sub.xgb.0.28407065884695226.2017_11_28_07_01_03GMT, LB: 
# above with sp, no max_delta, 4 mmsk, sub.xgb.0.28059886243614096.2017_11_28_07_55_18GMT, LB:
# about with sp=1.55556, 4 mmsk, ssub.xgb.0.2851186935146799.2017_11_28_19_38_05GMT, LB: 0.281
# about with sp=1.55556, 4 mmsk, sub.xgb.0.2839886673471289.2017_11_28_20_28_27GMT, LB: 0.280
# above + sp=1.52, sub.xgb.0.2845332509136781.2017_11_29_03_58_34GMT, LB: 0.281
# took imp from features, Full OOF score : 0.285477, sub.xgb.0.2854771395865894.2017_11_29_04_43_17GMT, LB:0.280
#0.284813
#0.285085

In [None]:
# Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))
    


In [None]:
importances = sorted([(trn_df.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for f, imp in importances[::-1]:
    print("%-34s : %10.4f" % (f, imp))

In [None]:
sub_df["target"] = sub_preds
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.xgb.{}.{}GMT'.format(full_oof_score, now)
sub_df[["target"]].to_csv(fn, index=True, float_format="%.9f")

In [None]:
print(now)

In [None]:
print(fn)

In [None]:
np.unique(trn_df['ps_reg_03'])

In [None]:
np.unique(trn_df['ps_car_13'])

In [None]:
print(now)

In [None]:
sub_df.tail()

In [None]:
np.min(sub_preds)

In [None]:
np.max(sub_preds)

In [None]:
sub_df.shape

In [None]:
trn_df.shape

In [None]:
y = [int(round(x)) for x in sub_preds]

In [None]:
y

In [None]:
y1 = [x for x in target.values]

In [None]:
Y = y1 + y

In [None]:
len(Y)

In [None]:
df = pd.DataFrame()

In [None]:
df['target'] = Y

In [None]:
sub_df = sub_df.drop('target', axis=1)

In [None]:
trn_df.shape

In [None]:
sub_df.shape

In [None]:
target = df['target']

In [None]:
X = pd.concat([trn_df, sub_df], axis=0)

In [None]:
X.shape

In [None]:
n_splits = 5
n_estimators = 200
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(X.columns), n_splits))
xgb_evals = np.zeros((n_estimators, n_splits))
oof = np.empty(len(X))
sub_preds = np.zeros(len(sub_df))
increase = True
np.random.seed(0)


In [None]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
    trn_dat, trn_tgt = X.iloc[trn_idx], target.iloc[trn_idx]
    val_dat, val_tgt = X.iloc[val_idx], target.iloc[val_idx]

    
    # Upsample during cross validation to avoid having the same samples
    # in both train and validation sets
    # Validation set is not up-sampled to monitor overfitting
    if False:
        # Get positive examples
        pos = pd.Series(trn_tgt == 1)
        # Add positive examples
        trn_dat = pd.concat([trn_dat, trn_dat.loc[pos]], axis=0)
        trn_tgt = pd.concat([trn_tgt, trn_tgt.loc[pos]], axis=0)
        # Shuffle data
        idx = np.arange(len(trn_dat))
        np.random.shuffle(idx)
        trn_dat = trn_dat.iloc[idx]
        trn_tgt = trn_tgt.iloc[idx]
    
    print(sum(trn_tgt==0)/sum(trn_tgt==1))
    
    sp = sum(trn_tgt==0)/sum(trn_tgt==1)
    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        scale_pos_weight =sp,
                        missing=-1,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        nthread=5)
    
    clf.fit(trn_dat, trn_tgt, 
            eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)
            
    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]
    print(best_round)
    
    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_dat, ntree_limit=int(best_round))[:, 1]
    # Update submission
    sub_preds += clf.predict_proba(sub_df, ntree_limit=int(best_round))[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tgt, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))
          


In [None]:
np.min(sub_preds)

In [None]:
np.max(sub_preds)

In [None]:
y

In [None]:
sub_preds

In [None]:
sub_df["target"] = sub_preds
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.xgb.{}GMT'.format(now)
sub_df[["target"]].to_csv(fn, index=True, float_format="%.9f")
# 0.274 on pblic LB