In [1]:
!which python

/opt/conda/bin/python


In [2]:
# !python

In [3]:
# from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

"""
This simple scripts demonstrates the use of xgboost eval results to get the best round
for the current fold and accross folds. 
It also shows an upsampling method that limits cross-validation overfitting.
"""

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import gc
from numba import jit
from sklearn.preprocessing import LabelEncoder
import time 
from datetime import datetime
import patsy

In [5]:
@jit
def eval_gini(y_true, y_prob):
    """
    Original author CPMP : https://www.kaggle.com/cpmpml
    In kernel : https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))



In [6]:
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [7]:
gc.enable()

trn_df = pd.read_csv("../data/train.csv", index_col=0)
sub_df = pd.read_csv("../data/test.csv", index_col=0)

target = trn_df["target"]


In [8]:
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
    "ps_car_11_cat" # Very nice spot from Tilii : https://www.kaggle.com/tilii7
]

l = ['ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_07_cat',
 'ps_car_03_cat',
 'ps_car_06_cat',
 'ps_car_04_cat',
 'ps_car_09_cat',
 'ps_car_02_cat',
 'ps_ind_02_cat',
 'ps_car_05_cat',
 'ps_car_08_cat',
 'ps_ind_04_cat'
]

l_bins = ['ps_ind_17_bin',
 'ps_ind_16_bin',
 'ps_ind_07_bin',
 'ps_ind_06_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_18_bin',
 'ps_ind_12_bin']

# add combinations
combos = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat')
]


In [9]:
# for n_c, (f1, f2) in enumerate(combos):
#     name1 = f1 + "_plus_" + f2
#     train_features.append(name1)

In [10]:
start = time.time()
for n_c, (f1, f2) in enumerate(combos):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
    sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
    trn_df[name1] = lbl.transform(list(trn_df[name1].values))
    sub_df[name1] = lbl.transform(list(sub_df[name1].values))

    train_features.append(name1)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [11]:
f = 'target ~ '

In [12]:
s = []
for i in range(len(l)):
    for j in range(i+1,len(l)):
        col1 = l[i]
        col2 = l[j]
        
        s.append('C({}):C({})'.format(col1, col2))
s = ' + '.join(s)
s += ' -1'

In [13]:
f = f + ' ' + s

In [14]:
f

'target ~  C(ps_ind_05_cat):C(ps_car_01_cat) + C(ps_ind_05_cat):C(ps_car_07_cat) + C(ps_ind_05_cat):C(ps_car_03_cat) + C(ps_ind_05_cat):C(ps_car_06_cat) + C(ps_ind_05_cat):C(ps_car_04_cat) + C(ps_ind_05_cat):C(ps_car_09_cat) + C(ps_ind_05_cat):C(ps_car_02_cat) + C(ps_ind_05_cat):C(ps_ind_02_cat) + C(ps_ind_05_cat):C(ps_car_05_cat) + C(ps_ind_05_cat):C(ps_car_08_cat) + C(ps_ind_05_cat):C(ps_ind_04_cat) + C(ps_car_01_cat):C(ps_car_07_cat) + C(ps_car_01_cat):C(ps_car_03_cat) + C(ps_car_01_cat):C(ps_car_06_cat) + C(ps_car_01_cat):C(ps_car_04_cat) + C(ps_car_01_cat):C(ps_car_09_cat) + C(ps_car_01_cat):C(ps_car_02_cat) + C(ps_car_01_cat):C(ps_ind_02_cat) + C(ps_car_01_cat):C(ps_car_05_cat) + C(ps_car_01_cat):C(ps_car_08_cat) + C(ps_car_01_cat):C(ps_ind_04_cat) + C(ps_car_07_cat):C(ps_car_03_cat) + C(ps_car_07_cat):C(ps_car_06_cat) + C(ps_car_07_cat):C(ps_car_04_cat) + C(ps_car_07_cat):C(ps_car_09_cat) + C(ps_car_07_cat):C(ps_car_02_cat) + C(ps_car_07_cat):C(ps_ind_02_cat) + C(ps_car_07_cat):

In [15]:
sub_df['target']=0
df_all = pd.concat([trn_df, sub_df], axis=0)

In [16]:
y,X = patsy.dmatrices(f, df_all, return_type='dataframe')

In [17]:
X

Unnamed: 0_level_0,C(ps_ind_05_cat)[-1]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[0]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[1]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[2]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[3]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[4]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[5]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[6]:C(ps_car_01_cat)[-1],C(ps_ind_05_cat)[-1]:C(ps_car_01_cat)[0],C(ps_ind_05_cat)[0]:C(ps_car_01_cat)[0],...,C(ps_ind_02_cat)[T.3]:C(ps_ind_04_cat)[T.1],C(ps_ind_02_cat)[T.4]:C(ps_ind_04_cat)[T.1],C(ps_car_05_cat)[T.0]:C(ps_car_08_cat)[T.1],C(ps_car_05_cat)[T.1]:C(ps_car_08_cat)[T.1],C(ps_car_05_cat)[T.0]:C(ps_ind_04_cat)[T.0],C(ps_car_05_cat)[T.1]:C(ps_ind_04_cat)[T.0],C(ps_car_05_cat)[T.0]:C(ps_ind_04_cat)[T.1],C(ps_car_05_cat)[T.1]:C(ps_ind_04_cat)[T.1],C(ps_car_08_cat)[T.1]:C(ps_ind_04_cat)[T.0],C(ps_car_08_cat)[T.1]:C(ps_ind_04_cat)[T.1]
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [18]:
del trn_df["target"]

In [19]:
# for c in X.columns:
#     X[c] = X[c].astype('int')

In [20]:
X.shape

(1488028, 1866)

In [21]:
# drop columns with constant value
X = X.loc[:, (X != X.iloc[0]).any()] 

In [22]:
X.shape

(1488028, 1795)

In [23]:
df_all.shape

(1488028, 60)

In [24]:
df_all = pd.concat([df_all, X], axis=1)

In [25]:
trn_df = df_all.iloc[:len(trn_df)]
sub_df = df_all.iloc[len(trn_df):]

In [26]:
trn_df.shape

(595212, 1855)

In [27]:
sub_df.shape

(892816, 1855)

In [28]:
sub_df = sub_df.drop('target', axis=1)

In [29]:
len(train_features)

37

In [30]:
train_features1 = train_features + list(X.columns)

In [31]:
len(train_features)

37

In [32]:
trn_df.isnull().sum()

ps_calc_01                                     0
ps_calc_02                                     0
ps_calc_03                                     0
ps_calc_04                                     0
ps_calc_05                                     0
ps_calc_06                                     0
ps_calc_07                                     0
ps_calc_08                                     0
ps_calc_09                                     0
ps_calc_10                                     0
ps_calc_11                                     0
ps_calc_12                                     0
ps_calc_13                                     0
ps_calc_14                                     0
ps_calc_15_bin                                 0
ps_calc_16_bin                                 0
ps_calc_17_bin                                 0
ps_calc_18_bin                                 0
ps_calc_19_bin                                 0
ps_calc_20_bin                                 0
ps_car_01_cat       

In [33]:
sub_df.isnull().sum()

ps_calc_01                                     0
ps_calc_02                                     0
ps_calc_03                                     0
ps_calc_04                                     0
ps_calc_05                                     0
ps_calc_06                                     0
ps_calc_07                                     0
ps_calc_08                                     0
ps_calc_09                                     0
ps_calc_10                                     0
ps_calc_11                                     0
ps_calc_12                                     0
ps_calc_13                                     0
ps_calc_14                                     0
ps_calc_15_bin                                 0
ps_calc_16_bin                                 0
ps_calc_17_bin                                 0
ps_calc_18_bin                                 0
ps_calc_19_bin                                 0
ps_calc_20_bin                                 0
ps_car_01_cat       

In [34]:

# for i in range(len(l)):
#     for j in range(i+1, len(l)):
#         f1 = l[i]
#         f2 = l[j]
#         name1 = f1 + "_plus_" + f2
#         print('current feature %60s %4d in %5.1f'
#               % (name1, n_c + 1, (time.time() - start) / 60), end='')
#         print('\r' * 75, end='')
#         trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
#         sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
#         print('\n')
#         lbl = LabelEncoder()
#         lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
#         train_features.append(name1)
#         combos.append(name1)
        
# for i in range(len(l_bins)):
#     for j in range(i+1, len(l_bins)):
#         f1 = l_bins[i]
#         f2 = l_bins[j]
#         name1 = f1 + "_" + f2 + '_cat'
#         print('current feature %60s %4d in %5.1f'
#               % (name1, n_c + 1, (time.time() - start) / 60), end='')
#         print('\r' * 75, end='')
#         trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
#         sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
#         print('\n')
#         lbl = LabelEncoder()
#         lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
#         train_features.append(name1)
#         combos.append(name1)
        
# for i in range(len(l_bins)):
#     for j in range(len(l)):
#         f1 = l_bins[i]
#         f2 = l[j]
#         name1 = f1 + "_" + f2
#         print('current feature %60s %4d in %5.1f'
#               % (name1, n_c + 1, (time.time() - start) / 60), end='')
#         print('\r' * 75, end='')
#         trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
#         sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
#         print('\n')
# #         lbl = LabelEncoder()
# #         lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
#         train_features.append(name1)
#         combos.append(name1)

In [35]:
trn_df = trn_df[train_features1]
sub_df = sub_df[train_features1]


In [36]:
trn_df.shape

(595212, 1832)

In [37]:
sub_df.shape

(892816, 1832)

In [38]:
f_cats = [f for f in train_features if "_cat" in f]

In [39]:
len(f_cats)

15

In [40]:
for f in f_cats:
    trn_df[f + "_avg"], sub_df[f + "_avg"] = target_encode(trn_series=trn_df[f],
                                         tst_series=sub_df[f],
                                         target=target,
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)

In [41]:
trn_df.columns[100]

'C(ps_ind_05_cat)[2]:C(ps_car_01_cat)[7]'

In [42]:
# for f in f_cats:
#     trn_df = trn_df.drop(f, axis=1)
#     sub_df = sub_df.drop(f, axis=1)

In [43]:
# for i in range(len(l)):
#     for j in range(i+1, len(l)):
#         f1 = l[i]
#         f2 = l[j]
#         f = f1 + "_plus_" + f2
#         trn_df = trn_df.drop(f, axis=1)
#         sub_df = sub_df.drop(f, axis=1)

In [44]:
del df_all
del X
gc.collect()

512

In [45]:
n_splits = 5
n_estimators = 200
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
xgb_evals = np.zeros((n_estimators, n_splits))
oof = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
increase = True
np.random.seed(0)


In [46]:
col_names = ['col_'+str(i) for i in range(len(list(trn_df.columns)))]

In [47]:
trn_df.columns = col_names
sub_df.columns = col_names

In [48]:
trn_df.shape

(595212, 1847)

In [None]:
sub_df.shape

(892816, 1847)

In [None]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], target.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], target.iloc[val_idx]

    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        missing=-1,
                        nthread=5)
    # Upsample during cross validation to avoid having the same samples
    # in both train and validation sets
    # Validation set is not up-sampled to monitor overfitting
    if increase:
        # Get positive examples
        pos = pd.Series(trn_tgt == 1)
        # Add positive examples
        trn_dat = pd.concat([trn_dat, trn_dat.loc[pos]], axis=0)
        trn_tgt = pd.concat([trn_tgt, trn_tgt.loc[pos]], axis=0)
        # Shuffle data
        idx = np.arange(len(trn_dat))
        np.random.shuffle(idx)
        trn_dat = trn_dat.iloc[idx]
        trn_tgt = trn_tgt.iloc[idx]
        
    clf.fit(trn_dat, trn_tgt, 
            eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)
            
    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]
    print(best_round)
    
    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_dat, ntree_limit=int(best_round))[:, 1]
    # Update submission
    sub_preds += clf.predict_proba(sub_df, ntree_limit=int(best_round))[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tgt, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))
          


196
Fold  1 : 0.275446 @ 200 / best score is 0.275458 @ 196
165146
Fold  3 : 0.305321 @ 200 / best score is 0.305368 @ 146
140
Fold  4 : 0.278624 @ 200 / best score is 0.278701 @ 140
123
Fold  5 : 0.275338 @ 200 / best score is 0.275941 @ 123


In [None]:
print("Full OOF score : %.6f" % eval_gini(target, oof))
# org with clipping, Full OOF score : 0.284952, LB: 0.275
# org no clipping, Full OOF score : 0.284952, LB: 0.275
# org, removed ntree_limit, kaggle/python: Full OOF score : 0.283630, LB: 0.274
# org, with ntree_limit, kaggle/python: Full OOF score : 0.284745, LB: 0.282
# org, with my changes (l,l_bin and combos), Full OOF score : 0.286360, LB: 
# org, with (l and combos): Full OOF score : 0.285772, LB:
# org, (lm l_bin): Full OOF score : 0.286804, sub: 2017_11_25_18_45_18 GMT, LB: 0.280
# above, with clipping: 0.273
# org, with l combo, no sp, Full OOF score : 0.286919, sub: 2017_11_26_08_19_53GMT, LB: 0.281
# org with patsy, all cols, Full OOF score : 0.282860, LB:0.279

Full OOF score : 0.282860


In [None]:
# Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))
    


Best mean score : 0.282652 + 0.011475 @ 140


In [None]:
importances = sorted([(trn_df.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for f, imp in importances[::-1]:
    print("%-34s : %10.4f" % (f, imp))

col_1                              :     0.0768
col_0                              :     0.0635
col_3                              :     0.0496
col_4                              :     0.0413
col_6                              :     0.0389
col_14                             :     0.0263
col_1846                           :     0.0260
col_5                              :     0.0245
col_1844                           :     0.0243
col_10                             :     0.0210
col_13                             :     0.0191
col_1845                           :     0.0161
col_1832                           :     0.0153
col_34                             :     0.0136
col_1833                           :     0.0124
col_26                             :     0.0102
col_7                              :     0.0097
col_16                             :     0.0096
col_36                             :     0.0093
col_25                             :     0.0089
col_15                             :    

In [None]:
len(importances)

1847

In [None]:
len(trn_df.columns)

1847

In [None]:
sub_preds

array([ 0.04947873,  0.04501828,  0.05357854, ...,  0.07932309,
        0.05074991,  0.06225527])

In [None]:
np.min(sub_preds)

0.014956292696297169

In [None]:
np.max(sub_preds)

0.71768708527088165

In [None]:
# sub_preds1 = np.clip(sub_preds, a_min=0.05, a_max=0.95) #!!! no clipping must be done here

In [None]:
sub_df["target"] = sub_preds
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.xgb.{}GMT'.format(now)
sub_df[["target"]].to_csv(fn, index=True, float_format="%.9f")

In [None]:
print(now)

2017_11_26_18_57_55


In [None]:
sub_df.tail()

Unnamed: 0_level_0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_1838,col_1839,col_1840,col_1841,col_1842,col_1843,col_1844,col_1845,col_1846,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1488022,1.528774,1.048809,0,6,2,0.3,0.565685,0.565685,0,0,...,0.033344,0.033772,0.035725,0.040111,0.034735,0.03479,0.066504,0.030048,0.045198,0.182764
1488023,1.040567,1.246495,0,5,11,1.0,0.418569,0.424264,8,1,...,0.033344,0.033772,0.037217,0.040111,0.044933,0.038644,0.039117,0.035967,0.035641,0.07981
1488024,0.968992,0.609303,0,5,5,0.0,0.390384,0.374166,11,1,...,0.056946,0.033772,0.035725,0.031709,0.044933,0.03479,0.045593,0.030893,0.030549,0.079323
1488025,0.725125,0.920937,0,5,13,0.6,0.378153,0.387298,10,1,...,0.033344,0.049507,0.035725,0.040454,0.034735,0.038644,0.042431,0.048512,0.033977,0.05075
1488026,0.637175,0.992157,0,4,12,0.8,0.296648,0.316228,9,1,...,0.036319,0.033772,0.035725,0.040111,0.034735,0.038644,0.029784,0.038214,0.038347,0.062255


In [None]:
#best
ps_car_13                          :     0.1226
ps_reg_03                          :     0.0891
ps_ind_03                          :     0.0610
ps_ind_15                          :     0.0485
ps_car_14                          :     0.0484
ps_ind_01                          :     0.0360
ps_reg_02                          :     0.0357
ps_car_11_cat_avg                  :     0.0340
ps_reg_01_plus_ps_car_04_cat_avg   :     0.0333
ps_ind_05_cat_avg                  :     0.0288
ps_car_11_cat                      :     0.0266
ps_car_01_cat_avg                  :     0.0257
ps_car_15                          :     0.0256
ps_reg_01_plus_ps_car_02_cat_avg   :     0.0251
ps_ind_17_bin                      :     0.0226
ps_car_12                          :     0.0206
ps_reg_01_plus_ps_car_04_cat       :     0.0193
ps_calc_05                         :     0.0187
ps_calc_09                         :     0.0185
ps_car_09_cat_avg                  :     0.0184
ps_car_06_cat                      :     0.0170
ps_car_01_cat                      :     0.0161
ps_car_06_cat_avg                  :     0.0152
ps_car_07_cat                      :     0.0148
ps_ind_02_cat_avg                  :     0.0135
ps_ind_05_cat                      :     0.0123
ps_car_03_cat                      :     0.0123
ps_car_11                          :     0.0108
ps_reg_01_plus_ps_car_02_cat       :     0.0108
ps_ind_02_cat                      :     0.0107

In [None]:
l1 = [
    1, 
    0,
    3,
    4 ,
    6,
    14,
    1846, 
    5,
    1844,
    10,
    13,
    1845,
    1832,
    34,
    1833,
    26 
]

In [None]:
for i in l1:
    print(train_features[i])

In [24]:
f_ind_reg = [f for f in trn_df.columns if "_cat" not in f and '_bin' not in f]

In [25]:
f_ind_reg

['ps_car_13',
 'ps_reg_03',
 'ps_ind_03',
 'ps_ind_15',
 'ps_reg_02',
 'ps_car_14',
 'ps_car_12',
 'ps_reg_01',
 'ps_car_15',
 'ps_ind_01',
 'ps_car_11',
 'ps_calc_09',
 'ps_calc_05',
 'ps_ind_14']

In [None]:
use_delta = []
use_delta = [1 if int(np.min(f) = 0]

In [None]:
ps_car_12 = cc,
ps_car_13 = vehicle value,
ps_car_14 = vehicle weight Kg
ps_car_15 = manufacture year

(4*ps_reg_03)^2 maps to 2dp resolution
(4*ps_reg_03)^2

Column "ps_car_15" contains below unique values, 
which are square roots of integer 0 ~ 14. Is it devised deliberately? It looks non-sense.


In [28]:
for f in f_ind_reg:
    print(f)
    print(f, np.min(trn_df[f].values), np.max(trn_df[f]))
    print(f, np.min(sub_df[f].values), np.max(sub_df[f]))
    print('\n')

ps_car_13
ps_car_13 0.2506190682 3.7206260026
ps_car_13 0.2757783875 4.0313005715


ps_reg_03
ps_reg_03 -1.0 4.0379450219
ps_reg_03 -1.0 4.4235167005


ps_ind_03
ps_ind_03 0 11
ps_ind_03 0 11


ps_ind_15
ps_ind_15 0 13
ps_ind_15 0 13


ps_reg_02
ps_reg_02 0.0 1.8
ps_reg_02 0.0 1.8


ps_car_14
ps_car_14 -1.0 0.6363961031
ps_car_14 -1.0 0.6363961031


ps_car_12
ps_car_12 -1.0 1.2649110641
ps_car_12 0.1414213562 1.2649110641


ps_reg_01
ps_reg_01 0.0 0.9
ps_reg_01 0.0 0.9


ps_car_15
ps_car_15 0.0 3.7416573868
ps_car_15 0.0 3.7416573868


ps_ind_01
ps_ind_01 0 7
ps_ind_01 0 7


ps_car_11
ps_car_11 -1 3
ps_car_11 -1 3


ps_calc_09
ps_calc_09 0 7
ps_calc_09 0 7


ps_calc_05
ps_calc_05 0 6
ps_calc_05 0 6


ps_ind_14
ps_ind_14 0 4
ps_ind_14 0 4




In [33]:
for f in f_cats:
    if f not in combos:
        print(f, np.unique(trn_df[f].values))

ps_ind_05_cat [-1  0  1  2  3  4  5  6]
ps_car_01_cat [-1  0  1  2  3  4  5  6  7  8  9 10 11]
ps_car_07_cat [-1  0  1]
ps_car_03_cat [-1  0  1]
ps_car_06_cat [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
ps_car_04_cat [0 1 2 3 4 5 6 7 8 9]
ps_car_09_cat [-1  0  1  2  3  4]
ps_car_02_cat [-1  0  1]
ps_ind_02_cat [-1  1  2  3  4]
ps_car_05_cat [-1  0  1]
ps_car_08_cat [0 1]
ps_ind_04_cat [-1  0  1]
ps_car_11_cat [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104]
ps_reg_01_plus_ps_car_02_cat [ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]
ps_reg_01_plus_ps_car_04_cat [ 0  1  2  3  4  5  

In [None]:
very serious - 7 points

serious - 5 points

average - 4 points

light - 3 points

i don't know if this is important or if this exists in this competition

another detail, there are 2 types of licenses,
provisional (usually the first license) and permanent. the time of these licenses is 1 year, 
and 10 years (if i'm not wrong)

In [None]:
dat["cont1"] = sqrt(dat["cont1"])
dat$cont2 = asin(sqrt(dat$cont2))
dat$cont3 = exp(dat$cont3)
dat["cont4"] = 1/sqrt(dat["cont4"])
dat["cont5"] = 1/sqrt(dat["cont5"])
#D$cont8 = log10(D$cont8)
dat["cont10"] = Math.cbrt(dat["cont10"])
dat["cont11"] = 1/sqrt(dat["cont11"])
dat["cont12"] = 1/exp(dat["cont12"])
dat["cont6"] = sqrt(dat["cont6"])
#D["cont7"] = exp(D["cont7"])
dat$cont7 = 3 ^ dat$cont7
dat["cont9"] = sin(dat["cont9"])
dat$cont13 = asin(sqrt(dat$cont13))
dat$cont14 = 5 ^ dat$cont14


In [None]:
transformations = ['sq', 'sqrt', 'exp', 'div_sqrt', 'cbrt', 'pow_3', 'pow_5', 'sin', 'log']

In [None]:
int(0.2)

In [None]:
def trans(t, x):
    if (x == -1) | (float(x) == -1.0):
        return -1
    if x == 0 | x == 0.001:
        x = float(x) + 0.001 # increment x by delta
    if t == 'sq':
        return x * x
    elif t == 'sqrt':
        return math.sqrt(x)
    elif t == 'exp':
        return math.exp(x)
    elif t == 'div_sqrt':
        return 1./math.sqrt(x)
    elif t == 'cbrt':
        return x ** (1./3)
    elif t == 'pow_3':
        return 3 ** x
    elif t == 'pow_5':
        return 5 ** x
    elif t == 'pow_5':
        return 5 ** x
    elif t == 'sin':
        return math.sin(x)
    elif t == 'log':
        return math.log(x)

In [None]:
for i in range(len(f_ind_reg)):
    f = f_ind_reg[i]
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
    sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
    print('\n')

    train_features.append(name1)
    combos.append(name1)

In [None]:
# transforms
for i in range(len(f_ind_reg)):
    col = f_in_reg[i]
    for t in transformations:
        new_col = col + '_t' + '_' + str(t)
        trn_df[new_col] = trn_df[col].map(lambda x: trans(t,x))
        sub_df[new_col] = sub_df[col].map(lambda x: trans(t,x))

In [None]:
for i in range(len(f_ind_reg)):
    for j in range(i+1, len(f_ind_reg)):
        col1 = f_in_reg[i]
        col2 = f_in_reg[j]
        new_col = col1 + '_m_' + col2
        trn_df[new_col] = trn_df[col1] * trn_df[col2]
        trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
        
        new_col = col1 + '_d_' + col2
        if np.min(trn_df[col2] < 0)
        trn_df[new_col] = trn_df[col1]/trn_df[col2]
        trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
        
        new_col = col1 + '_a_' + col2
        trn_df[new_col] = trn_df[col1] * trn_df[col2]
        trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
        
        new_col = col1 + '_s_' + col2
        trn_df[new_col] = trn_df[col1] * trn_df[col2]
        trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1