In [1]:
!which python

/opt/conda/bin/python


In [2]:
# !python

In [3]:
# from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

"""
This simple scripts demonstrates the use of xgboost eval results to get the best round
for the current fold and accross folds. 
It also shows an upsampling method that limits cross-validation overfitting.
"""

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import gc
from numba import jit
from sklearn.preprocessing import LabelEncoder
import time 
from datetime import datetime
import patsy
import math
from tqdm import tqdm

In [5]:
@jit
def eval_gini(y_true, y_prob):
    """
    Original author CPMP : https://www.kaggle.com/cpmpml
    In kernel : https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))



In [6]:
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [7]:
gc.enable()

trn_df = pd.read_csv("../data/train.csv", index_col=0)
sub_df = pd.read_csv("../data/test.csv", index_col=0)

target = trn_df["target"]


In [8]:
train_features1 = [
    'ps_car_13',
    'ps_reg_03',
    'ps_ind_03',
    'ps_ind_15',
    'ps_car_14',
    'ps_ind_01',
    'ps_reg_02',
    'ps_car_11_cat',
    #'ps_reg_01_plus_ps_car_04_cat_avg',
    'ps_ind_05_cat',
    'ps_car_01_cat',
    'ps_car_15',
    #'ps_reg_01_plus_ps_car_02_cat_avg',
    'ps_ind_17_bin',
    'ps_car_12',
    #'ps_reg_01_plus_ps_car_04_cat',
    'ps_calc_05',
    'ps_calc_09',
    'ps_car_09_cat',
    'ps_car_06_cat',
    'ps_car_07_cat',
    'ps_ind_02_cat',
    'ps_car_03_cat',
    'ps_car_11',
    #'ps_reg_01_plus_ps_car_02_cat',
]

In [9]:
len(train_features1)

21

In [10]:
len(np.unique(train_features1))

21

In [11]:
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
    "ps_car_11_cat" # Very nice spot from Tilii : https://www.kaggle.com/tilii7
]

l = [
    'ps_car_11_cat',
    'ps_ind_05_cat',
    'ps_car_01_cat',
    'ps_car_09_cat',
    'ps_car_06_cat'
#     'ps_car_01_cat',
#     'ps_car_07_cat',
#     'ps_ind_02_cat',
#     'ps_ind_05_cat',
#     'ps_car_03_cat',
]    
    
# l = ['ps_ind_05_cat',
#  'ps_car_01_cat',
#  'ps_car_07_cat',
#  'ps_car_03_cat',
#  'ps_car_06_cat',
#  'ps_car_04_cat',
#  'ps_car_09_cat',
#  'ps_car_02_cat',
#  'ps_ind_02_cat',
#  'ps_car_05_cat',
#  'ps_car_08_cat',
#  'ps_ind_04_cat'
# ]

l_bins = ['ps_ind_17_bin',
 'ps_ind_16_bin',
 'ps_ind_07_bin',
 'ps_ind_06_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_18_bin',
 'ps_ind_12_bin']

# add combinations
combos = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
    ('ps_reg_01', 'ps_car_11_cat'),
    ('ps_reg_02', 'ps_car_11_cat'),
    ('ps_reg_03', 'ps_car_11_cat'),
    ('ps_car_13', 'ps_ind_05_cat'),
    ('ps_car_13', 'ps_car_11_cat'),
    ('ps_car_13', 'ps_car_01_cat'),
    ('ps_car_13', 'ps_car_09_cat'),
    ('ps_ind_03', 'ps_car_11_cat'),
    ('ps_ind_15', 'ps_car_11_cat'),
    ('ps_car_14', 'ps_car_11_cat'),
    ('ps_ind_01', 'ps_car_11_cat')
]

# f_ind_reg =
# [
#     'ps_reg_03',
#     'ps_ind_03',
#     'ps_ind_15',
#     'ps_car_14',
#     'ps_ind_01',
# ]
f_ind_reg = [
    'ps_car_13',
    'ps_reg_03',
    'ps_ind_03',
    'ps_ind_15',
    'ps_reg_02',
    'ps_car_14',
    'ps_car_12',
    'ps_reg_01',
    'ps_car_15',
    'ps_ind_01',
    'ps_car_11',
    'ps_calc_09',
    'ps_calc_05',
    'ps_ind_14'
]

# transformations = ['sq', 'sqrt', 'exp', 'div_sqrt', 'cbrt', 'pow_3', 'pow_5', 'sin', 'log']

transformations = ['sin']

In [12]:
def trans(t, x):
    if (x == -1) | (float(x) == -1.0):
        return -1
    if float(x) == 0.0:
        x = float(x) + 0.001 # increment x by delta
    if t == 'sq':
        return x * x
    elif t == 'sqrt':
        return math.sqrt(x)
    elif t == 'exp':
        return math.exp(x)
    elif t == 'div_sqrt':
        return 1./math.sqrt(x)
    elif t == 'cbrt':
        return x ** (1./3)
    elif t == 'pow_3':
        return 3 ** x
    elif t == 'pow_5':
        return 5 ** x
    elif t == 'pow_5':
        return 5 ** x
    elif t == 'sin':
        return math.sin(x)
    elif t == 'log':
        return math.log(x)

In [13]:
# for n_c, (f1, f2) in enumerate(combos):
#     name1 = f1 + "_plus_" + f2
#     train_features.append(name1)

In [14]:
import math

In [15]:
# transforms
for i in tqdm(range(len(f_ind_reg))):
    col = f_ind_reg[i]
    for t in transformations:
        new_col = col + '_t_' + str(t)
#         print(new_col)
        trn_df[new_col] = trn_df[col].map(lambda x: trans(t,x))
        sub_df[new_col] = sub_df[col].map(lambda x: trans(t,x))
        
        train_features.append(new_col)
#         combos.append(new_col)

100%|██████████| 14/14 [00:20<00:00,  1.54s/it]


In [16]:
combos = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
#     ('ps_reg_02', 'ps_car_02_cat'),
#     ('ps_reg_02', 'ps_car_04_cat'),
#     ('ps_reg_03', 'ps_car_02_cat'),
#     ('ps_reg_03', 'ps_car_04_cat')
   
]

In [17]:
def special_trans_reg_03(x):
    return (4*x) ** 2

In [18]:
col = 'ps_reg_03'
new_col = 'ps_reg_03_special'
trn_df[new_col] = trn_df[col].map(lambda x: special_trans_reg_03(x))
sub_df[new_col] = sub_df[col].map(lambda x: special_trans_reg_03(x))

train_features.append(new_col)
# combos.append(new_col)


In [19]:
# for i in tqdm(range(len(f_ind_reg))):
#     for j in range(i+1, len(f_ind_reg)):
#         col1 = f_ind_reg[i]
#         col2 = f_ind_reg[j]
#         new_col = col1 + '_m_' + col2
#         trn_df[new_col] = trn_df[col1] * trn_df[col2]
#         trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
        
#         sub_df[new_col] = sub_df[col1] * sub_df[col2]
#         sub_df[new_col] = sub_df[new_col][sub_df[new_col] < 0] = -1
        
#         train_features.append(new_col)
# #         combos.append(new_col)
        
#         v1 = trn_df[col1].values
#         v2 = trn_df[col2].values
#         if (0 not in v2) & (0.0 not in v2):
#             new_col = col1 + '_d_' + col2
#             trn_df[new_col] = trn_df[col1]/trn_df[col2]
#             trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
            
#             sub_df[new_col] = sub_df[col1] / sub_df[col2]
#             sub_df[new_col] = sub_df[new_col][sub_df[new_col] < 0] = -1
            
#             train_features.append(new_col)
# #             combos.append(new_col)
        
#         if (0 not in v1) & (0.0 not in v1):
#             new_col = col2 + '_d_' + col1
#             trn_df[new_col] = trn_df[col2]/trn_df[col1]
#             trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
            
#             sub_df[new_col] = sub_df[col2] / sub_df[col1]
#             sub_df[new_col] = sub_df[new_col][sub_df[new_col] < 0] = -1
            
#             train_features.append(new_col)
# #             combos.append(new_col)
        
#         new_col = col1 + '_a_' + col2
#         trn_df[new_col] = trn_df[col1] + trn_df[col2]
#         trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
        
#         sub_df[new_col] = sub_df[col1] + sub_df[col2]
#         sub_df[new_col] = sub_df[new_col][sub_df[new_col] < 0] = -1
        
#         train_features.append(new_col)
#         combos.append(new_col)
        
#         new_col = col1 + '_s_' + col2
#         trn_df[new_col] = trn_df[col1] - trn_df[col2]
#         trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
        
#         sub_df[new_col] = sub_df[col1] - sub_df[col2]
#         sub_df[new_col] = sub_df[new_col][sub_df[new_col] < 0] = -1
        
#         train_features.append(new_col)
#         combos.append(new_col)
        
#         new_col = col2 + '_s_' + col1
#         trn_df[new_col] = trn_df[col2] - trn_df[col1]
#         trn_df[new_col] = trn_df[new_col][trn_df[new_col] < 0] = -1
        
#         sub_df[new_col] = sub_df[col2] - sub_df[col1]
#         sub_df[new_col] = sub_df[new_col][sub_df[new_col] < 0] = -1
        
#         train_features.append(new_col)
#         combos.append(new_col)

In [20]:
print(combos)

[('ps_reg_01', 'ps_car_02_cat'), ('ps_reg_01', 'ps_car_04_cat')]


In [21]:
start = time.time()
for n_c, (f1, f2) in enumerate(combos):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
    sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
    trn_df[name1] = lbl.transform(list(trn_df[name1].values))
    sub_df[name1] = lbl.transform(list(sub_df[name1].values))

    train_features.append(name1)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [22]:
f_cats = [f for f in train_features if "_cat" in f]

In [23]:
f_cats

['ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_07_cat',
 'ps_car_03_cat',
 'ps_car_06_cat',
 'ps_car_04_cat',
 'ps_car_09_cat',
 'ps_car_02_cat',
 'ps_ind_02_cat',
 'ps_car_05_cat',
 'ps_car_08_cat',
 'ps_ind_04_cat',
 'ps_car_11_cat',
 'ps_reg_01_plus_ps_car_02_cat',
 'ps_reg_01_plus_ps_car_04_cat']

In [24]:
# ps_car_01_cat [-1  0  1  2  3  4  5  6  7  8  9 10 11]
# ps_car_02_cat [-1  0  1]
# ps_car_03_cat [-1  0  1]
# ps_car_04_cat [0 1 2 3 4 5 6 7 8 9]
# ps_car_05_cat [-1  0  1]
# ps_car_06_cat [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
# ps_car_07_cat [-1  0  1]
# ps_car_08_cat [0 1]
# ps_car_09_cat [-1  0  1  2  3  4]
# ps_car_10_cat [0 1 2]
# ps_car_11_cat [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
#   19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
#   37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
#   55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
#   73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
#   91  92  93  94  95  96  97  98  99 100 101 102 103 104]

In [25]:
col_calc_mean = ['ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15']

In [26]:
for f in f_cats:
    for col in col_calc_mean:

        new_col1 = '{}_{}_mean'.format(col, f) 
        new_col2 = '{}_{}_median'.format(col, f) 
        trn_df[new_col1] = 0
        trn_df[new_col2] = 0
        
        sub_df[new_col1] = 0
        sub_df[new_col2] = 0

In [28]:
for col in tqdm(col_calc_mean):
    for f in f_cats:
        new_col1 = '{}_{}_mean'.format(col, f) 
        new_col2 = '{}_{}_median'.format(col, f) 
        unique_f = np.unique(trn_df[f].values)
        for val in unique_f:
            if val == -1:
                continue
            data1 = trn_df[col][trn_df[f] == val]
            mean1 = data1.mean()
            median1 = data1.median()
            
            trn_df[new_col1][trn_df[f] == val] = mean1
            trn_df[new_col2][trn_df[f] == val] = median1
            
            data2 = sub_df[col][sub_df[f] == val]
            mean2 = data2.mean()
            median2 = data2.median()
            
            sub_df[new_col1][sub_df[f] == val] = mean2
            sub_df[new_col2][sub_df[f] == val] = median2
        
        
        train_features.append(new_col1)
        train_features.append(new_col2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 4/4 [04:36<00:00, 69.05s/it]


In [29]:
sum(trn_df.isnull().any()==True)

0

In [30]:
sum(sub_df.isnull().any()==True)

0

In [31]:
pd.set_option('display.max_columns', None)

In [32]:
trn_df.head()

Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,ps_car_13_t_sin,ps_reg_03_t_sin,ps_ind_03_t_sin,ps_ind_15_t_sin,ps_reg_02_t_sin,ps_car_14_t_sin,ps_car_12_t_sin,ps_reg_01_t_sin,ps_car_15_t_sin,ps_ind_01_t_sin,ps_car_11_t_sin,ps_calc_09_t_sin,ps_calc_05_t_sin,ps_ind_14_t_sin,ps_reg_03_special,ps_reg_01_plus_ps_car_02_cat,ps_reg_01_plus_ps_car_04_cat,ps_car_12_ps_ind_05_cat_mean,ps_car_12_ps_ind_05_cat_median,ps_car_13_ps_ind_05_cat_mean,ps_car_13_ps_ind_05_cat_median,ps_car_14_ps_ind_05_cat_mean,ps_car_14_ps_ind_05_cat_median,ps_car_15_ps_ind_05_cat_mean,ps_car_15_ps_ind_05_cat_median,ps_car_12_ps_car_01_cat_mean,ps_car_12_ps_car_01_cat_median,ps_car_13_ps_car_01_cat_mean,ps_car_13_ps_car_01_cat_median,ps_car_14_ps_car_01_cat_mean,ps_car_14_ps_car_01_cat_median,ps_car_15_ps_car_01_cat_mean,ps_car_15_ps_car_01_cat_median,ps_car_12_ps_car_07_cat_mean,ps_car_12_ps_car_07_cat_median,ps_car_13_ps_car_07_cat_mean,ps_car_13_ps_car_07_cat_median,ps_car_14_ps_car_07_cat_mean,ps_car_14_ps_car_07_cat_median,ps_car_15_ps_car_07_cat_mean,ps_car_15_ps_car_07_cat_median,ps_car_12_ps_car_03_cat_mean,ps_car_12_ps_car_03_cat_median,ps_car_13_ps_car_03_cat_mean,ps_car_13_ps_car_03_cat_median,ps_car_14_ps_car_03_cat_mean,ps_car_14_ps_car_03_cat_median,ps_car_15_ps_car_03_cat_mean,ps_car_15_ps_car_03_cat_median,ps_car_12_ps_car_06_cat_mean,ps_car_12_ps_car_06_cat_median,ps_car_13_ps_car_06_cat_mean,ps_car_13_ps_car_06_cat_median,ps_car_14_ps_car_06_cat_mean,ps_car_14_ps_car_06_cat_median,ps_car_15_ps_car_06_cat_mean,ps_car_15_ps_car_06_cat_median,ps_car_12_ps_car_04_cat_mean,ps_car_12_ps_car_04_cat_median,ps_car_13_ps_car_04_cat_mean,ps_car_13_ps_car_04_cat_median,ps_car_14_ps_car_04_cat_mean,ps_car_14_ps_car_04_cat_median,ps_car_15_ps_car_04_cat_mean,ps_car_15_ps_car_04_cat_median,ps_car_12_ps_car_09_cat_mean,ps_car_12_ps_car_09_cat_median,ps_car_13_ps_car_09_cat_mean,ps_car_13_ps_car_09_cat_median,ps_car_14_ps_car_09_cat_mean,ps_car_14_ps_car_09_cat_median,ps_car_15_ps_car_09_cat_mean,ps_car_15_ps_car_09_cat_median,ps_car_12_ps_car_02_cat_mean,ps_car_12_ps_car_02_cat_median,ps_car_13_ps_car_02_cat_mean,ps_car_13_ps_car_02_cat_median,ps_car_14_ps_car_02_cat_mean,ps_car_14_ps_car_02_cat_median,ps_car_15_ps_car_02_cat_mean,ps_car_15_ps_car_02_cat_median,ps_car_12_ps_ind_02_cat_mean,ps_car_12_ps_ind_02_cat_median,ps_car_13_ps_ind_02_cat_mean,ps_car_13_ps_ind_02_cat_median,ps_car_14_ps_ind_02_cat_mean,ps_car_14_ps_ind_02_cat_median,ps_car_15_ps_ind_02_cat_mean,ps_car_15_ps_ind_02_cat_median,ps_car_12_ps_car_05_cat_mean,ps_car_12_ps_car_05_cat_median,ps_car_13_ps_car_05_cat_mean,ps_car_13_ps_car_05_cat_median,ps_car_14_ps_car_05_cat_mean,ps_car_14_ps_car_05_cat_median,ps_car_15_ps_car_05_cat_mean,ps_car_15_ps_car_05_cat_median,ps_car_12_ps_car_08_cat_mean,ps_car_12_ps_car_08_cat_median,ps_car_13_ps_car_08_cat_mean,ps_car_13_ps_car_08_cat_median,ps_car_14_ps_car_08_cat_mean,ps_car_14_ps_car_08_cat_median,ps_car_15_ps_car_08_cat_mean,ps_car_15_ps_car_08_cat_median,ps_car_12_ps_ind_04_cat_mean,ps_car_12_ps_ind_04_cat_median,ps_car_13_ps_ind_04_cat_mean,ps_car_13_ps_ind_04_cat_median,ps_car_14_ps_ind_04_cat_mean,ps_car_14_ps_ind_04_cat_median,ps_car_15_ps_ind_04_cat_mean,ps_car_15_ps_ind_04_cat_median,ps_car_12_ps_car_11_cat_mean,ps_car_12_ps_car_11_cat_median,ps_car_13_ps_car_11_cat_mean,ps_car_13_ps_car_11_cat_median,ps_car_14_ps_car_11_cat_mean,ps_car_14_ps_car_11_cat_median,ps_car_15_ps_car_11_cat_mean,ps_car_15_ps_car_11_cat_median,ps_car_12_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_12_ps_reg_01_plus_ps_car_02_cat_median,ps_car_13_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_13_ps_reg_01_plus_ps_car_02_cat_median,ps_car_14_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_14_ps_reg_01_plus_ps_car_02_cat_median,ps_car_15_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_15_ps_reg_01_plus_ps_car_02_cat_median,ps_car_12_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_12_ps_reg_01_plus_ps_car_04_cat_median,ps_car_13_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_13_ps_reg_01_plus_ps_car_04_cat_median,ps_car_14_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_14_ps_reg_01_plus_ps_car_04_cat_median,ps_car_15_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_15_ps_reg_01_plus_ps_car_04_cat_median
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1
7,0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10,1,-1,0,1,4,1,0,0,1,12,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1,0.773078,0.657933,-0.958924,-0.99999,0.198669,0.36237,0.389418,0.644218,-0.447492,0.909297,0.909297,0.841471,0.841471,0.001,8.25,19,70,0.380469,0.374166,0.814889,0.766553,0.27753,0.368782,3.067873,3.316625,0.391929,0.4,0.865479,0.802128,0.310299,0.385487,3.135044,3.316625,0.378428,0.374166,0.803957,0.763947,0.277064,0.368782,3.060768,3.316625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367296,0.4,0.773211,0.745427,0.343222,0.37081,3.182322,3.316625,0.367082,0.374166,0.761576,0.740616,0.270192,0.366606,3.024636,3.162278,0.380256,0.374166,0.814952,0.762641,0.30914,0.379473,3.064638,3.316625,0.367553,0.374166,0.764163,0.738087,0.269678,0.366606,3.040633,3.316625,0.365796,0.374166,0.773175,0.742107,0.266021,0.365103,3.06971,3.316625,0.390513,0.39975,0.832608,0.769337,0.253556,0.359722,2.930494,3.162278,0.384054,0.387298,0.998247,0.927852,0.149042,0.353553,3.67522,3.741657,0.373392,0.374166,0.804762,0.763122,0.273869,0.366742,3.101007,3.316625,0.4,0.4,0.825397,0.821798,0.359531,0.37081,3.414482,3.464102,0.367026,0.374166,0.764963,0.737485,0.272548,0.366742,3.040675,3.316625,0.366447,0.374166,0.759965,0.740114,0.273323,0.366742,3.022089,3.162278
9,0,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0,0.580072,0.693314,0.656987,0.14112,0.389418,0.379,0.310984,0.717356,0.638158,0.841471,0.14112,0.841471,0.841471,0.001,9.39,21,80,0.380469,0.374166,0.814889,0.766553,0.27753,0.368782,3.067873,3.316625,0.388844,0.4,0.858722,0.807173,0.263036,0.368511,3.155021,3.316625,0.378428,0.374166,0.803957,0.763947,0.277064,0.368782,3.060768,3.316625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.374233,0.374166,0.756987,0.715169,0.308645,0.368782,2.962629,3.162278,0.367082,0.374166,0.761576,0.740616,0.270192,0.366606,3.024636,3.162278,0.377333,0.374166,0.807165,0.765069,0.263259,0.362629,3.090635,3.316625,0.367553,0.374166,0.764163,0.738087,0.269678,0.366606,3.040633,3.316625,0.384766,0.387298,0.826747,0.774463,0.279591,0.37081,3.067131,3.316625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379116,0.374166,0.775934,0.73524,0.301929,0.37081,2.942934,3.162278,0.384622,0.387298,0.819302,0.767805,0.27798,0.37081,3.040733,3.316625,0.31595,0.31607,0.55472,0.579291,0.212453,0.36606,1.618905,2.0,0.368355,0.374166,0.776586,0.745538,0.267163,0.366742,3.095946,3.316625,0.367405,0.374166,0.771957,0.747765,0.269615,0.367423,3.077871,3.316625
13,0,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0,0.598467,-1.0,0.412118,-0.536573,0.001,0.340337,0.310984,0.001,-0.17414,-0.958924,0.841471,0.909297,0.909297,0.001,16.0,1,0,0.380469,0.374166,0.814889,0.766553,0.27753,0.368782,3.067873,3.316625,0.36729,0.374166,0.756952,0.732552,0.26568,0.364005,2.994601,3.162278,0.378428,0.374166,0.803957,0.763947,0.277064,0.368782,3.060768,3.316625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382101,0.4,0.785353,0.729485,0.250851,0.368511,3.001509,3.162278,0.367082,0.374166,0.761576,0.740616,0.270192,0.366606,3.024636,3.162278,0.377333,0.374166,0.807165,0.765069,0.263259,0.362629,3.090635,3.316625,0.367553,0.374166,0.764163,0.738087,0.269678,0.366606,3.040633,3.316625,0.369465,0.374166,0.777829,0.744747,0.265928,0.361939,2.994064,3.316625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379116,0.374166,0.775934,0.73524,0.301929,0.37081,2.942934,3.162278,0.373392,0.374166,0.804762,0.763122,0.273869,0.366742,3.101007,3.316625,0.316228,0.316228,0.645717,0.63839,0.336433,0.347275,3.277474,3.316625,0.370035,0.374166,0.778173,0.747447,0.250053,0.364005,3.082192,3.316625,0.366667,0.374166,0.768562,0.747171,0.246199,0.361939,3.069502,3.316625
16,0,0,1,2,0,0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7,1,0,0,1,11,1,1,3,1,104,1,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0,0.516663,0.548816,0.909297,0.989358,0.198669,0.290699,0.365496,0.783327,0.909297,0.001,0.841471,-0.756802,-0.756802,0.001,5.4,24,90,0.380469,0.374166,0.814889,0.766553,0.27753,0.368782,3.067873,3.316625,0.36729,0.374166,0.756952,0.732552,0.26568,0.364005,2.994601,3.162278,0.378428,0.374166,0.803957,0.763947,0.277064,0.368782,3.060768,3.316625,0.388782,0.387298,0.792103,0.72983,0.173881,0.321403,2.78537,3.0,0.374233,0.374166,0.756987,0.715169,0.308645,0.368782,2.962629,3.162278,0.367082,0.374166,0.761576,0.740616,0.270192,0.366606,3.024636,3.162278,0.367884,0.374166,0.632474,0.619909,0.197483,0.342856,2.09738,2.44949,0.367553,0.374166,0.764163,0.738087,0.269678,0.366606,3.040633,3.316625,0.384766,0.387298,0.826747,0.774463,0.279591,0.37081,3.067131,3.316625,0.390513,0.39975,0.832608,0.769337,0.253556,0.359722,2.930494,3.162278,0.379116,0.374166,0.775934,0.73524,0.301929,0.37081,2.942934,3.162278,0.384622,0.387298,0.819302,0.767805,0.27798,0.37081,3.040733,3.316625,0.429322,0.424264,0.977018,0.925745,0.249136,0.391791,2.958673,3.316625,0.366922,0.374166,0.758668,0.736336,0.27454,0.366742,3.027009,3.316625,0.369631,0.374166,0.767393,0.744761,0.278789,0.368511,3.014207,3.162278
17,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11,1,-1,0,-1,14,1,1,2,1,82,3,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0,0.536118,0.745149,0.001,0.412118,0.564642,0.357045,0.310833,0.644218,0.909297,0.001,0.14112,0.909297,0.909297,0.001,11.31,19,70,0.380469,0.374166,0.814889,0.766553,0.27753,0.368782,3.067873,3.316625,0.388844,0.4,0.858722,0.807173,0.263036,0.368511,3.155021,3.316625,0.378428,0.374166,0.803957,0.763947,0.277064,0.368782,3.060768,3.316625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382101,0.4,0.785353,0.729485,0.250851,0.368511,3.001509,3.162278,0.367082,0.374166,0.761576,0.740616,0.270192,0.366606,3.024636,3.162278,0.377333,0.374166,0.807165,0.765069,0.263259,0.362629,3.090635,3.316625,0.367553,0.374166,0.764163,0.738087,0.269678,0.366606,3.040633,3.316625,0.365796,0.374166,0.773175,0.742107,0.266021,0.365103,3.06971,3.316625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379116,0.374166,0.775934,0.73524,0.301929,0.37081,2.942934,3.162278,0.373392,0.374166,0.804762,0.763122,0.273869,0.366742,3.101007,3.316625,0.316183,0.316228,0.650229,0.649125,0.229346,0.368511,2.834002,3.0,0.367026,0.374166,0.764963,0.737485,0.272548,0.366742,3.040675,3.316625,0.366447,0.374166,0.759965,0.740114,0.273323,0.366742,3.022089,3.162278


In [33]:
for f in trn_df.columns:
    print(f, np.unique(trn_df[f].values))

target [0 1]
ps_ind_01 [0 1 2 3 4 5 6 7]
ps_ind_02_cat [-1  1  2  3  4]
ps_ind_03 [ 0  1  2  3  4  5  6  7  8  9 10 11]
ps_ind_04_cat [-1  0  1]
ps_ind_05_cat [-1  0  1  2  3  4  5  6]
ps_ind_06_bin [0 1]
ps_ind_07_bin [0 1]
ps_ind_08_bin [0 1]
ps_ind_09_bin [0 1]
ps_ind_10_bin [0 1]
ps_ind_11_bin [0 1]
ps_ind_12_bin [0 1]
ps_ind_13_bin [0 1]
ps_ind_14 [0 1 2 3 4]
ps_ind_15 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
ps_ind_16_bin [0 1]
ps_ind_17_bin [0 1]
ps_ind_18_bin [0 1]
ps_reg_01 [ 0.   0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9]
ps_reg_02 [ 0.   0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9  1.   1.1  1.2  1.3  1.4
  1.5  1.6  1.7  1.8]
ps_reg_03 [-1.          0.06123724  0.075      ...,  3.49079146  3.78772689
  4.03794502]
ps_car_01_cat [-1  0  1  2  3  4  5  6  7  8  9 10 11]
ps_car_02_cat [-1  0  1]
ps_car_03_cat [-1  0  1]
ps_car_04_cat [0 1 2 3 4 5 6 7 8 9]
ps_car_05_cat [-1  0  1]
ps_car_06_cat [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
ps_car_07_cat [-1  0  1

In [None]:
# ps_car_12 = cc,
# ps_car_13 = vehicle value,
# ps_car_14 = vehicle weight Kg
# ps_car_15 = manufacture year


In [34]:
len(f_cats)

15

In [35]:
for f in f_cats:
    trn_df[f + "_avg"], sub_df[f + "_avg"] = target_encode(trn_series=trn_df[f],
                                         tst_series=sub_df[f],
                                         target=target,
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)

In [20]:
f = 'target ~ '

In [21]:
s = []
for i in range(len(l)):
    for j in range(i+1,len(l)):
        col1 = l[i]
        col2 = l[j]
        
        s.append('C({}):C({})'.format(col1, col2))
s = ' + '.join(s)
s += ' -1'

In [22]:
f = f + ' ' + s

In [23]:
f

'target ~  C(ps_car_11_cat):C(ps_ind_05_cat) + C(ps_car_11_cat):C(ps_car_01_cat) + C(ps_car_11_cat):C(ps_car_09_cat) + C(ps_car_11_cat):C(ps_car_06_cat) + C(ps_ind_05_cat):C(ps_car_01_cat) + C(ps_ind_05_cat):C(ps_car_09_cat) + C(ps_ind_05_cat):C(ps_car_06_cat) + C(ps_car_01_cat):C(ps_car_09_cat) + C(ps_car_01_cat):C(ps_car_06_cat) + C(ps_car_09_cat):C(ps_car_06_cat) -1'

In [24]:
sub_df['target']=0
df_all = pd.concat([trn_df, sub_df], axis=0)

In [25]:
y,X = patsy.dmatrices(f, df_all, return_type='dataframe')

In [26]:
X.shape

(1488028, 4955)

In [27]:
x_cols = list(X.columns)

In [28]:
new_x_cols = ['col_'+str(i) for i in range(len(x_cols))]

In [29]:
X.columns = new_x_cols

In [30]:
del trn_df["target"]

In [31]:
# for c in X.columns:
#     X[c] = X[c].astype('int')

In [32]:
X.shape

(1488028, 4955)

In [33]:
# drop columns with constant value
X = X.loc[:, (X != X.iloc[0]).any()] 

In [34]:
X.shape

(1488028, 3284)

In [None]:
df_all.shape

(1488028, 432)

In [None]:
df_all = pd.concat([df_all, X], axis=1)

In [None]:
trn_df = df_all.iloc[:len(trn_df)]
sub_df = df_all.iloc[len(trn_df):]

In [None]:
trn_df.shape

(595212, 3716)

In [None]:
sub_df.shape

(892816, 3716)

In [None]:
sub_df = sub_df.drop('target', axis=1)

In [None]:
len(train_features)

397

In [None]:
train_features1 = train_features + list(X.columns)

In [None]:
len(train_features)

397

In [None]:

# for i in range(len(l)):
#     for j in range(i+1, len(l)):
#         f1 = l[i]
#         f2 = l[j]
#         name1 = f1 + "_plus_" + f2
#         print('current feature %60s %4d in %5.1f'
#               % (name1, n_c + 1, (time.time() - start) / 60), end='')
#         print('\r' * 75, end='')
#         trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
#         sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
#         print('\n')
#         lbl = LabelEncoder()
#         lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
#         train_features.append(name1)
#         combos.append(name1)
        
# for i in range(len(l_bins)):
#     for j in range(i+1, len(l_bins)):
#         f1 = l_bins[i]
#         f2 = l_bins[j]
#         name1 = f1 + "_" + f2 + '_cat'
#         print('current feature %60s %4d in %5.1f'
#               % (name1, n_c + 1, (time.time() - start) / 60), end='')
#         print('\r' * 75, end='')
#         trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
#         sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
#         print('\n')
#         lbl = LabelEncoder()
#         lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
#         train_features.append(name1)
#         combos.append(name1)
        
# for i in range(len(l_bins)):
#     for j in range(len(l)):
#         f1 = l_bins[i]
#         f2 = l[j]
#         name1 = f1 + "_" + f2
#         print('current feature %60s %4d in %5.1f'
#               % (name1, n_c + 1, (time.time() - start) / 60), end='')
#         print('\r' * 75, end='')
#         trn_df[name1] = trn_df[f1].apply(lambda x: str(x)) + "_" + trn_df[f2].apply(lambda x: str(x))
#         sub_df[name1] = sub_df[f1].apply(lambda x: str(x)) + "_" + sub_df[f2].apply(lambda x: str(x))
#         print('\n')
# #         lbl = LabelEncoder()
# #         lbl.fit(list(trn_df[name1].values) + list(sub_df[name1].values))
#         train_features.append(name1)
#         combos.append(name1)

In [None]:
trn_df = trn_df[train_features1]
sub_df = sub_df[train_features1]


In [25]:
trn_df.shape

(595212, 397)

In [26]:
sub_df.shape

(892816, 397)

In [27]:
trn_df.columns[100]

'ps_car_15_t_pow_3'

In [None]:
# for f in f_cats:
#     trn_df = trn_df.drop(f, axis=1)
#     sub_df = sub_df.drop(f, axis=1)

In [None]:
# for i in range(len(l)):
#     for j in range(i+1, len(l)):
#         f1 = l[i]
#         f2 = l[j]
#         f = f1 + "_plus_" + f2
#         trn_df = trn_df.drop(f, axis=1)
#         sub_df = sub_df.drop(f, axis=1)

In [36]:
len(train_features)

172

In [37]:
len(np.unique(train_features))

172

In [38]:
trn_df = trn_df[np.unique(train_features)]
sub_df = sub_df[np.unique(train_features)]

In [None]:
del df_all
del X
gc.collect()

In [39]:
n_splits = 5
n_estimators = 200
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
xgb_evals = np.zeros((n_estimators, n_splits))
oof = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
increase = True
np.random.seed(0)


In [40]:
trn_df.columns

Index(['ps_calc_05', 'ps_calc_05_t_sin', 'ps_calc_09', 'ps_calc_09_t_sin',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat',
       ...
       'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_01_plus_ps_car_02_cat',
       'ps_reg_01_plus_ps_car_04_cat', 'ps_reg_01_t_sin', 'ps_reg_02',
       'ps_reg_02_t_sin', 'ps_reg_03', 'ps_reg_03_special', 'ps_reg_03_t_sin'],
      dtype='object', length=172)

In [41]:
# col_names = ['col_'+str(i) for i in range(len(list(trn_df.columns)))]

In [42]:
# trn_df.columns = col_names
# sub_df.columns = col_names

In [43]:
trn_df.shape

(595212, 172)

In [44]:
sub_df.shape

(892816, 172)

In [45]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(target, target)):
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], target.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], target.iloc[val_idx]

    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        missing=-1,
                        nthread=5)
    # Upsample during cross validation to avoid having the same samples
    # in both train and validation sets
    # Validation set is not up-sampled to monitor overfitting
    if increase:
        # Get positive examples
        pos = pd.Series(trn_tgt == 1)
        # Add positive examples
        trn_dat = pd.concat([trn_dat, trn_dat.loc[pos]], axis=0)
        trn_tgt = pd.concat([trn_tgt, trn_tgt.loc[pos]], axis=0)
        # Shuffle data
        idx = np.arange(len(trn_dat))
        np.random.shuffle(idx)
        trn_dat = trn_dat.iloc[idx]
        trn_tgt = trn_tgt.iloc[idx]
        
    clf.fit(trn_dat, trn_tgt, 
            eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)
            
    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]
    print(best_round)
    
    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_dat, ntree_limit=int(best_round))[:, 1]
    # Update submission
    sub_preds += clf.predict_proba(sub_df, ntree_limit=int(best_round))[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tgt, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))
          


153
Fold  1 : 0.277302 @ 200 / best score is 0.277593 @ 153
198
Fold  2 : 0.278973 @ 200 / best score is 0.279028 @ 198
181
Fold  3 : 0.302631 @ 200 / best score is 0.302913 @ 181
179
Fold  4 : 0.279068 @ 200 / best score is 0.279612 @ 179
109
Fold  5 : 0.274237 @ 200 / best score is 0.274445 @ 109


In [46]:
print("Full OOF score : %.6f" % eval_gini(target, oof))
# org with clipping, Full OOF score : 0.284952, LB: 0.275
# org no clipping, Full OOF score : 0.284952, LB: 0.275
# org, removed ntree_limit, kaggle/python: Full OOF score : 0.283630, LB: 0.274
# org, with ntree_limit, kaggle/python: Full OOF score : 0.284745, LB: 0.282
# org, with my changes (l,l_bin and combos), Full OOF score : 0.286360, LB: 
# org, with (l and combos): Full OOF score : 0.285772, LB:
# org, (lm l_bin): Full OOF score : 0.286804, sub: 2017_11_25_18_45_18 GMT, LB: 0.280
# above, with clipping: 0.273
# org, with l combo, no sp, Full OOF score : 0.286919, sub: 2017_11_26_08_19_53GMT, LB: 0.281
# org with patsy, all cols, Full OOF score : 0.282860, LB:0.279
# org with trans and new l, Full OOF score : 0.271490, LB:0.271
# removed feature add/s: Full OOF score : 0.272667
# removed a trans except sin: Full OOF score : 0.272805
# back to old train_features: Full OOF score : 0.282251
# above + 4 counts: Full OOF score : 0.282421

Full OOF score : 0.282421


In [47]:
# Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))
    


Best mean score : 0.282293 + 0.010322 @ 179


In [48]:
np.min(trn_df['ps_car_13'])

0.25061906820000002

In [49]:
np.max(trn_df['ps_car_13'])

3.7206260026

In [50]:
importances = sorted([(trn_df.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for f, imp in importances[::-1]:
    print("%-34s : %10.4f" % (f, imp))

ps_car_13                          :     0.0682
ps_reg_03                          :     0.0574
ps_ind_03                          :     0.0467
ps_reg_03_t_sin                    :     0.0463
ps_car_14                          :     0.0396
ps_ind_15                          :     0.0332
ps_car_13_t_sin                    :     0.0319
ps_ind_01                          :     0.0251
ps_reg_02                          :     0.0218
ps_ind_17_bin                      :     0.0193
ps_ind_05_cat                      :     0.0188
ps_ind_15_t_sin                    :     0.0161
ps_car_14_ps_car_11_cat_mean       :     0.0153
ps_car_11_cat                      :     0.0151
ps_car_15_ps_car_11_cat_mean       :     0.0147
ps_car_14_ps_car_11_cat_median     :     0.0146
ps_car_15                          :     0.0145
ps_car_14_ps_reg_01_plus_ps_car_04_cat_mean :     0.0137
ps_car_07_cat                      :     0.0134
ps_ind_01_t_sin                    :     0.0133
ps_car_12_ps_ind_02_cat_mean   

In [51]:
importances = sorted([(trn_df.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for f, imp in importances[::-1]:
    print("%-34s : %10.4f" % (f, imp))

ps_car_13                          :     0.0682
ps_reg_03                          :     0.0574
ps_ind_03                          :     0.0467
ps_reg_03_t_sin                    :     0.0463
ps_car_14                          :     0.0396
ps_ind_15                          :     0.0332
ps_car_13_t_sin                    :     0.0319
ps_ind_01                          :     0.0251
ps_reg_02                          :     0.0218
ps_ind_17_bin                      :     0.0193
ps_ind_05_cat                      :     0.0188
ps_ind_15_t_sin                    :     0.0161
ps_car_14_ps_car_11_cat_mean       :     0.0153
ps_car_11_cat                      :     0.0151
ps_car_15_ps_car_11_cat_mean       :     0.0147
ps_car_14_ps_car_11_cat_median     :     0.0146
ps_car_15                          :     0.0145
ps_car_14_ps_reg_01_plus_ps_car_04_cat_mean :     0.0137
ps_car_07_cat                      :     0.0134
ps_ind_01_t_sin                    :     0.0133
ps_car_12_ps_ind_02_cat_mean   

In [52]:
len(importances)

172

In [53]:
len(trn_df.columns)

172

In [54]:
sub_preds

array([ 0.05181502,  0.05162514,  0.05030902, ...,  0.07674067,
        0.05242858,  0.06076244])

In [55]:
np.min(sub_preds)

0.013631624635308981

In [56]:
np.max(sub_preds)

0.80232667922973633

In [39]:
# sub_preds1 = np.clip(sub_preds, a_min=0.05, a_max=0.95) #!!! no clipping must be done here

In [57]:
sub_df["target"] = sub_preds
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.xgb.{}GMT'.format(now)
sub_df[["target"]].to_csv(fn, index=True, float_format="%.9f")

In [58]:
print(now)

2017_11_27_10_04_06


In [59]:
sub_df.tail()

Unnamed: 0_level_0,ps_calc_05,ps_calc_05_t_sin,ps_calc_09,ps_calc_09_t_sin,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_11,ps_car_11_cat,ps_car_11_t_sin,ps_car_12,ps_car_12_ps_car_01_cat_mean,ps_car_12_ps_car_01_cat_median,ps_car_12_ps_car_02_cat_mean,ps_car_12_ps_car_02_cat_median,ps_car_12_ps_car_03_cat_mean,ps_car_12_ps_car_03_cat_median,ps_car_12_ps_car_04_cat_mean,ps_car_12_ps_car_04_cat_median,ps_car_12_ps_car_05_cat_mean,ps_car_12_ps_car_05_cat_median,ps_car_12_ps_car_06_cat_mean,ps_car_12_ps_car_06_cat_median,ps_car_12_ps_car_07_cat_mean,ps_car_12_ps_car_07_cat_median,ps_car_12_ps_car_08_cat_mean,ps_car_12_ps_car_08_cat_median,ps_car_12_ps_car_09_cat_mean,ps_car_12_ps_car_09_cat_median,ps_car_12_ps_car_11_cat_mean,ps_car_12_ps_car_11_cat_median,ps_car_12_ps_ind_02_cat_mean,ps_car_12_ps_ind_02_cat_median,ps_car_12_ps_ind_04_cat_mean,ps_car_12_ps_ind_04_cat_median,ps_car_12_ps_ind_05_cat_mean,ps_car_12_ps_ind_05_cat_median,ps_car_12_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_12_ps_reg_01_plus_ps_car_02_cat_median,ps_car_12_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_12_ps_reg_01_plus_ps_car_04_cat_median,ps_car_12_t_sin,ps_car_13,ps_car_13_ps_car_01_cat_mean,ps_car_13_ps_car_01_cat_median,ps_car_13_ps_car_02_cat_mean,ps_car_13_ps_car_02_cat_median,ps_car_13_ps_car_03_cat_mean,ps_car_13_ps_car_03_cat_median,ps_car_13_ps_car_04_cat_mean,ps_car_13_ps_car_04_cat_median,ps_car_13_ps_car_05_cat_mean,ps_car_13_ps_car_05_cat_median,ps_car_13_ps_car_06_cat_mean,ps_car_13_ps_car_06_cat_median,ps_car_13_ps_car_07_cat_mean,ps_car_13_ps_car_07_cat_median,ps_car_13_ps_car_08_cat_mean,ps_car_13_ps_car_08_cat_median,ps_car_13_ps_car_09_cat_mean,ps_car_13_ps_car_09_cat_median,ps_car_13_ps_car_11_cat_mean,ps_car_13_ps_car_11_cat_median,ps_car_13_ps_ind_02_cat_mean,ps_car_13_ps_ind_02_cat_median,ps_car_13_ps_ind_04_cat_mean,ps_car_13_ps_ind_04_cat_median,ps_car_13_ps_ind_05_cat_mean,ps_car_13_ps_ind_05_cat_median,ps_car_13_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_13_ps_reg_01_plus_ps_car_02_cat_median,ps_car_13_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_13_ps_reg_01_plus_ps_car_04_cat_median,ps_car_13_t_sin,ps_car_14,ps_car_14_ps_car_01_cat_mean,ps_car_14_ps_car_01_cat_median,ps_car_14_ps_car_02_cat_mean,ps_car_14_ps_car_02_cat_median,ps_car_14_ps_car_03_cat_mean,ps_car_14_ps_car_03_cat_median,ps_car_14_ps_car_04_cat_mean,ps_car_14_ps_car_04_cat_median,ps_car_14_ps_car_05_cat_mean,ps_car_14_ps_car_05_cat_median,ps_car_14_ps_car_06_cat_mean,ps_car_14_ps_car_06_cat_median,ps_car_14_ps_car_07_cat_mean,ps_car_14_ps_car_07_cat_median,ps_car_14_ps_car_08_cat_mean,ps_car_14_ps_car_08_cat_median,ps_car_14_ps_car_09_cat_mean,ps_car_14_ps_car_09_cat_median,ps_car_14_ps_car_11_cat_mean,ps_car_14_ps_car_11_cat_median,ps_car_14_ps_ind_02_cat_mean,ps_car_14_ps_ind_02_cat_median,ps_car_14_ps_ind_04_cat_mean,ps_car_14_ps_ind_04_cat_median,ps_car_14_ps_ind_05_cat_mean,ps_car_14_ps_ind_05_cat_median,ps_car_14_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_14_ps_reg_01_plus_ps_car_02_cat_median,ps_car_14_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_14_ps_reg_01_plus_ps_car_04_cat_median,ps_car_14_t_sin,ps_car_15,ps_car_15_ps_car_01_cat_mean,ps_car_15_ps_car_01_cat_median,ps_car_15_ps_car_02_cat_mean,ps_car_15_ps_car_02_cat_median,ps_car_15_ps_car_03_cat_mean,ps_car_15_ps_car_03_cat_median,ps_car_15_ps_car_04_cat_mean,ps_car_15_ps_car_04_cat_median,ps_car_15_ps_car_05_cat_mean,ps_car_15_ps_car_05_cat_median,ps_car_15_ps_car_06_cat_mean,ps_car_15_ps_car_06_cat_median,ps_car_15_ps_car_07_cat_mean,ps_car_15_ps_car_07_cat_median,ps_car_15_ps_car_08_cat_mean,ps_car_15_ps_car_08_cat_median,ps_car_15_ps_car_09_cat_mean,ps_car_15_ps_car_09_cat_median,ps_car_15_ps_car_11_cat_mean,ps_car_15_ps_car_11_cat_median,ps_car_15_ps_ind_02_cat_mean,ps_car_15_ps_ind_02_cat_median,ps_car_15_ps_ind_04_cat_mean,ps_car_15_ps_ind_04_cat_median,ps_car_15_ps_ind_05_cat_mean,ps_car_15_ps_ind_05_cat_median,ps_car_15_ps_reg_01_plus_ps_car_02_cat_mean,ps_car_15_ps_reg_01_plus_ps_car_02_cat_median,ps_car_15_ps_reg_01_plus_ps_car_04_cat_mean,ps_car_15_ps_reg_01_plus_ps_car_04_cat_median,ps_car_15_t_sin,ps_ind_01,ps_ind_01_t_sin,ps_ind_02_cat,ps_ind_03,ps_ind_03_t_sin,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_12_bin,ps_ind_14,ps_ind_14_t_sin,ps_ind_15,ps_ind_15_t_sin,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_01_plus_ps_car_02_cat,ps_reg_01_plus_ps_car_04_cat,ps_reg_01_t_sin,ps_reg_02,ps_reg_02_t_sin,ps_reg_03,ps_reg_03_special,ps_reg_03_t_sin,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1
1488022,1,0.841471,3,0.14112,0,1,-1,8,0,14,0,1,0,2,18,0.909297,0.565685,0.381031,0.374166,0.367579,0.374166,0.0,0.0,0.485194,0.489898,0.380714,0.374166,0.382202,0.4,0.385553,0.374166,0.379143,0.374166,0.380548,0.374166,0.530323,0.5,0.384725,0.387298,0.384675,0.387298,0.380457,0.374166,0.36608,0.374166,0.493327,0.489898,0.535995,1.528774,0.808243,0.763061,0.764339,0.738228,0.0,0.0,1.165948,1.139604,0.860292,0.803788,0.78579,0.729669,0.811303,0.771175,0.776575,0.735746,0.815924,0.763757,1.418453,1.354471,0.827063,0.774714,0.819891,0.768263,0.81531,0.767092,0.755032,0.732129,1.174202,1.160071,0.999117,0.565685,0.339593,0.384838,0.269885,0.366606,0.0,0.0,0.351172,0.441022,0.240227,0.348999,0.247344,0.368511,0.297747,0.378021,0.301606,0.370405,0.309998,0.380789,0.181428,0.565685,0.280086,0.37081,0.278452,0.37081,0.277754,0.368782,0.270355,0.366606,0.373008,0.467974,0.535995,3.605551,2.998644,3.162278,3.043182,3.316625,0.0,0.0,3.252571,3.464102,3.266927,3.464102,3.002376,3.162278,3.066317,3.316625,2.946068,3.162278,3.067005,3.316625,3.644757,3.605551,3.068627,3.316625,3.042927,3.316625,3.070936,3.316625,3.011719,3.316625,3.211989,3.464102,-0.447492,0,0.001,1,6,-0.279415,0,0,0,1,0,0,0,0,0.001,2,0.909297,0,0,1,0.5,15,58,0.479426,0.3,0.29552,1.048809,17.6,0.86683,0.196913
1488023,3,0.14112,4,-0.756802,8,1,-1,0,0,10,1,0,0,2,83,0.909297,0.424264,0.393072,0.39975,0.367579,0.374166,0.0,0.0,0.367074,0.374166,0.380714,0.374166,0.402979,0.387298,0.378394,0.374166,0.383997,0.387298,0.380548,0.374166,0.42013,0.424264,0.372636,0.374166,0.373359,0.374166,0.380457,0.374166,0.366934,0.374166,0.366236,0.374166,0.41165,1.040567,0.86936,0.797199,0.764339,0.738228,0.0,0.0,0.761713,0.740728,0.860292,0.803788,0.945906,0.931378,0.804274,0.76438,0.997587,0.927852,0.815924,0.763757,0.881209,0.896296,0.797841,0.753545,0.804825,0.763153,0.81531,0.767092,0.764995,0.738087,0.759231,0.739571,0.862691,0.418569,0.313976,0.384838,0.269885,0.366606,0.0,0.0,0.270248,0.366606,0.240227,0.348999,0.384701,0.382099,0.277215,0.368782,0.151481,0.353553,0.309998,0.380789,0.392648,0.397492,0.273246,0.366606,0.273619,0.366742,0.277754,0.368782,0.275028,0.366742,0.275325,0.366742,0.406453,3.741657,3.11182,3.316625,3.043182,3.316625,0.0,0.0,3.027113,3.162278,3.266927,3.464102,3.059024,3.162278,3.06357,3.316625,3.675138,3.741657,3.067005,3.316625,2.646256,2.828427,3.065812,3.316625,3.103625,3.316625,3.070936,3.316625,3.044841,3.316625,3.024498,3.162278,-0.564696,5,-0.958924,3,5,-0.958924,1,0,0,0,1,0,0,0,0.001,11,-0.99999,1,0,0,0.7,19,70,0.644218,1.0,0.841471,1.246495,24.86,0.947874,0.074922
1488024,1,0.841471,2,0.909297,11,1,-1,0,-1,11,1,0,1,2,17,0.909297,0.374166,0.388854,0.4,0.367579,0.374166,0.0,0.0,0.367074,0.374166,0.0,0.0,0.374242,0.374166,0.378394,0.374166,0.383997,0.387298,0.412411,0.4,0.392533,0.374166,0.384725,0.387298,0.384675,0.387298,0.380457,0.374166,0.366524,0.374166,0.364602,0.374166,0.365496,0.968992,0.859282,0.807429,0.764339,0.738228,0.0,0.0,0.761713,0.740728,0.0,0.0,0.757179,0.71566,0.804274,0.76438,0.997587,0.927852,0.96121,0.89961,0.93407,0.930709,0.827063,0.774714,0.819891,0.768263,0.81531,0.767092,0.759738,0.733073,0.752017,0.733665,0.824316,0.390384,0.262476,0.368511,0.269885,0.366606,0.0,0.0,0.270248,0.366606,0.0,0.0,0.309842,0.368782,0.277215,0.368782,0.151481,0.353553,0.252645,0.381445,-0.124505,0.327567,0.280086,0.37081,0.278452,0.37081,0.277754,0.368782,0.269989,0.366606,0.267982,0.366333,0.380544,3.741657,3.159415,3.316625,3.043182,3.316625,0.0,0.0,3.027113,3.162278,0.0,0.0,2.965958,3.162278,3.06357,3.316625,3.675138,3.741657,3.281189,3.316625,3.594526,3.605551,3.068627,3.316625,3.042927,3.316625,3.070936,3.316625,3.02305,3.316625,3.007434,3.162278,-0.564696,0,0.001,1,5,-0.958924,0,0,1,0,0,0,0,0,0.001,5,-0.958924,0,0,1,0.4,12,40,0.389418,0.0,0.001,0.609303,5.94,0.572296,0.076741
1488025,1,0.841471,1,0.841471,10,0,-1,0,1,10,1,1,0,3,28,0.14112,0.387298,0.392051,0.4,0.440368,0.424264,0.0,0.0,0.367074,0.374166,0.390556,0.39975,0.402979,0.387298,0.378394,0.374166,0.379143,0.374166,0.380548,0.374166,0.377455,0.374166,0.384725,0.387298,0.373359,0.374166,0.380457,0.374166,0.438961,0.424264,0.367109,0.374166,0.377688,0.725125,0.866685,0.80308,1.054239,1.008395,0.0,0.0,0.761713,0.740728,0.833353,0.770357,0.945906,0.931378,0.804274,0.76438,0.776575,0.735746,0.815924,0.763757,0.850205,0.858706,0.827063,0.774714,0.804825,0.763153,0.81531,0.767092,1.040088,1.003074,0.761197,0.740751,0.663229,0.378153,0.31213,0.386005,0.308388,0.398623,0.0,0.0,0.270248,0.366606,0.252886,0.359722,0.384701,0.382099,0.277215,0.368782,0.301606,0.370405,0.309998,0.380789,0.366169,0.378814,0.280086,0.37081,0.273619,0.366742,0.277754,0.368782,0.3084,0.4,0.272798,0.366742,0.369205,2.645751,3.137541,3.316625,3.191033,3.316625,0.0,0.0,3.027113,3.162278,2.932911,3.162278,3.059024,3.162278,3.06357,3.316625,2.946068,3.162278,3.067005,3.316625,2.990482,3.0,3.068627,3.316625,3.103625,3.316625,3.070936,3.316625,3.185538,3.316625,3.028108,3.162278,0.475772,6,-0.279415,1,5,-0.958924,1,0,0,0,0,1,0,0,0.001,13,0.420167,1,0,0,0.6,16,60,0.564642,0.6,0.564642,0.920937,13.57,0.796169,0.052429
1488026,4,-0.756802,4,-0.756802,9,1,1,0,0,4,1,1,2,1,48,0.841471,0.316228,0.391679,0.399875,0.367579,0.374166,0.385374,0.387298,0.367074,0.374166,0.380714,0.374166,0.367057,0.4,0.378394,0.374166,0.379143,0.374166,0.377153,0.374166,0.316193,0.316228,0.384725,0.387298,0.373359,0.374166,0.380457,0.374166,0.366938,0.374166,0.369653,0.374166,0.310984,0.637175,0.870332,0.812833,0.764339,0.738228,0.908334,0.853045,0.761713,0.740728,0.860292,0.803788,0.771874,0.744803,0.804274,0.76438,0.776575,0.735746,0.806974,0.7647,0.641991,0.637078,0.827063,0.774714,0.804825,0.763153,0.81531,0.767092,0.758541,0.736756,0.76771,0.744928,0.594927,0.296648,0.21735,0.329242,0.269885,0.366606,0.184433,0.324191,0.270248,0.366606,0.240227,0.348999,0.343694,0.37081,0.277215,0.368782,0.301606,0.370405,0.262818,0.362629,0.331089,0.374833,0.280086,0.37081,0.273619,0.366742,0.277754,0.368782,0.273534,0.366742,0.277791,0.368511,0.292316,3.316625,3.146919,3.316625,3.043182,3.316625,3.361617,3.605551,3.027113,3.162278,3.266927,3.464102,3.181228,3.316625,3.06357,3.316625,2.946068,3.162278,3.092783,3.316625,2.966689,3.162278,3.068627,3.316625,3.103625,3.316625,3.070936,3.316625,3.02939,3.316625,3.016759,3.162278,-0.17414,7,0.656987,1,4,-0.756802,1,0,0,0,0,1,0,0,0.001,12,-0.536573,1,0,0,0.9,24,90,0.783327,0.8,0.717356,0.992157,15.75,0.837207,0.060762


In [None]:
#best
ps_car_13                          :     0.1226
ps_reg_03                          :     0.0891
ps_ind_03                          :     0.0610
ps_ind_15                          :     0.0485
ps_car_14                          :     0.0484
ps_ind_01                          :     0.0360
ps_reg_02                          :     0.0357
ps_car_11_cat_avg                  :     0.0340
ps_reg_01_plus_ps_car_04_cat_avg   :     0.0333
ps_ind_05_cat_avg                  :     0.0288
ps_car_11_cat                      :     0.0266
ps_car_01_cat_avg                  :     0.0257
ps_car_15                          :     0.0256
ps_reg_01_plus_ps_car_02_cat_avg   :     0.0251
ps_ind_17_bin                      :     0.0226
ps_car_12                          :     0.0206
ps_reg_01_plus_ps_car_04_cat       :     0.0193
ps_calc_05                         :     0.0187
ps_calc_09                         :     0.0185
ps_car_09_cat_avg                  :     0.0184
ps_car_06_cat                      :     0.0170
ps_car_01_cat                      :     0.0161
ps_car_06_cat_avg                  :     0.0152
ps_car_07_cat                      :     0.0148
ps_ind_02_cat_avg                  :     0.0135
ps_ind_05_cat                      :     0.0123
ps_car_03_cat                      :     0.0123
ps_car_11                          :     0.0108
ps_reg_01_plus_ps_car_02_cat       :     0.0108
ps_ind_02_cat                      :     0.0107

In [None]:
l1 = [
    col_1 
col_0 
col_3
col_4 
col_6
col_14 
col_1846 
col_5 
col_1844
col_10 
col_13  
col_1845  
col_1832 
col_34   
col_1833
col_26 
]

In [46]:
f_ind_reg = [f for f in trn_df.columns if "_cat" not in f and '_bin' not in f and '_sin' not in f]

In [47]:
f_ind_reg

['ps_car_13',
 'ps_reg_03',
 'ps_ind_03',
 'ps_ind_15',
 'ps_reg_02',
 'ps_car_14',
 'ps_car_12',
 'ps_reg_01',
 'ps_car_15',
 'ps_ind_01',
 'ps_car_11',
 'ps_calc_09',
 'ps_calc_05',
 'ps_ind_14',
 'ps_reg_03_special']

In [48]:
for f in f_ind_reg:
    print(f)
    print(f, np.min(trn_df[f].values), np.max(trn_df[f]))
    print(f, np.min(sub_df[f].values), np.max(sub_df[f]))
    print('\n')

ps_car_13
ps_car_13 0.2506190682 3.7206260026
ps_car_13 0.2757783875 4.0313005715


ps_reg_03
ps_reg_03 -1.0 4.0379450219
ps_reg_03 -1.0 4.4235167005


ps_ind_03
ps_ind_03 0 11
ps_ind_03 0 11


ps_ind_15
ps_ind_15 0 13
ps_ind_15 0 13


ps_reg_02
ps_reg_02 0.0 1.8
ps_reg_02 0.0 1.8


ps_car_14
ps_car_14 -1.0 0.6363961031
ps_car_14 -1.0 0.6363961031


ps_car_12
ps_car_12 -1.0 1.2649110641
ps_car_12 0.1414213562 1.2649110641


ps_reg_01
ps_reg_01 0.0 0.9
ps_reg_01 0.0 0.9


ps_car_15
ps_car_15 0.0 3.7416573868
ps_car_15 0.0 3.7416573868


ps_ind_01
ps_ind_01 0 7
ps_ind_01 0 7


ps_car_11
ps_car_11 -1 3
ps_car_11 -1 3


ps_calc_09
ps_calc_09 0 7
ps_calc_09 0 7


ps_calc_05
ps_calc_05 0 6
ps_calc_05 0 6


ps_ind_14
ps_ind_14 0 4
ps_ind_14 0 4


ps_reg_03_special
ps_reg_03_special 0.0600000000596 260.879999998
ps_reg_03_special 0.0600000000596 313.079999994




In [None]:
use_delta = []
use_delta = [1 if int(np.min(f) = 0]

In [None]:
ps_car_12 = cc,
ps_car_13 = vehicle value,
ps_car_14 = vehicle weight Kg
ps_car_15 = manufacture year

(4*ps_reg_03)^2 maps to 2dp resolution
(4*ps_reg_03)^2

Column "ps_car_15" contains below unique values, 
which are square roots of integer 0 ~ 14. Is it devised deliberately? It looks non-sense.


In [None]:
for f in f_cats:
    if f not in combos:
        print(f, np.unique(trn_df[f].values))

In [None]:
# very serious - 7 points

# serious - 5 points

# average - 4 points

# light - 3 points

# i don't know if this is important or if this exists in this competition

# another detail, there are 2 types of licenses,
# provisional (usually the first license) and permanent. the time of these licenses is 1 year, 
# and 10 years (if i'm not wrong)

In [None]:
# dat["cont1"] = sqrt(dat["cont1"])
# dat$cont2 = asin(sqrt(dat$cont2))
# dat$cont3 = exp(dat$cont3)
# dat["cont4"] = 1/sqrt(dat["cont4"])
# dat["cont5"] = 1/sqrt(dat["cont5"])
# #D$cont8 = log10(D$cont8)
# dat["cont10"] = Math.cbrt(dat["cont10"])
# dat["cont11"] = 1/sqrt(dat["cont11"])
# dat["cont12"] = 1/exp(dat["cont12"])
# dat["cont6"] = sqrt(dat["cont6"])
# #D["cont7"] = exp(D["cont7"])
# dat$cont7 = 3 ^ dat$cont7
# dat["cont9"] = sin(dat["cont9"])
# dat$cont13 = asin(sqrt(dat$cont13))
# dat$cont14 = 5 ^ dat$cont14


In [None]:
transformations = ['sq', 'sqrt', 'exp', 'div_sqrt', 'cbrt', 'pow_3', 'pow_5', 'sin', 'log']

In [None]:
int(0.2)