In [1]:
import numpy as np
import pandas as pd
# from xgboost import XGBClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc

In [2]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [3]:
# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


In [4]:
def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_val_series.index = val_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)


In [5]:
# Read data
train_df = pd.read_csv('../data/train.csv', na_values="-1") # .iloc[0:200,:]
test_df = pd.read_csv('../data/test.csv', na_values="-1")

In [6]:
train_df.shape

(595212, 59)

In [7]:
# from olivier
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]


In [8]:
# Process data
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

In [9]:
df = pd.DataFrame()
df['id'] = test_df['id'].values
df.to_csv('../cache/test_id.csv', index=False)

In [10]:
df = pd.DataFrame()
df['id'] = train_df['id'].values
df.to_csv('../cache/train_id.csv', index=False)

In [11]:
df = pd.DataFrame()
df['y'] = train_df['target'].values
df.to_csv('../cache/train_labels.csv', index=False)

In [12]:
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    train_df[name1] = train_df[f1].apply(lambda x: str(x)) + "_" + train_df[f2].apply(lambda x: str(x))
    test_df[name1] = test_df[f1].apply(lambda x: str(x)) + "_" + test_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    train_features.append(name1)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

In [13]:
train_features

['ps_car_13',
 'ps_reg_03',
 'ps_ind_05_cat',
 'ps_ind_03',
 'ps_ind_15',
 'ps_reg_02',
 'ps_car_14',
 'ps_car_12',
 'ps_car_01_cat',
 'ps_car_07_cat',
 'ps_ind_17_bin',
 'ps_car_03_cat',
 'ps_reg_01',
 'ps_car_15',
 'ps_ind_01',
 'ps_ind_16_bin',
 'ps_ind_07_bin',
 'ps_car_06_cat',
 'ps_car_04_cat',
 'ps_ind_06_bin',
 'ps_car_09_cat',
 'ps_car_02_cat',
 'ps_ind_02_cat',
 'ps_car_11',
 'ps_car_05_cat',
 'ps_calc_09',
 'ps_calc_05',
 'ps_ind_08_bin',
 'ps_car_08_cat',
 'ps_ind_09_bin',
 'ps_ind_04_cat',
 'ps_ind_18_bin',
 'ps_ind_12_bin',
 'ps_ind_14',
 'ps_reg_01_plus_ps_car_02_cat',
 'ps_reg_01_plus_ps_car_04_cat']

In [14]:
len(train_features)

36

In [15]:
X = train_df[train_features]

In [16]:
X.shape

(595212, 36)

In [17]:
X['id'] = id_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
X['target'] = train_df['target'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
X.head()

Unnamed: 0,ps_car_13,ps_reg_03,ps_ind_05_cat,ps_ind_03,ps_ind_15,ps_reg_02,ps_car_14,ps_car_12,ps_car_01_cat,ps_car_07_cat,...,ps_car_08_cat,ps_ind_09_bin,ps_ind_04_cat,ps_ind_18_bin,ps_ind_12_bin,ps_ind_14,ps_reg_01_plus_ps_car_02_cat,ps_reg_01_plus_ps_car_04_cat,id,target
0,0.883679,0.71807,0.0,5,11,0.2,0.37081,0.4,10.0,1.0,...,0,0,1.0,0,0,0,19,70,7,0
1,0.618817,0.766078,0.0,7,3,0.4,0.388716,0.316228,11.0,1.0,...,1,0,0.0,1,0,0,21,80,9,0
2,0.641586,,0.0,9,12,0.0,0.347275,0.316228,7.0,1.0,...,1,0,1.0,0,0,0,1,0,13,0
3,0.542949,0.580948,0.0,2,8,0.2,0.294958,0.374166,7.0,1.0,...,1,0,0.0,0,0,0,23,90,16,0
4,0.565832,0.840759,0.0,0,9,0.6,0.365103,0.31607,11.0,1.0,...,1,0,1.0,0,0,0,19,70,17,0


In [20]:
f_cats = [f for f in X.columns if '_cat' in f]

In [21]:
# for f in f_cats:
#     X[f] = X[f].astype('int')

In [22]:
X.target.value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [23]:
X.to_csv('../cache/train_X.csv', index=False)

In [24]:
test_df = test_df[train_features]

In [25]:
test_df.to_csv('../cache/test_Y.csv', index=False)

In [26]:
f_cats = [f for f in X.columns if '_cat' in f]

In [27]:
f_cats

['ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_07_cat',
 'ps_car_03_cat',
 'ps_car_06_cat',
 'ps_car_04_cat',
 'ps_car_09_cat',
 'ps_car_02_cat',
 'ps_ind_02_cat',
 'ps_car_05_cat',
 'ps_car_08_cat',
 'ps_ind_04_cat',
 'ps_reg_01_plus_ps_car_02_cat',
 'ps_reg_01_plus_ps_car_04_cat']

In [28]:
f_bin = [f for f in X.columns if '_bin' in f]

In [29]:
f_num = [f for f in X.columns if '_cat' not in f]

In [30]:
f_num

['ps_car_13',
 'ps_reg_03',
 'ps_ind_03',
 'ps_ind_15',
 'ps_reg_02',
 'ps_car_14',
 'ps_car_12',
 'ps_ind_17_bin',
 'ps_reg_01',
 'ps_car_15',
 'ps_ind_01',
 'ps_ind_16_bin',
 'ps_ind_07_bin',
 'ps_ind_06_bin',
 'ps_car_11',
 'ps_calc_09',
 'ps_calc_05',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_18_bin',
 'ps_ind_12_bin',
 'ps_ind_14',
 'id',
 'target']

In [31]:
X.columns

Index(['ps_car_13', 'ps_reg_03', 'ps_ind_05_cat', 'ps_ind_03', 'ps_ind_15',
       'ps_reg_02', 'ps_car_14', 'ps_car_12', 'ps_car_01_cat', 'ps_car_07_cat',
       'ps_ind_17_bin', 'ps_car_03_cat', 'ps_reg_01', 'ps_car_15', 'ps_ind_01',
       'ps_ind_16_bin', 'ps_ind_07_bin', 'ps_car_06_cat', 'ps_car_04_cat',
       'ps_ind_06_bin', 'ps_car_09_cat', 'ps_car_02_cat', 'ps_ind_02_cat',
       'ps_car_11', 'ps_car_05_cat', 'ps_calc_09', 'ps_calc_05',
       'ps_ind_08_bin', 'ps_car_08_cat', 'ps_ind_09_bin', 'ps_ind_04_cat',
       'ps_ind_18_bin', 'ps_ind_12_bin', 'ps_ind_14',
       'ps_reg_01_plus_ps_car_02_cat', 'ps_reg_01_plus_ps_car_04_cat', 'id',
       'target'],
      dtype='object')

In [32]:
f_cats1 = [f for f in train_df.columns if '_cat' in f]

In [33]:
f_cats1

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat',
 'ps_reg_01_plus_ps_car_02_cat',
 'ps_reg_01_plus_ps_car_04_cat']

In [None]:
#!/usr/bin/env python3
# from criteo
# import subprocess, sys, os, time

# NR_THREAD = 1

# start = time.time()

# python count.py --src_csv_path ../cache/train_X.csv --dest_csv_path ../cache/fc.trva.t10.txt

# cmd = './utils/count.py tr.csv > fc.trva.t10.txt'
# subprocess.call(cmd, shell=True) 

# cmd = 'converters/parallelizer-a.py -s {nr_thread} converters/pre-a.py tr.csv tr.gbdt.dense tr.gbdt.sparse'.format(nr_thread=NR_THREAD)
# subprocess.call(cmd, shell=True) 

# cmd = 'converters/parallelizer-a.py -s {nr_thread} converters/pre-a.py te.csv te.gbdt.dense te.gbdt.sparse'.format(nr_thread=NR_THREAD)
# subprocess.call(cmd, shell=True) 

# cmd = './gbdt -t 30 -s {nr_thread} te.gbdt.dense te.gbdt.sparse tr.gbdt.dense tr.gbdt.sparse te.gbdt.out tr.gbdt.out'.format(nr_thread=NR_THREAD) 
# subprocess.call(cmd, shell=True)

# cmd = 'rm -f te.gbdt.dense te.gbdt.sparse tr.gbdt.dense tr.gbdt.sparse'
# subprocess.call(cmd, shell=True)

# cmd = 'converters/parallelizer-b.py -s {nr_thread} converters/pre-b.py tr.csv tr.gbdt.out tr.ffm'.format(nr_thread=NR_THREAD)
# subprocess.call(cmd, shell=True) 

# cmd = 'converters/parallelizer-b.py -s {nr_thread} converters/pre-b.py te.csv te.gbdt.out te.ffm'.format(nr_thread=NR_THREAD)
# subprocess.call(cmd, shell=True) 

# cmd = 'rm -f te.gbdt.out tr.gbdt.out'
# subprocess.call(cmd, shell=True) 

# cmd = './ffm-train -k 4 -t 18 -s {nr_thread} -p te.ffm tr.ffm model'.format(nr_thread=NR_THREAD) 
# subprocess.call(cmd, shell=True)

# cmd = './ffm-predict te.ffm model te.out'.format(nr_thread=NR_THREAD) 
# subprocess.call(cmd, shell=True)

# cmd = './utils/calibrate.py te.out te.out.cal'.format(nr_thread=NR_THREAD) 
# subprocess.call(cmd, shell=True)

# cmd = './utils/make_submission.py te.out.cal submission.csv'.format(nr_thread=NR_THREAD) 
# subprocess.call(cmd, shell=True)

# print('time used = {0:.0f}'.format(time.time()-start))

In [None]:
# do this after running count.py

In [40]:
t10 = pd.read_csv('../cache/fc.trva.t10.txt')

In [41]:
t10.head()

Unnamed: 0,Field,Value,Neg,Pos,Total,Ratio
0,ps_reg_01_plus_ps_car_04_cat,14.0,10,0,10,0.0
1,ps_reg_01_plus_ps_car_04_cat,15.0,8,2,10,0.2
2,ps_reg_01_plus_ps_car_04_cat,27.0,9,1,10,0.1
3,ps_reg_01_plus_ps_car_04_cat,67.0,11,0,11,0.0
4,ps_reg_01_plus_ps_car_04_cat,57.0,9,2,11,0.18182


In [44]:
tFreq = t10[t10.Total > 60000 ]

In [45]:
tFreq

Unnamed: 0,Field,Value,Neg,Pos,Total,Ratio
163,ps_car_01_cat,6.0,60536,1857,62393,0.02976
164,ps_car_03_cat,0.0,70375,2897,73272,0.03954
165,ps_car_08_cat,0.0,95457,4491,99948,0.04493
166,ps_car_02_cat,0.0,96206,5011,101217,0.04951
167,ps_car_06_cat,0.0,106735,3685,110420,0.03337
168,ps_car_03_cat,1.0,105362,5347,110709,0.0483
169,ps_car_06_cat,1.0,114339,4047,118386,0.03418
170,ps_ind_02_cat,2.0,118860,4713,123573,0.03814
171,ps_car_06_cat,11.0,127355,4172,131527,0.03172
172,ps_reg_01_plus_ps_car_02_cat,23.0,147186,5848,153034,0.03821


In [47]:
tFreq.Value.fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [48]:
tFreq

Unnamed: 0,Field,Value,Neg,Pos,Total,Ratio
163,ps_car_01_cat,6.0,60536,1857,62393,0.02976
164,ps_car_03_cat,0.0,70375,2897,73272,0.03954
165,ps_car_08_cat,0.0,95457,4491,99948,0.04493
166,ps_car_02_cat,0.0,96206,5011,101217,0.04951
167,ps_car_06_cat,0.0,106735,3685,110420,0.03337
168,ps_car_03_cat,1.0,105362,5347,110709,0.0483
169,ps_car_06_cat,1.0,114339,4047,118386,0.03418
170,ps_ind_02_cat,2.0,118860,4713,123573,0.03814
171,ps_car_06_cat,11.0,127355,4172,131527,0.03172
172,ps_reg_01_plus_ps_car_02_cat,23.0,147186,5848,153034,0.03821


In [53]:
tFreq['FV'] = tFreq['Field'].astype('str') + '-' + tFreq['Value'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
tFreq['FV'].values

array(['ps_car_01_cat-6.0', 'ps_car_03_cat-0.0', 'ps_car_08_cat-0.0',
       'ps_car_02_cat-0.0', 'ps_car_06_cat-0.0', 'ps_car_03_cat-1.0',
       'ps_car_06_cat-1.0', 'ps_ind_02_cat-2.0', 'ps_car_06_cat-11.0',
       'ps_reg_01_plus_ps_car_02_cat-23.0', 'ps_car_05_cat-0.0',
       'ps_reg_01_plus_ps_car_04_cat-90.0', 'ps_car_05_cat-1.0',
       'ps_car_01_cat-7.0', 'ps_car_09_cat-0.0', 'ps_car_01_cat-11.0',
       'ps_ind_04_cat-1.0', 'ps_car_05_cat-', 'ps_ind_04_cat-0.0',
       'ps_car_09_cat-2.0', 'ps_car_03_cat-', 'ps_ind_02_cat-1.0',
       'ps_car_02_cat-1.0', 'ps_car_08_cat-1.0', 'ps_car_04_cat-0.0',
       'ps_ind_05_cat-0.0', 'ps_car_07_cat-1.0'], dtype=object)

In [56]:
tFreq['FV'] = tFreq['FV'].map(lambda x: x.split('.')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [57]:
tFreq['FV'].values

array(['ps_car_01_cat-6', 'ps_car_03_cat-0', 'ps_car_08_cat-0',
       'ps_car_02_cat-0', 'ps_car_06_cat-0', 'ps_car_03_cat-1',
       'ps_car_06_cat-1', 'ps_ind_02_cat-2', 'ps_car_06_cat-11',
       'ps_reg_01_plus_ps_car_02_cat-23', 'ps_car_05_cat-0',
       'ps_reg_01_plus_ps_car_04_cat-90', 'ps_car_05_cat-1',
       'ps_car_01_cat-7', 'ps_car_09_cat-0', 'ps_car_01_cat-11',
       'ps_ind_04_cat-1', 'ps_car_05_cat-', 'ps_ind_04_cat-0',
       'ps_car_09_cat-2', 'ps_car_03_cat-', 'ps_ind_02_cat-1',
       'ps_car_02_cat-1', 'ps_car_08_cat-1', 'ps_car_04_cat-0',
       'ps_ind_05_cat-0', 'ps_car_07_cat-1'], dtype=object)

In [1]:
(0.252099 +
0.250353 + 
0.276489 + 
0.261354 + 
0.237295)/5. 

0.255518