Attempt to reproduce https://www.kaggle.com/rsakata/bnp-paribas-cardif-claims-management/xgboost-with-combination-of-factors/code and translate it to python

In [1]:
from collections import Counter
from itertools import combinations

import pandas as pd
import numpy as np

from natsort import natsorted
from tqdm import tqdm

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit, StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import log_loss

import xgboost as xgb

In [3]:
df_train = pd.read_csv("../input/train.csv")
df_train['source'] = 'TRAIN'

cv = StratifiedShuffleSplit(df_train.target, n_iter=1, test_size=0.10, random_state=1)
_, validation = next(cv.__iter__())
df_train.loc[validation, 'source'] = 'VAL'

df_test = pd.read_csv("../input/test.csv")
df_test['source'] = 'TEST'
test_id = df_test.ID

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=1)
df_all = df_all[['ID', 'target', 'source', 
                 'v10', 'v12', 'v14', 'v21', 'v22', 'v24', 'v30', 'v31', 'v34', 
                 'v38', 'v40', 'v47', 'v50', 'v52', 'v56', 'v62', 'v66', 'v72', 'v75', 
                 'v79', 'v91', 'v112', 'v113', 'v114', 'v129']]
del df_train, df_test

In [4]:
all_columns = natsorted(set(df_all.columns) - {'ID', 'target', 'source'})

categorical = [col for col in all_columns if df_all[col].dtype == 'O']
numerical   = [col for col in all_columns if df_all[col].dtype != 'O']

In [5]:
combination_featuers = []

combs = list(combinations(categorical, 2))
for v1, v2 in tqdm(combs):
    name = '%s_%s' % (v1, v2)
    df_all[name] = \
            df_all[v1].fillna('*') + '_' + df_all[v2].fillna('*')
    combination_featuers.append(name)

combs = list(combinations(set(categorical) - {'v22'}, 2))
for v1, v2 in tqdm(combs):
    name = 'v22_%s_%s' % (v1, v2)
    df_all[name] = \
            df_all['v22'].fillna('*') + '_' + df_all[v1].fillna('*') + '_' + df_all[v2].fillna('*')
    combination_featuers.append(name)

combs = list(combinations(set(categorical) - {'v22'}, len(categorical) - 3))
for f_10 in tqdm(combs):
    f_10 = list(f_10)
    name = 'v22_' + '_'.join(f_10)
    df_all[name] = df_all[['v22'] + f_10].fillna('').sum(axis=1)
    combination_featuers.append(name)



In [6]:
df_all.loc[:, categorical] = df_all.loc[:, categorical].fillna('')

In [7]:
def target_mean(data, columns, train_mask=None, inplace=False, target_column='target', id_column='ID',
               verbose=True):
    if train_mask is None:
        train_mask = data.source == 'TRAIN'

    df_train = data[train_mask]
    df_test  = data[~train_mask]
    y = df_train[target_column]

    if inplace:
        result = data
    else:
        result = pd.DataFrame({id_column: data[id_column]})

    cv = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=100)

    for col in columns:
        if verbose:
            print 'processing %s...' % col

        col_result = np.zeros(len(df_train))

        for train, test in cv:
            means = pd.groupby(y.iloc[train], by=df_train.iloc[train][col]).mean().to_dict()
            col_result[test] = df_train.iloc[test][col].apply(means.get).values
    
        result.loc[train_mask, col] = col_result

        means = pd.groupby(y, by=df_train[col]).mean().to_dict()
        result.loc[~train_mask, col] = result.loc[~train_mask, col].apply(means.get).values

    if not inplace:
        return result

In [8]:
target_mean(df_all, categorical + combination_featuers, train_mask=(df_all.source == 'TRAIN'), inplace=1)

processing v22...
processing v24...
processing v30...
processing v31...
processing v47...
processing v52...
processing v56...
processing v66...
processing v75...
processing v79...
processing v91...
processing v112...
processing v113...
processing v22_v24...
processing v22_v30...
processing v22_v31...
processing v22_v47...
processing v22_v52...
processing v22_v56...
processing v22_v66...
processing v22_v75...
processing v22_v79...
processing v22_v91...
processing v22_v112...
processing v22_v113...
processing v24_v30...
processing v24_v31...
processing v24_v47...
processing v24_v52...
processing v24_v56...
processing v24_v66...
processing v24_v75...
processing v24_v79...
processing v24_v91...
processing v24_v112...
processing v24_v113...
processing v30_v31...
processing v30_v47...
processing v30_v52...
processing v30_v56...
processing v30_v66...
processing v30_v75...
processing v30_v79...
processing v30_v91...
processing v30_v112...
processing v30_v113...
processing v31_v47...
processing

In [9]:
features = numerical + categorical + combination_featuers

X = df_all.loc[df_all.source == 'TRAIN', features].values
y = df_all.target[df_all.source == 'TRAIN'].values

X_val = df_all.loc[df_all.source == 'VAL', features].values
y_val = df_all.target[df_all.source == 'VAL'].values

X_test = df_all.loc[df_all.source == 'TEST', features].values

In [10]:
X.shape

(102888, 235)

## XGB

In [11]:
dtrain = xgb.DMatrix(X, label=y, feature_names=features, missing=np.nan)
dvalid = xgb.DMatrix(X_val, label=y_val, feature_names=features, missing=np.nan)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [14]:
results = []

In [15]:
metric = 'logloss'
early_stopping_rounds = 50
n_estimators = 1500

xgb_pars = {
    #'estimators': 1140
    'eta': 0.05, 
    'max_depth': 6,
    'colsample_bytree': 0.45,
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',
    'nthread': 8,
    'seed': 42
}

In [16]:
xgb_model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, evals=watchlist, verbose_eval=25,
                      early_stopping_rounds=early_stopping_rounds)

Will train until eval error hasn't decreased in 50 rounds.
[0]	train-logloss:0.674656	eval-logloss:0.674524
[25]	train-logloss:0.484674	eval-logloss:0.484398
[50]	train-logloss:0.448620	eval-logloss:0.451338
[75]	train-logloss:0.436006	eval-logloss:0.442476
[100]	train-logloss:0.428403	eval-logloss:0.439326
[125]	train-logloss:0.423133	eval-logloss:0.437739
[150]	train-logloss:0.418449	eval-logloss:0.436589
[175]	train-logloss:0.414488	eval-logloss:0.435872
[200]	train-logloss:0.410202	eval-logloss:0.435557
[225]	train-logloss:0.406535	eval-logloss:0.435370
[250]	train-logloss:0.403134	eval-logloss:0.435210
[275]	train-logloss:0.400382	eval-logloss:0.435022
[300]	train-logloss:0.396688	eval-logloss:0.434940
[325]	train-logloss:0.394130	eval-logloss:0.434873
[350]	train-logloss:0.391053	eval-logloss:0.435014
Stopping. Best iteration:
[320]	train-logloss:0.394449	eval-logloss:0.434849



In [18]:
X_test = df_all.loc[df_all.source == 'TEST', features].values
dtest = xgb.DMatrix(X_test, feature_names=features, missing=np.nan)
y_score = xgb_model.predict(dtest, ntree_limit=320)

In [19]:
result = pd.DataFrame({'ID': test_id, 'PredictedProb': y_score})
result.to_csv('xgb.csv', index=False)

Private leaderboard score: 0.43583