In [2]:
#https://www.kaggle.com/aharless/xgboost-cv-lb-284

In [3]:
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50  
# Note: I set EARLY_STOPPING_ROUNDS high so that (when OPTIMIZE_ROUNDS is set)
#       I will get lots of information to make my own judgment.  You should probably
#       reduce EARLY_STOPPING_ROUNDS if you want to do actual early stopping.

In [46]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
from datetime import datetime

In [55]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [65]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
     assert( len(actual) == len(pred) )
     all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
     all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
     totalLosses = all[:,0].sum()
     giniSum = all[:,0].cumsum().sum() / totalLosses
 
     giniSum -= (len(actual) + 1) / 2.
     return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [66]:
# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

def gini_xgb1(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_val_series.index = val_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)

In [7]:
# Read data
train_df = pd.read_csv('../data/train.csv', na_values="-1") # .iloc[0:200,:]
test_df = pd.read_csv('../data/test.csv', na_values="-1")

In [8]:
# from olivier
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]





In [9]:
# Process data
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']
Y = train_df['target'].values


In [10]:
start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    train_df[name1] = train_df[f1].apply(lambda x: str(x)) + "_" + train_df[f2].apply(lambda x: str(x))
    test_df[name1] = test_df[f1].apply(lambda x: str(x)) + "_" + test_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    train_features.append(name1)
    


current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [11]:
X = train_df[train_features]
test_df = test_df[train_features]

In [12]:
f_cats = [f for f in X.columns if "_cat" in f]

In [13]:
y_valid_pred = 0*y
y_test_pred = 0

In [14]:
# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(0)

In [15]:
sp = sum(Y == 0)/sum(Y==1)

In [16]:
print(sp)

26.4367106112


In [17]:
#Set up classifier

model = XGBClassifier(    
    n_estimators=MAX_ROUNDS,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=LEARNING_RATE, 
    subsample=.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.26,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
    )

In [18]:
# X.fillna(-1, inplace=True)
# test_df.fillna(-1, inplace=True)

In [19]:
train_df.shape

(595212, 61)

In [20]:
X.shape

(595212, 36)

In [21]:
# Run CV

for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
    # Enocode data
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train



Fold  0
  Gini =  0.28406574509604676

Fold  1
  Gini =  0.2806541157466428

Fold  2
  Gini =  0.2757547963716266

Fold  3
  Gini =  0.3000107882690157

Fold  4
  Gini =  0.2827013037567464


In [24]:
y_test_pred /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)
# org: 0.2844175528242565, LB: 0.283
# org, added missing: 0.28532607696136825, LB: 0.282


Gini for full training set:


0.2844175528242565

In [25]:
# Save validation predictions for stacking/ensembling
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv('../cache/xgb_valid.csv', float_format='%.6f', index=False)

In [26]:
# Create submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_test_pred
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.xgb.{}_GMT'.format(now)
sub.to_csv(fn, float_format='%.6f', index=False)

In [27]:
print(now)

2017_11_27_14_23_17


In [28]:
X1 = pd.concat([X, test_df], axis=0)

In [29]:
X1.shape

(1488028, 36)

In [30]:
# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(0)

In [31]:
len(y)

595212

In [32]:
len(y_test_pred)

892816

In [33]:
df_y = pd.DataFrame()

In [34]:
df_y['y'] = y

In [35]:
df_y.head()

Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0


In [80]:
df_y1 = pd.DataFrame()

In [84]:
y_test_pred

array([ 0.00607441,  0.00672562,  0.00622999, ...,  0.00878617,
        0.00591863,  0.00792685], dtype=float32)

In [89]:
np.max(y_test_pred)

0.10850623

In [85]:
df_y1['y'] = [int(round(x)) for x in y_test_pred]

In [90]:
df_y1['y'].value_counts()

0    892816
Name: y, dtype: int64

In [91]:
df_ps = pd.concat([df_y, df_y1], axis=0)

In [92]:
df_ps.shape

(1488028, 1)

In [93]:
y_ps = df_ps['y']

In [94]:
y_valid_pred_ps = 0*y_ps
y_test_pred_ps = 0

In [42]:
#Set up classifier



In [43]:
Y = np.concatenate((y, y_test_pred), axis=0)

In [44]:
X_ps = np.concatenate((X, test_df), axis=0)

In [52]:
from sklearn.metrics import make_scorer
def normalized_gini(solution, submission):
    normalized_gini = eval_gini(solution, submission)/eval_gini(solution, solution)
    return normalized_gini
gini_scorer = make_scorer(normalized_gini, greater_is_better = True)

In [58]:
from sklearn.metrics import log_loss

In [70]:
X_ps.shape

(1488028, 36)

In [71]:
test_df.shape

(892816, 36)

In [73]:
denom = 0
fold = 5 #Change to 5, 1 for Kaggle Limits
for i in range(fold):
    params = {
        'eta': 0.07,
        'max_depth': 4,
        'subsample' : 0.8,
        'colsample_bytree':0.8,
        'objective': 'binary:logistic',
        'feval': gini_xgb1,
        'seed': i,
        'silent': True
    }
    x1, x2, y1, y2 = train_test_split(X_ps, Y, test_size=0.2, random_state=i)
    watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
    model = xgb.train(params, xgb.DMatrix(x1, y1), 400,  watchlist, verbose_eval=50, early_stopping_rounds=100)
    score1 = eval_gini(y2, model.predict(xgb.DMatrix(x2), 
                                                ntree_limit=model.best_ntree_limit))
    print(score1)
    #if score < 0.9:
    print(x1.shape)
    print(x2.shape)
    print(test_df.shape)
    if denom != 0:
        pred = model.predict(xgb.DMatrix(test_df), ntree_limit=model.best_ntree_limit)
        preds += pred
    else:
        pred = model.predict(xgb.DMatrix(test_df), ntree_limit=model.best_ntree_limit)
        preds = pred.copy()
    denom += 1
    


[0]	train-error:0.020056	valid-error:0.019886
[50]	train-error:0.020056	valid-error:0.019886
[100]	train-error:0.020056	valid-error:0.019886
[150]	train-error:0.020056	valid-error:0.019886
[200]	train-error:0.020055	valid-error:0.019886
[250]	train-error:0.020054	valid-error:0.019886
[300]	train-error:0.020054	valid-error:0.019886
[350]	train-error:0.020053	valid-error:0.019886
[399]	train-error:0.020052	valid-error:0.019886
0.26971251201631374
(1190422, 36)
(297606, 36)
(892816, 36)


ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35'] ['ps_car_13', 'ps_reg_03', 'ps_ind_05_cat', 'ps_ind_03', 'ps_ind_15', 'ps_reg_02', 'ps_car_14', 'ps_car_12', 'ps_car_01_cat', 'ps_car_07_cat', 'ps_ind_17_bin', 'ps_car_03_cat', 'ps_reg_01', 'ps_car_15', 'ps_ind_01', 'ps_ind_16_bin', 'ps_ind_07_bin', 'ps_car_06_cat', 'ps_car_04_cat', 'ps_ind_06_bin', 'ps_car_09_cat', 'ps_car_02_cat', 'ps_ind_02_cat', 'ps_car_11', 'ps_car_05_cat', 'ps_calc_09', 'ps_calc_05', 'ps_ind_08_bin', 'ps_car_08_cat', 'ps_ind_09_bin', 'ps_ind_04_cat', 'ps_ind_18_bin', 'ps_ind_12_bin', 'ps_ind_14', 'ps_reg_01_plus_ps_car_02_cat', 'ps_reg_01_plus_ps_car_04_cat']
expected f3, f18, f2, f27, f9, f8, f24, f1, f13, f19, f12, f0, f31, f23, f4, f15, f22, f32, f16, f14, f6, f20, f33, f34, f11, f10, f30, f25, f26, f17, f5, f21, f28, f29, f35, f7 in input data
training data did not have the following fields: ps_ind_15, ps_car_14, ps_car_01_cat, ps_car_15, ps_reg_01_plus_ps_car_04_cat, ps_ind_07_bin, ps_car_07_cat, ps_car_02_cat, ps_ind_02_cat, ps_car_12, ps_ind_06_bin, ps_ind_18_bin, ps_car_11, ps_car_04_cat, ps_ind_16_bin, ps_ind_12_bin, ps_ind_03, ps_reg_01, ps_car_03_cat, ps_ind_17_bin, ps_car_06_cat, ps_ind_14, ps_reg_01_plus_ps_car_02_cat, ps_car_05_cat, ps_car_08_cat, ps_calc_05, ps_ind_08_bin, ps_car_13, ps_reg_02, ps_ind_01, ps_car_09_cat, ps_ind_09_bin, ps_ind_05_cat, ps_calc_09, ps_reg_03, ps_ind_04_cat

In [None]:
preds /= denom

In [None]:
# Create submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = preds
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.xgb.{}_GMT'.format(now)
sub.to_csv(fn, float_format='%.6f', index=False)

In [77]:
model = XGBClassifier(    
    n_estimators=MAX_ROUNDS,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=0.07, 
    subsample=.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.26,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
    )

In [95]:
# Run CV

for i, (train_index, test_index) in enumerate(kf.split(X1)):
    
    # Create data for this fold
    y_train, y_valid = y_ps.iloc[train_index].copy(), y_ps.iloc[test_index]
    X_train, X_valid = X1.iloc[train_index,:].copy(), X1.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
#     # Enocode data
#     for f in f_cats:
#         X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
#                                                         trn_series=X_train[f],
#                                                         val_series=X_valid[f],
#                                                         tst_series=X_test[f],
#                                                         target=y_train,
#                                                         min_samples_leaf=200,
#                                                         smoothing=10,
#                                                         noise_level=0
#                                                         )
    # Run model for this fold
    if True:
        eval_set=[(X_valid,y_valid)]
        print(X_train.shape)
        print(y_train.shape)
        fit_model = model.fit( X_train,
                              y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        print('1')
        print(X_train.shape)
        print(y_train.shape)
        fit_model = model.fit( X_train, y_train )
        print('2')
    # Generate validation predictions for this fold
    print('3')
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred_ps.iloc[test_index] = pred
    print('4')
    # Accumulate test set predictions
    y_test_pred_ps += fit_model.predict_proba(X_test)[:,1]
    print('5')
    del X_test, X_train, X_valid, y_train



Fold  0
(1190422, 36)
(1190422,)
  Best N trees =  1
  Best gini =  -0.356379
3
  Gini =  0.22220774804407117
4
5

Fold  1
(1190422, 36)
(1190422,)
  Best N trees =  1
  Best gini =  -0.342666
3
  Gini =  0.2456575424275046
4
5

Fold  2
(1190422, 36)
(1190422,)


KeyboardInterrupt: 

In [None]:
y_test_pred_ps /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred_ps)
# org: 0.2844175528242565, LB: 0.283
# org, added missing: 0.28532607696136825, LB: 0.282