# Introduction

In the end of this kernel I want to have:
XGB and Mixture Bayes trained on 2/3-3/4 data
Neural net then trained on the remaining 1/3-1/4 data and inputs from the previous ones.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange
%matplotlib inline

In [2]:
plt.style.use('bmh')
plt.rcParams['figure.figsize'] = (10, 10)
title_config = {'fontsize': 20, 'y': 1.05}

In [3]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [4]:
X_a = train.iloc[:, 2:].values.astype('float64')
y_a = train['target'].values
y_a = y_a.astype('int')
X_test = test.iloc[:, 1:].values.astype('float64')
from sklearn.model_selection import train_test_split as ttsplit
X_train, X_val, y_train, y_val = ttsplit(X_a, y_a, test_size=0.2)
X_train_s1, X_train_s2, y_train_s1, y_train_s2 = ttsplit(X_train, y_train, test_size=0.3)

#Above lines were there to validate the model


In [5]:
#X_a, y_a = augment(X_a, y_a) seriously I wrote that
X_train_s1, X_train_s2, y_train_s1, y_train_s2 = ttsplit(X_a, y_a, test_size=0.15)

y_train_s1 = y_train_s1.astype('int')
y_train_s2 = y_train_s2.astype('int')


In [6]:
X_a.shape

(200000, 200)

**The code for GaussianMixtureNB has been forked.**

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.mixture import GaussianMixture
from scipy.special import logsumexp


class GaussianMixtureNB(BaseEstimator, ClassifierMixin):
    def __init__(self, n_components=1, reg_covar=1e-06):
        self.n_components = n_components
        self.reg_covar = reg_covar
    def fit(self, X, y):
        self.log_prior_ = np.log(np.bincount(y) / len(y))
        # shape of self.log_pdf_
        shape = (len(self.log_prior_), X.shape[1])
        self.log_pdf_ = [[GaussianMixture(n_components=self.n_components,
                                          reg_covar=self.reg_covar)
                          .fit(X[y == i, j:j + 1])
                          .score_samples for j in range(shape[1])]
                         for i in range(shape[0])]
    def predict_proba(self, X):
        # shape of log_likelihood before summing
        shape = (len(self.log_prior_), X.shape[1], X.shape[0])
        log_likelihood = np.sum([[self.log_pdf_[i][j](X[:, j:j + 1])
                                  for j in range(shape[1])]
                                 for i in range(shape[0])], axis=1).T
        log_joint = self.log_prior_ + log_likelihood
        return np.exp(log_joint - logsumexp(log_joint, axis=1, keepdims=True))
    def predict(self, X):
        return self.predict_proba(X).argmax(axis=1)

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
pipeline = make_pipeline(StandardScaler(), GaussianMixtureNB(n_components=3, reg_covar=0.03))
pipeline.fit(X_train_s1, y_train_s1)
print(f'Training AUC is {roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1])}.')
print(f'Validation AUC is {roc_auc_score(y_val, pipeline.predict_proba(X_val)[:, 1])}.')

Training AUC is 0.9030119060143509.
Validation AUC is 0.9023875277015142.


**And Mixture Bayes works well, now xgb, hyperparameters have been optimised separately.**

In [9]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train_s1, y_train_s1)
dval = xgb.DMatrix(X_train_s2, y_train_s2) #okay this really shouldn't make it overfit. Although I could still reshuffle etc.

In [10]:
param = {'objective':'binary:logistic', 'eval_metric':['auc'], 'eta':0.015, 'max_depth':2, 'colsample_bytree':0.75,
         'colsample_bylevel':0.75, 'colsample_bynode':0.75, 'subsample':0.8, 'n_estimators':1500, 'tree_method': 'gpu_hist'
        }
evallist = [ (dtrain, 'train'), (dval, 'eval')]
num_round = 500000
nfold = 5
bst1 = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds = 300, verbose_eval = 100)

[0]	train-auc:0.5741	eval-auc:0.581519
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 300 rounds.
[100]	train-auc:0.696713	eval-auc:0.691902
[200]	train-auc:0.741607	eval-auc:0.734317
[300]	train-auc:0.777759	eval-auc:0.768738
[400]	train-auc:0.799551	eval-auc:0.790622
[500]	train-auc:0.814091	eval-auc:0.804421
[600]	train-auc:0.824951	eval-auc:0.814626
[700]	train-auc:0.835465	eval-auc:0.824356
[800]	train-auc:0.842413	eval-auc:0.830781
[900]	train-auc:0.849093	eval-auc:0.836222
[1000]	train-auc:0.854774	eval-auc:0.841081
[1100]	train-auc:0.859913	eval-auc:0.845322
[1200]	train-auc:0.863771	eval-auc:0.848303
[1300]	train-auc:0.867589	eval-auc:0.851603
[1400]	train-auc:0.870806	eval-auc:0.853908
[1500]	train-auc:0.873785	eval-auc:0.856339
[1600]	train-auc:0.876075	eval-auc:0.858102
[1700]	train-auc:0.878848	eval-auc:0.86042
[1800]	train-auc:0.881176	eval-auc:0.862305
[1900]	train-auc:0.88343	eval-auc:0.8

In [11]:
s2i1 = pipeline.predict_proba(X_train_s2)[:, 1]  #stage 2 input 1
s2i2 = bst1.predict(xgb.DMatrix(X_train_s2), ntree_limit=bst1.best_ntree_limit) #stage 2 input 2


Now I try NN:

In [12]:
import tensorflow as tf
import keras
from keras import backend as K
from keras.layers import Dense, Dropout, BatchNormalization, ELU, Input, Concatenate
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping
from keras.optimizers import Nadam

num_cores = 4
config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,
                        inter_op_parallelism_threads=num_cores, 
                        allow_soft_placement=True,
                        device_count = {'CPU' : 1,
                                        'GPU' : 1}
                       )

session = tf.Session(config=config)
K.set_session(session)

opt = Nadam(lr = 0.002)



Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.


In [13]:
input_a = Input((200,))
model_a = Sequential()
bonus_1 = Dense(200, input_dim = 200, activation = 'tanh') #Ie not trainable part of the final model.
bonus_2 = Dense(100, input_dim = 200, activation = 'tanh') #Ie not trainable part of the final model.
#model_a.add(bonus_1)

model_a.add(bonus_2)
model_a.add(Dense(1, input_shape = (200,), activation = 'sigmoid'))
output_a = model_a(input_a)

Adding stuff to the above breaks it for some reason

In [14]:
model_a.compile(optimizer = opt,
    loss = 'binary_crossentropy',
    metrics = ['binary_accuracy']
)

stop = EarlyStopping(patience = 4, verbose =1, restore_best_weights = True)
model_a.fit(X_train_s1, y_train_s1,
    steps_per_epoch=10,
    validation_split = 0.1,
    validation_steps = 2,
    callbacks = [stop],
    epochs=250)

bonus_1.trainable = False
bonus_2.trainable = False

Instructions for updating:
Use tf.cast instead.
Train on 153000 samples, validate on 17000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Restoring model weights from the end of the best epoch
Epoch 00031: early stopping


Result above should be below 0.27, otherwise regression converged to a wrong minimum


In [15]:
input_aux1 = Input((1,))
input_aux2 = Input((1,))
input_b = Concatenate()([output_a, input_aux1, input_aux2])
model_b = Sequential()
#model_b.add(Dense(20, input_dim = 12))
#model_b.add(ELU())
model_b.add(Dense(1))
output_b=model_b(input_b)
model = Model(inputs=[input_a, input_aux1, input_aux2], outputs=output_b)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 1)            20201       input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
concatenat

In [16]:
model.compile(optimizer = opt,
    loss = 'binary_crossentropy',
    metrics = ['binary_accuracy']
)
stop = EarlyStopping(patience = 15, verbose =1, restore_best_weights= True)

model.fit([X_train_s2, s2i1, s2i2], y_train_s2,
    steps_per_epoch=10,
    validation_split = 0.1,
    validation_steps = 2,
    callbacks = [stop],
    epochs=500)

Train on 27000 samples, validate on 3000 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/50

<keras.callbacks.History at 0x7f2dcac512b0>

The above code should get loss smaller than the ones below, otherwise early stop indicates the minimum hasn't been found.


In [17]:
print(log_loss(y_train_s2, s2i1), log_loss(y_train_s2, s2i2))

0.20897613898377027 0.2077384282095154


For some reason Keras gets a worse result than just using previous predictions would get.

In [18]:
sVi1 = pipeline.predict_proba(X_val)[:, 1]  #stage - validation -  input 1
sVi2 = bst1.predict(xgb.DMatrix(X_val), ntree_limit=bst1.best_ntree_limit) #stage - validation - input 2

Code below is only for testing if at the beginning a validation set has been set apart.

In [19]:
pred = model.predict([X_val, sVi1, sVi2])
print(log_loss(y_val, pred), log_loss(y_val, sVi1), log_loss(y_val, sVi2))
print(roc_auc_score(y_val, pred), roc_auc_score(y_val, sVi1), roc_auc_score(y_val, sVi2))


nan 0.20107871929278218 0.17653931730318217
0.9237854611514358 0.9023875277015142 0.9285227995386464


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


 And here for the final predicition:

*Note for myself - diminishing sets (to split for s1 and s2) can be circumvented by retraining a few times and uploading averaged out results.*


In [20]:
sTi1 = pipeline.predict_proba(X_test)[:, 1]  #stage - test -  input 1
sTi2 = bst1.predict(xgb.DMatrix(X_test), ntree_limit=bst1.best_ntree_limit) #stage - test - input 2
pred = model.predict([X_test, sTi1, sTi2])


In [21]:
submission = pd.read_csv('../input/sample_submission.csv')
submission['target'] = pred
submission.to_csv('submission0.csv', index=False)
submission = pd.read_csv('../input/sample_submission.csv')
submission['target'] = sTi1
submission.to_csv('submission1.csv', index=False)
submission = pd.read_csv('../input/sample_submission.csv')
submission['target'] = sTi2
submission.to_csv('submission2.csv', index=False)


The end result got so bad.