# Jane Street: Ensemble

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os, gc
# import cudf
import pandas as pd
import numpy as np
# import cupy as cp
import janestreet
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold, train_test_split
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load

import random
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

2021-09-13 14:55:04.412809: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


# Preprocessing & Utils

In [2]:
# print('Loading...')
# train = cudf.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
features = [c for c in train.columns if 'feature' in c]

print('Filling...')
f_mean = train[features[1:]].mean()
# train = train.query('date > 85').reset_index(drop = True) 
train = train.query('weight > 0').reset_index(drop = True)
train[features[1:]] = train[features[1:]].fillna(f_mean)
train['action'] = (train['resp'] > 0).astype('int')
# print('Converting...')
# train = train.to_pandas()
# f_mean = f_mean.values.get()
# np.save('f_mean.npy', f_mean)

# print('Finish.')

Filling...


In [3]:
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
X = train[features]
y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget
weighted_values = abs(train['resp']) * (train["weight"].transform("sqrt").transform("sqrt"))

In [4]:
def reduce_memory_usage(df):
    
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df 
train = reduce_memory_usage(train)

Memory usage of dataframe is 2101.1269302368164 MB


  0%|          | 0/139 [00:00<?, ?it/s]

Memory usage of dataframe after reduction 525.2818241119385 MB
Reduced by 74.99999564268427 % 


In [5]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
seed_everything(seed=34)

In [6]:

def log_t(u, t):
    epsilon = 1e-7
    """Compute log_t for `u`."""
    if t == 1.0:
        return tf.math.log(u + epsilon)
    else:
        return (u**(1.0 - t) - 1.0) / (1.0 - t)

def exp_t(u, t):
    """Compute exp_t for `u`."""
    if t == 1.0:
        return tf.math.exp(u)
    else:
        return tf.math.maximum(0.0, 1.0 + (1.0 - t) * u) ** (1.0 / (1.0 - t))

def compute_normalization_fixed_point(y_pred, t2, num_iters=5):
    """Returns the normalization value for each example (t > 1.0).
    Args:
    y_pred: A multi-dimensional tensor with last dimension `num_classes`.
    t2: A temperature 2 (> 1.0 for tail heaviness).
    num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as y_pred with the last dimension being 1.
    """
    mu = tf.math.reduce_max(y_pred, -1, keepdims=True)
    normalized_y_pred_step_0 = y_pred - mu
    normalized_y_pred = normalized_y_pred_step_0
    i = 0
    while i < num_iters:
        i += 1
        logt_partition = tf.math.reduce_sum(exp_t(normalized_y_pred, t2),-1, keepdims=True)
        normalized_y_pred = normalized_y_pred_step_0 * (logt_partition ** (1.0 - t2))
  
    logt_partition = tf.math.reduce_sum(exp_t(normalized_y_pred, t2), -1, keepdims=True)
    return -log_t(1.0 / logt_partition, t2) + mu

def compute_normalization(y_pred, t2, num_iters=5):
    """Returns the normalization value for each example.
    Args:
    y_pred: A multi-dimensional tensor with last dimension `num_classes`.
    t2: A temperature 2 (< 1.0 for finite support, > 1.0 for tail heaviness).
    num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    if t2 < 1.0:
        return None # not implemented as these values do not occur in the authors experiments...
    else:
        return compute_normalization_fixed_point(y_pred, t2, num_iters)

In [7]:
def bi_tempered_logistic_loss(y_pred, y_true, t1, label_smoothing=0.0):
    """Bi-Tempered Logistic Loss with custom gradient.
    Args:
    y_pred: A multi-dimensional probability tensor with last dimension `num_classes`.
    y_true: A tensor with shape and dtype as y_pred.
    t1: Temperature 1 (< 1.0 for boundedness).
    label_smoothing: A float in [0, 1] for label smoothing.
    Returns:
    A loss tensor.
    """
    y_pred = tf.cast(y_pred, tf.float32)
    y_true = tf.cast(y_true, tf.float32)

    if label_smoothing > 0.0:
        num_classes = tf.cast(tf.shape(y_true)[-1], tf.float32)
        y_true = (1 - num_classes /(num_classes - 1) * label_smoothing) * y_true + label_smoothing / (num_classes - 1)

    temp1 = (log_t(y_true + 1e-7, t1) - log_t(y_pred, t1)) * y_true
    temp2 = (1 / (2 - t1)) * (tf.math.pow(y_true, 2 - t1) - tf.math.pow(y_pred, 2 - t1))
    loss_values = temp1 - temp2

    return tf.math.reduce_sum(loss_values, -1)

class BiTemperedLogisticLoss(tf.keras.losses.Loss):
    def __init__(self, t1, label_smoothing=0.0):
        super(BiTemperedLogisticLoss, self).__init__()
        self.t1 = t1
        self.label_smoothing = label_smoothing

    def call(self, y_true, y_pred):
        return bi_tempered_logistic_loss(y_pred, y_true, self.t1, self.label_smoothing)

In [8]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [9]:
n_splits = 4
group_gap = 20

# 1. Denoising Autoencoder (DAE) + MLP

## Autoencoders with Swap Noise
#### Copied from notebook https://www.kaggle.com/ottpocket/denoising-autoencoder-swap-noise/data

In [10]:
class SwapNoise(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, data, swap_perc = .15, batch_size = 128, shuffle=True):
        'Initialization'
        self.batch_size = batch_size
        self.data = data
        self.swap_perc = swap_perc

        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return self.data.shape[0] // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        #Generate indexes of the batch
        indices = [i for i in range(index*self.batch_size, (index+1)*self.batch_size)]
        #print(indices)
        noisy = self.swap_noise(self.data[indices])
        
        return (noisy), self.data[indices]

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        if self.shuffle == True:
            np.random.shuffle(self.data)
    def get_data(self):
        return self.data
    def swap_noise(self, temp):
        'Generates data containing batch_size samples' 
        
        # Add Swap_noise by row numerical features.
        for index in range(temp.shape[0]):            
            num_swaps = np.random.binomial(temp.shape[1], self.swap_perc, size=1)# how many swaps this row
            if num_swaps ==0:
                num_swaps=1
            swap_rvs = np.random.normal(size=num_swaps) #new values to be swapped in 
            positions_in_row = np.random.choice(a= range(2,temp.shape[1]) , size=num_swaps, replace=False)# new positions to be swapped in
            temp[index, positions_in_row] = swap_rvs#swapping


        #Adding Noise by column for 'feature_0'.  These have distr of [0,1]
        num_swaps = np.random.binomial(temp.shape[0], self.swap_perc, size=1)
        if num_swaps ==0:
            num_swaps=1
        swap_rvs = np.random.choice(a= np.array([0,1]), size=num_swaps, replace=True) #new values to be swapped in 
        positions_in_col = np.random.choice(a= np.array(range(temp.shape[0])), size=num_swaps, replace=False)# new positions to be swapped in
        temp[positions_in_col, 0] = swap_rvs#swapping new values for cp_time
        #print(temp)
        
        return temp

In [11]:
gc.collect()

63

In [12]:
def create_dae(num_features):
    
    inp = tf.keras.layers.Input(shape = (num_features, ))
    x = layers.BatchNormalization()(inp)
    
    x = layers.Dense(num_features, activation="swish")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(num_features, activation="swish")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(num_features, activation="swish")(x)
    x = layers.Dense(num_features * 0.75, activation="swish")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(num_features * 0.75, activation="swish")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(num_features * 0.5, activation="swish")(x)
    
    y = layers.Dense(num_features * 0.75, activation="relu")(x)
    y = layers.BatchNormalization()(y)
    y = layers.Dense(num_features * 0.75, activation="relu")(x)
    y = layers.BatchNormalization()(y)
    y = layers.Dense(num_features, activation="relu")(x)
    y = layers.BatchNormalization()(y)
    y = layers.Dense(num_features, activation="relu")(x)
    y = layers.BatchNormalization()(y)
    y = layers.Dense(num_features, activation="relu")(x)
    y = layers.BatchNormalization()(y)
    
    out = layers.Dense(num_features, activation="linear")(y)
    loss = "mse"
    encoder = Model(inputs=inp,outputs=x)
    model = tf.keras.models.Model(inputs = inp, outputs = out)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3), metrics=["mae"],
                  loss = loss,)
    
    return model, encoder

In [13]:
# train_swap, test_swap, _, _ = train_test_split(X, X, test_size=0.2, random_state=42)
# batch_size = 4096
# train_gen = SwapNoise(train_swap.to_numpy(), swap_perc = .3, batch_size = batch_size, shuffle=True)
# val_gen = SwapNoise(test_swap.to_numpy(), swap_perc = .3, batch_size = batch_size, shuffle=True)
model, x = create_dae(X.shape[1])
# model.fit(train_gen, validation_data=val_gen, epochs=100, callbacks=[EarlyStopping(monitor = 'val_loss', min_delta = 1e-4, patience = 5, mode = 'min', 
#                        baseline = None, restore_best_weights = True, verbose = 0)])
# x.save_weights("js-dae.h5")

2021-09-13 14:58:37.341749: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-09-13 14:58:37.347014: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-09-13 14:58:37.388884: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-13 14:58:37.389567: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2021-09-13 14:58:37.389633: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-09-13 14:58:37.420237: I tensorflow/stream_executor/platform/def

In [14]:
x.load_weights("../input/jsautoencoderweights/js-dae.h5")

In [15]:
x.trainable = False

In [16]:
# del train_swap, test_swap, _, model

## DAE + MLP Training

In [17]:
def create_dae_mlp(num_columns, num_labels, dropout_rates, label_smoothing, learning_rate, encoder):

    inp1 = tf.keras.layers.Input(shape = (num_columns, ))
    encoder = encoder(inp1)
    x = layers.Concatenate()([encoder, inp1])
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout_rates[0])(x)
    
    #First Stem
    x1 = layers.Dense(81)(x)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation("swish")(x1)
    x1 = layers.Dropout(0.25)(x1)
    
    b3 = layers.Dense(78)(x1)
    b3 = layers.BatchNormalization()(b3)
    b3 = layers.Activation("swish")(b3)
    
    
    #Second Stem
    x2 = layers.Dense(114)(x)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation("swish")(x2)
    x2 = layers.Dropout(0.25)(x2)
    
    b1 = layers.Dense(76)(x2)
    b1 = layers.BatchNormalization()(b1)
    b1 = layers.Activation("swish")(b1)
    
    #First Concat
    b2 = layers.Concatenate()([x2, x1])
    b2 = layers.Activation("swish")(b2) 
    b2 = layers.Dropout(0.25)(b2)
    
    #Second Concat
    c1 = layers.Concatenate()([b1, b2])
    c1 = layers.Activation("swish")(c1) 
    c1 = layers.Dropout(0.2)(c1)
    
    
    #Final Layer
    z = layers.Concatenate()([b3, c1])
    z = layers.Activation("swish")(z)
    z = layers.Dropout(0.25)(z)
    
    
    x = tf.keras.layers.Dense(num_labels)(z)
    out = tf.keras.layers.Activation('sigmoid')(x)
    
    model = tf.keras.models.Model(inputs = inp1, outputs = out)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate,),
                  loss = tf.keras.losses.BinaryCrossentropy(label_smoothing = label_smoothing), 
                  metrics = [tf.keras.metrics.AUC(multi_label=True, name="AUC")], 
                 )
    
    return model

In [18]:
batch_size = 4096
dropout_rates = [0.0]
label_smoothing = 1e-2
learning_rate = 1e-3


oof1 = np.zeros(y.shape)
gkf = PurgedGroupTimeSeriesSplit(n_splits = n_splits, group_gap = group_gap)
for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):
    
    X_tr, X_val = X.loc[tr, :], X.loc[te, :]
    y_tr, y_val = y[tr], y[te]
    
    ckp_path = f'JSModel_dae{fold}.hdf5'
    model = create_dae_mlp(X_tr.shape[1], 5, dropout_rates, label_smoothing, learning_rate, x)
    rlr = ReduceLROnPlateau(monitor = 'val_AUC', factor = 0.15, patience = 3, verbose = 0, 
                                min_delta = 1e-4, mode = 'max')
    ckp = ModelCheckpoint(ckp_path, monitor = 'val_AUC', verbose = 0, 
                            save_best_only = True, save_weights_only = True, mode = 'max')
    es = EarlyStopping(monitor = 'val_AUC', min_delta = 1e-4, patience = 7, mode = 'max', 
                        baseline = None, restore_best_weights = True, verbose = 0)
    model.fit(X_tr, y_tr, validation_data = (X_val, y_val), epochs = 1000, 
                batch_size = batch_size, callbacks = [rlr, ckp, es], verbose = 1, sample_weight = weighted_values.loc[tr])
                
    oof1[te] = model.predict(X_val, batch_size = batch_size * 4)
    
    # Finetune 3 epochs on validation set with small learning rate
    model = create_dae_mlp(X_tr.shape[1], 5, dropout_rates, label_smoothing, learning_rate / 100, x)
    model.load_weights(ckp_path)
    model.fit(X_val, y_val, epochs = 3, batch_size = batch_size, verbose = 0)
    model.save_weights(ckp_path)
    
    K.clear_session()
    del model
    rubbish = gc.collect()

2021-09-13 14:58:44.823591: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-09-13 14:58:44.834798: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2000189999 Hz


Epoch 1/1000


2021-09-13 14:58:47.234723: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11


11/93 [==>...........................] - ETA: 0s - loss: 0.0149 - AUC: 0.4986

2021-09-13 14:58:47.961513: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000


2021-09-13 14:59:17.929810: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 726209120 exceeds 10% of free system memory.


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000


2021-09-13 15:00:20.136012: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1107409680 exceeds 10% of free system memory.


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000


2021-09-13 15:01:28.690079: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1517381840 exceeds 10% of free system memory.


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000


In [19]:
score_oof = roc_auc_score(y, oof1)
print(score_oof)

0.5176115036498


# 2. Non Linear Topology MLP

In [20]:
seed_everything(seed=34)

In [21]:
def create_non_linear_mlp(num_columns, num_labels, dropout_rates, label_smoothing, learning_rate,):

    inp = tf.keras.layers.Input(shape = (num_columns, ))
#     encoder = encoder(inp)
#     x = layers.Concatenate()([inp, encoder])
    x = layers.BatchNormalization()(inp)
    x = layers.Dropout(dropout_rates[0])(x)
    
    #First Stem
    x1 = layers.Dense(81)(x)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation("swish")(x1)
    x1 = layers.Dropout(0.25)(x1)
    
    b3 = layers.Dense(78)(x1)
    b3 = layers.BatchNormalization()(b3)
    b3 = layers.Activation("swish")(b3)
    
    
    #Second Stem
    x2 = layers.Dense(114)(x)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation("swish")(x2)
    x2 = layers.Dropout(0.25)(x2)
    
    b1 = layers.Dense(76)(x2)
    b1 = layers.BatchNormalization()(b1)
    b1 = layers.Activation("swish")(b1)
    
    #First Concat
    b2 = layers.Concatenate()([x2, x1])
    b2 = layers.Activation("swish")(b2) 
    b2 = layers.Dropout(0.25)(b2)
    
    #Second Concat
    c1 = layers.Concatenate()([b1, b2])
    c1 = layers.Activation("swish")(c1) 
    c1 = layers.Dropout(0.2)(c1)
    
    
    #Final Layer
    z = layers.Concatenate()([b3, c1])
    z = layers.Activation("swish")(z)
    z = layers.Dropout(0.25)(z)
    
    
    x = tf.keras.layers.Dense(num_labels)(z)
    out = tf.keras.layers.Activation('sigmoid')(x)
    
    model = tf.keras.models.Model(inputs = inp, outputs = out)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate,),
                  loss = tf.keras.losses.BinaryCrossentropy(label_smoothing = label_smoothing), 
                  metrics = [tf.keras.metrics.AUC(multi_label=True, name="AUC")], 
                 )
    
    return model

## MLP Inference

In [22]:
gc.collect()

3120

In [23]:
# batch_size = 4096
# dropout_rates = [0.0]
# label_smoothing = 1e-2
# learning_rate = 1e-3

# seed_everything(seed=34)
# oof3 = y.copy()
# gkf = GroupKFold(n_splits = n_splits)
# for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):
    
#     X_tr, X_val = X.loc[tr, :], X.loc[te, :]
#     y_tr, y_val = y[tr], y[te]
    
#     ckp_path = f'JSModel_nonlinear{fold}.hdf5'
#     model = create_non_linear_mlp(X_tr.shape[1], 5, dropout_rates, label_smoothing, learning_rate)
#     rlr = ReduceLROnPlateau(monitor = 'val_AUC', factor = 0.13521639718529144, patience = 3, verbose = 0, 
#                             min_delta = 1e-4, mode = 'max')
#     ckp = ModelCheckpoint(ckp_path, monitor = 'val_AUC', verbose = 0, 
#                           save_best_only = True, save_weights_only = True, mode = 'max')
#     es = EarlyStopping(monitor = 'val_AUC', min_delta = 1e-4, patience = 7, mode = 'max', 
#                        baseline = None, restore_best_weights = True, verbose = 0)
#     model.fit(X_tr, y_tr, validation_data = (X_val, y_val), epochs = 1000, 
#               batch_size = batch_size, callbacks = [rlr, ckp, es], verbose = 1, sample_weight = weighted_values.loc[tr])
                
#     oof3[te] = model.predict(X_val, batch_size = batch_size * 4)
    
#     # Finetune 3 epochs on validation set with small learning rate
#     model = create_non_linear_mlp(X_tr.shape[1], 5, dropout_rates, label_smoothing, learning_rate / 100)
#     model.load_weights(ckp_path)
#     model.fit(X_val, y_val, epochs = 3, batch_size = batch_size, verbose = 0)
#     model.save_weights(ckp_path)
    
#     K.clear_session()
#     del model
#     rubbish = gc.collect()

In [24]:
batch_size = 4096
dropout_rates = [0.0]
label_smoothing = 1e-2
learning_rate = 1e-3


oof2 = np.zeros(y.shape)
gkf = PurgedGroupTimeSeriesSplit(n_splits = n_splits, group_gap = group_gap)
for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):
    
    X_tr, X_val = X.loc[tr, :], X.loc[te, :]
    y_tr, y_val = y[tr], y[te]
    
    model = create_non_linear_mlp(X_tr.shape[1], 5, dropout_rates, label_smoothing, learning_rate)
    model.load_weights(f"../input/jsautoencoderweights/JSModel_nonlinear{fold}.hdf5")
                
    oof2[te] = model.predict(X_val, batch_size = batch_size * 4)
    
    K.clear_session()
    del model
    rubbish = gc.collect()

In [25]:
score_oof = roc_auc_score(y, oof2)
print(score_oof)

0.5220479137705926


# GroupKFold MLP

In [26]:
# batch_size = 4096
# dropout_rates = [0.0]
# label_smoothing = 1e-2
# learning_rate = 1e-3

# seed_everything(seed=34)
# oof3 = y.copy()
# gkf = GroupKFold(n_splits = n_splits)
# for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):
    
#     X_tr, X_val = X.loc[tr, :], X.loc[te, :]
#     y_tr, y_val = y[tr], y[te]
    
#     ckp_path = f'JSModel_nonlinear{fold}.hdf5'
#     model = create_non_linear_mlp(X_tr.shape[1], 5, dropout_rates, label_smoothing, learning_rate)
#     rlr = ReduceLROnPlateau(monitor = 'val_AUC', factor = 0.13521639718529144, patience = 3, verbose = 0, 
#                             min_delta = 1e-4, mode = 'max')
#     ckp = ModelCheckpoint(ckp_path, monitor = 'val_AUC', verbose = 0, 
#                           save_best_only = True, save_weights_only = True, mode = 'max')
#     es = EarlyStopping(monitor = 'val_AUC', min_delta = 1e-4, patience = 7, mode = 'max', 
#                        baseline = None, restore_best_weights = True, verbose = 0)
#     model.fit(X_tr, y_tr, validation_data = (X_val, y_val), epochs = 1000, 
#               batch_size = batch_size, callbacks = [rlr, ckp, es], verbose = 1)
                
#     oof3[te] = model.predict(X_val, batch_size = batch_size * 4)
    
#     # Finetune 3 epochs on validation set with small learning rate
#     model = create_non_linear_mlp(X_tr.shape[1], 5, dropout_rates, label_smoothing, learning_rate / 100)
#     model.load_weights(ckp_path)
#     model.fit(X_val, y_val, epochs = 3, batch_size = batch_size, verbose = 0)
#     model.save_weights(ckp_path)
    
#     K.clear_session()
#     del model
#     rubbish = gc.collect()

# Ensemble CV Score

In [27]:
oof = oof2 + oof1
oof /= 2
score_oof = roc_auc_score(y, oof)
print(score_oof)

0.5222380533194644


# Load Models

In [28]:
# Non linear MLP

num_models = 2

models = []
selected_folds = [2, 3]
for i in range(num_models):
    clf = create_non_linear_mlp(len(features), 5, dropout_rates, label_smoothing, learning_rate)
#     clf.load_weights(f'../input/js-nn-models/JSModel_{selected_folds[i]}.hdf5')
    clf.load_weights(f'../input/jsautoencoderweights/JSModel_nonlinear{selected_folds[i]}.hdf5')
    clf.call = tf.function(clf.call, experimental_relax_shapes=True)
    models.append(clf)

In [29]:
# # MLP + GroupKFold


# num_models = 2

# selected_folds = [2, 3]
# for i in range(num_models):
#     clf = create_non_linear_mlp(len(features), 5, dropout_rates, label_smoothing, learning_rate)
# #     clf.load_weights(f'../input/js-nn-models/JSModel_{selected_folds[i]}.hdf5')
#     clf.load_weights(f'JSModel_nonlinear{selected_folds[i]}.hdf5')
#     clf.call = tf.function(clf.call, experimental_relax_shapes=True)
#     models.append(clf)


In [30]:
# DAE + MLP

num_models = 2


selected_folds = [2, 3]
for i in range(num_models):
    clf = create_dae_mlp(len(features), 5, dropout_rates, label_smoothing, learning_rate,x)
#     clf.load_weights(f'../input/js-nn-models/JSModel_{selected_folds[i]}.hdf5')
    clf.load_weights(f'./JSModel_dae{selected_folds[i]}.hdf5')
    clf.call = tf.function(clf.call, experimental_relax_shapes=True)
    models.append(clf)

In [31]:
f_mean = np.load('../input/js-nn-models/f_mean.npy')
# f_mean = np.load('./f_mean.npy')

# Submitting

Just use two models to reduce running time.

In [32]:
def return_confident(lst):
    return min(lst) if min(lst) < (1 - max(lst)) else max(lst)

In [33]:
env = janestreet.make_env()
env_iter = env.iter_test()

In [34]:
opt_th = 0.5
f = np.mean
for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        if np.isnan(x_tt[:, 1:].sum()):
            x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean
        pred = np.apply_along_axis(return_confident, 0, [model(x_tt, training=False).numpy() for model in models])
        pred = f(pred)
        pred_df.action = np.where(pred >= opt_th, 1, 0).astype(int)
    else:
        pred_df.action = 0
    env.predict(pred_df)

0it [00:00, ?it/s]