# Import Libraries & Load Datasets

In [None]:
import sys, os, io, gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import janestreet

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import Input, BatchNormalization, Dense, Dropout, Concatenate, Lambda, Activation, GaussianNoise
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow_addons.optimizers import RectifiedAdam
from tensorflow.keras.callbacks import EarlyStopping

import warnings
# Suppress everything emitted through the `warnings` module so the notebook
# output stays short, then confirm that all imports above succeeded.
warnings.filterwarnings("ignore")
print("Setup Complete")

In [None]:
# Read the whole training CSV into memory as a DataFrame.
train = pd.read_csv('../input/jane-street-market-prediction/train.csv')

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Rules applied per column:

    * signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's minimum
      and maximum (bounds inclusive);
    * floating-point columns are cast to ``float16`` if their range fits,
      else ``float32``, else ``float64``;
    * ``object`` columns are converted to ``category``;
    * every other dtype (bool, datetime, timedelta, complex, categorical,
      pandas extension dtypes) is left untouched.

    The float16 downcast is lossy: half precision keeps only about three
    significant decimal digits.

    Args:
        df: DataFrame to optimise. Its columns are reassigned in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}'
           'MB').format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    def _narrowest(candidates, info, lo, hi, fallback):
        # Return the first candidate type whose range covers [lo, hi].
        # NaN bounds (empty or all-NaN column) fail every comparison and
        # therefore yield the fallback.
        for candidate in candidates:
            limits = info(candidate)
            if limits.min <= lo and hi <= limits.max:
                return candidate
        return fallback

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Dispatch on the NumPy dtype kind rather than on the dtype's string
        # prefix: the prefix test treated 'uint*' and 'bool' columns as
        # floats, and non-numeric dtypes would raise in min()/comparisons.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            target = _narrowest(signed_types, np.iinfo, c_min, c_max, None)
        elif col_type.kind == 'u':
            target = _narrowest(unsigned_types, np.iinfo, c_min, c_max, None)
        else:
            target = _narrowest(float_types, np.finfo, c_min, c_max,
                                np.float64)

        # `None` means no integer type could be chosen; keep the column as is.
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}'
           'MB').format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem)
                                        / start_mem))

    return df

def percentage_missing_values(df):
    """Print the percentage of cells in ``df`` that are missing.

    A cell counts as missing when ``DataFrame.isnull`` flags it. A frame
    with zero cells is reported as 0.0 % rather than dividing by zero.

    Args:
        df: DataFrame to inspect.
    """
    # df.size is rows * columns; it replaces np.product(df.shape), which
    # no longer exists in NumPy 2.0.
    total_cells = df.size
    total_missing = df.isnull().sum().sum()
    missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0
    print ("Percentage of Missing Data = ",missing_pct,"%")

# Data Preprocessing

In [None]:
# Every column whose name contains the substring 'feature', in frame order.
features = [c for c in train.columns if 'feature' in c]
# Response columns; each one is binarised into its own label when y_train is
# built further down.
resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']

In [None]:
# Keep only rows with date > 85 and renumber the index from 0.
train = train.query('date > 85').reset_index(drop=True)

In [None]:
# Replace NaNs in each column with that column's mean over the remaining rows.
train.fillna(train.mean(), inplace=True)
# Column means of every feature except the first one; the inference loop uses
# this vector to impute missing values in incoming test rows.
f_mean = np.mean(train[features[1:]].values,axis=0)

In [None]:
# Drop rows whose weight is exactly 0.
train = train[train['weight'] != 0]
# Binary flag: 1 where resp is strictly positive, 0 otherwise.
# NOTE(review): y_train is later built from resp_cols, not from this column.
train['action'] = ((train['resp'].values) > 0).astype(int)

In [None]:
# train[features[1:]] = train[features[1:]].fillna(train[features[1:]].mean())
# train['action'] = ((train['weight'].values * train['resp'].values) > 0).astype('int')

In [None]:
# Downcast column dtypes to shrink the frame; the helper returns the frame it
# was given, so this rebinding keeps the same object.
train = reduce_memory_usage(train)

In [None]:
# Show (rows, columns) of the training frame after filtering.
print('Train Size: ',train.shape)

In [None]:
# Model inputs: every column whose name contains 'feature'.
feature_mask = train.columns.str.contains('feature')
X_train = train.loc[:, feature_mask]

# Targets: one 0/1 column per response, 1 where that response is > 0.
# Resulting shape is (rows, len(resp_cols)).
y_train = np.stack([(train[col] > 0).astype(int) for col in resp_cols], axis=1)

In [None]:
# Drop the `train` name and ask the garbage collector to run a pass.
del train
gc.collect()

# Encoded MLP

## Configuration Parameters

In [None]:
# Seed both TensorFlow's and NumPy's global random generators with the same
# value.
SEED=42
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [None]:
# Width of the model input: one unit per feature column.
num_columns = len(features)
# Number of sigmoid outputs; equals the number of entries in resp_cols.
num_labels = 5
# Value handed to the GaussianNoise layer of the autoencoder.
noise = 0.1
# Width of each hidden Dense layer of the MLP.
hidden_units = [150, 150, 150]
# Dropout rates for the MLP: entry 0 is applied before the first hidden layer,
# entry i + 1 after hidden layer i (hence one more entry than hidden_units).
dropout_rates = [0.2, 0.2, 0.2, 0.2]
# Label smoothing passed to the MLP's BinaryCrossentropy loss.
label_smoothing = 1e-2
# Learning rate for the RectifiedAdam optimizer of both models.
learning_rate = 1e-3
# Epoch count and batch size used by both fit() calls.
epochs = 200
batch_size = 5000
# Fraction of the training data held out by fit(); 0 holds out nothing.
validation_split = 0
# Decision threshold applied to the aggregated prediction at inference time.
th = 0.503

## Autoencoder

In [None]:
def create_autoencoder(num_columns, num_labels, noise):
    """Build and compile an autoencoder with an additional label head.

    The encoder batch-normalises the input, adds Gaussian noise and projects
    to 64 ReLU units. The decoder maps that code back to ``num_columns``
    values (output ``'decoded'``). A small classifier stacked on the decoder
    output produces ``num_labels`` sigmoid values (output ``'label_output'``).

    The optimizer's learning rate is read from the module-level
    ``learning_rate`` variable; it is not a parameter of this function.

    Args:
        num_columns: Number of input features (also the reconstruction width).
        num_labels: Number of sigmoid outputs of the label head.
        noise: Argument passed to the ``GaussianNoise`` layer.

    Returns:
        Tuple ``(autoencoder, encoder)``: the compiled two-output model and
        an uncompiled model mapping the input to the 64-unit code. Both share
        the same layers.
    """
    features_in = Input(shape=(num_columns,))

    # Encoder
    latent = BatchNormalization()(features_in)
    latent = GaussianNoise(noise)(latent)
    latent = Dense(64, activation='relu')(latent)

    # Decoder: dropout on the code, then a linear reconstruction.
    reconstruction = Dropout(0.2)(latent)
    reconstruction = Dense(num_columns, name='decoded')(reconstruction)

    # Label head on top of the reconstruction: one Dense layer followed by
    # two consecutive BatchNormalization + Dropout pairs.
    head = Dense(32, activation='relu')(reconstruction)
    for _ in range(2):
        head = BatchNormalization()(head)
        head = Dropout(0.2)(head)
    label_output = Dense(num_labels, activation='sigmoid', name='label_output')(head)

    encoder = Model(inputs=features_in, outputs=latent)
    autoencoder = Model(inputs=features_in, outputs=[reconstruction, label_output])

    # One loss per named output: reconstruction error and label cross-entropy.
    losses = {'decoded':'mse', 'label_output':'binary_crossentropy'}
    autoencoder.compile(
        optimizer=RectifiedAdam(learning_rate=learning_rate),
        loss=losses)
    return autoencoder, encoder

In [None]:
autoencoder, encoder = create_autoencoder(num_columns, num_labels, noise)

# EarlyStopping can only watch 'val_loss' when fit() holds out validation
# data. With validation_split == 0 that metric is never logged, so the
# callback would do nothing (no early stop, no best-weight restore). Fall
# back to the training loss in that case.
es_monitor = 'val_loss' if validation_split > 0 else 'loss'

# The autoencoder has two outputs, so the targets are the inputs themselves
# (reconstruction) and the binary labels.
autoencoder.fit(
    X_train, (X_train, y_train),
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split,
    callbacks=[EarlyStopping(es_monitor, patience=10, restore_best_weights=True)],
)

In [None]:
# Write the encoder's weights to a file in the working directory.
encoder.save_weights('./encoder.hdf5')

## MLP

In [None]:
def mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate, encoder):
    """Build and compile the classifier that consumes encoder output plus raw features.

    The input is passed through ``encoder``; the encoder output is
    concatenated with the raw input, batch-normalised and fed to a stack of
    Dense -> BatchNormalization -> ReLU -> Dropout blocks, followed by a
    sigmoid output layer.

    This function does not change ``encoder.trainable``; freeze the encoder
    before calling if its weights must stay fixed.

    Args:
        num_columns: Number of input features.
        num_labels: Number of sigmoid outputs.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rates; entry 0 is applied before the first
            hidden layer and entry ``i + 1`` after hidden layer ``i``, so it
            needs ``len(hidden_units) + 1`` entries.
        label_smoothing: Label smoothing for the binary cross-entropy loss.
        learning_rate: Learning rate for the RectifiedAdam optimizer.
        encoder: Model mapping a ``(num_columns,)`` input to its encoding.

    Returns:
        The compiled Keras model, with AUC as its metric.
    """
    inp = Input(shape=(num_columns,))

    # Augment the raw features with the encoder's representation of them.
    x = encoder(inp)
    x = Concatenate()([x, inp])

    # Normalise the concatenated tensor. Normalising `inp` here instead would
    # overwrite `x` and silently drop the encoder branch from the model.
    x = BatchNormalization()(x)
    x = Dropout(dropout_rates[0])(x)

    for i, units in enumerate(hidden_units):
        x = Dense(units)(x)
        x = BatchNormalization()(x)
        x = Activation(tf.keras.activations.relu)(x)  # original note: swish
        x = Dropout(dropout_rates[i + 1])(x)

    x = Dense(num_labels)(x)
    out = Activation('sigmoid')(x)

    model = Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=RectifiedAdam(learning_rate=learning_rate),
        loss=BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[tf.keras.metrics.AUC(name='AUC')])
    return model

In [None]:
# Build the classifier on top of the encoder returned by create_autoencoder
# and print its layer summary.
clf = mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate, encoder)
clf.summary()

In [None]:
# Train the classifier on the feature matrix against the binarised responses.
clf.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

In [None]:
# Save the trained classifier, then register it as the only member of the
# model list that the inference loop averages over.
clf.save('model.h5')
models = [clf]

# Inference

In [None]:
# Create the janestreet environment and the iterator that the inference loop
# consumes; each item it yields is a (test_df, prediction_df) pair.
env = janestreet.make_env()
iter_test = env.iter_test()

In [None]:
# Score the test stream one item at a time and submit an action for each.
for test_df, prediction_df in tqdm(iter_test):
    # Default action; kept as-is whenever the weight is not positive.
    action = 0
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values

        # Impute NaNs in every feature except the first with the training
        # means stored in f_mean.
        if np.isnan(x_tt[:, 1:].sum()):
            tail = x_tt[:, 1:]
            x_tt[:, 1:] = np.nan_to_num(tail) + np.isnan(tail) * f_mean

        # Average the models' outputs, then take the median over the averaged
        # values to get a single score.
        per_model = [model(x_tt, training = False).numpy() for model in models]
        pred = np.median(np.mean(per_model, axis=0))
        action = np.where(pred >= th, 1, 0).astype(int)

    prediction_df.action = action
    env.predict(prediction_df)
print('Infrence is Completed')