In [1]:
!pip -q install keras-tuner datatable

import datatable
import numpy as np 
import pandas as pd
from tensorflow.keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.layers import MaxPooling1D, Dense, LeakyReLU, Conv1D 
from tensorflow.keras.layers import Flatten, Activation, BatchNormalization, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
import time
import pickle
import tensorflow as tf

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Echo the full path of every file found anywhere under the Kaggle input mount.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jane-street-market-prediction/example_sample_submission.csv
/kaggle/input/jane-street-market-prediction/features.csv
/kaggle/input/jane-street-market-prediction/example_test.csv
/kaggle/input/jane-street-market-prediction/train.csv
/kaggle/input/jane-street-market-prediction/janestreet/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/jane-street-market-prediction/janestreet/__init__.py


In [3]:
def feature_transforms(df, reference=None):
    """Append log, square-root, cubic and quadratic variants of the features.

    For ``feature_1`` .. ``feature_129`` each column is shifted so that it lies
    above 1.0 (using the column minimum taken from ``reference``), and the
    natural log and square root of the shifted values are stored as float16
    columns ``log_feature_i`` and ``sqrt_feature_i``. Selected features also
    get ``cub_feature_i`` (cube) and ``quad_feature_i`` (square) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_1`` .. ``feature_129``. It is modified in
        place.
    reference : pandas.DataFrame, optional
        Frame whose per-column minimum defines the shift. When omitted, the
        notebook-level ``train`` frame is used, which is what this function
        relied on implicitly before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns appended in the order
        log_*, sqrt_*, cub_*, quad_*.
    """
    if reference is None:
        # Backward-compatible default: fall back to the notebook global.
        reference = train

    base_cols = [f'feature_{i}' for i in range(1, 130)]

    # Linear shift to values above 1.0. The shifted series are only an
    # intermediate for log/sqrt (they are perfectly correlated with the raw
    # features), so they are kept in a local dict instead of being written to
    # `df` and dropped again afterwards.
    shifted = {
        f: (df[f] + abs(reference[f].min()) + 1).astype(np.float16)
        for f in base_cols
    }

    # Two separate passes keep the column order: all log_* first, then sqrt_*.
    for f in base_cols:
        df['log_' + f] = np.log(shifted[f]).astype(np.float16)
    for f in base_cols:
        df['sqrt_' + f] = np.sqrt(shifted[f]).astype(np.float16)

    # Per the author's SHAP dependence plots, these features appear to have a
    # cubic relationship with the target.
    cubic = [3, 4, 5, 6, 28, 39, 128]
    for i in cubic:
        f = f'feature_{i}'
        # The exponent is deliberately an integer array (not a scalar) so the
        # result dtype follows the same NumPy promotion as before.
        threes = np.full((len(df[f])), 3)
        df['cub_' + f] = np.power(df[f], threes)

    # Per the same plots, these features appear to have a quadratic
    # relationship with the target.
    quad = [3, 4, 5, 6, 8, 13, 17, 19, 23, 26, 27, 28, 32, 37, 39, 40, 41, 44,
            45, 53, 54, 55, 57, 60, 61, 77, 83, 121, 124, 125, 127, 128]
    for i in quad:
        f = f'feature_{i}'
        df['quad_' + f] = np.square(df[f])

    return df

def manipulate_pairs(df):
    """Append pairwise sum, difference and product columns to ``df``.

    The left operand is always a raw ``feature_i`` column; the right operand
    is either a raw ``feature_j`` column or a ``log_feature_j`` column, so the
    log columns must already exist. ``df`` is modified in place and returned.
    """
    def _sum_and_diff(pairs, right_prefix, tag):
        # For each (left, right) pair emit the sum column, then the difference.
        for left, right in pairs:
            lhs = df[f'feature_{left}']
            rhs = df[f'{right_prefix}{right}']
            df[f'add_{left}_{tag}{right}'] = lhs + rhs
            df[f'sub_{left}_{tag}{right}'] = lhs - rhs

    def _product(pairs, right_prefix, tag):
        # For each (left, right) pair emit the element-wise product column.
        for left, right in pairs:
            lhs = df[f'feature_{left}']
            rhs = df[f'{right_prefix}{right}']
            df[f'mul_{left}_{tag}{right}'] = lhs * rhs

    # Additive combinations: raw/raw first, then raw/log.
    _sum_and_diff(
        [(6, 4), (13, 66), (23, 66), (32, 61), (37, 45), (40, 45), (54, 45),
         (81, 66), (111, 112)],
        'feature_', '')
    _sum_and_diff(
        [(4, 6), (5, 42), (9, 97), (16, 87), (17, 42), (21, 105), (28, 95),
         (33, 42), (46, 69), (73, 25), (95, 126), (128, 126)],
        'log_feature_', 'log')

    # Multiplicative combinations: raw/raw first, then raw/log.
    _product(
        [(13, 66), (19, 95), (23, 66), (27, 92), (32, 61), (38, 42), (61, 19)],
        'feature_', '')
    _product(
        [(5, 42), (17, 42), (21, 105), (26, 105), (33, 42), (47, 102)],
        'log_feature_', 'log')

    return df

In [4]:
%%time

# Location of the competition data inside the Kaggle input mount.
path = '../input/jane-street-market-prediction/'
train_path = 'train.csv'
# datatable parses the large CSV; the result is handed over as a pandas frame.
train_file = datatable.fread(path+train_path).to_pandas()

# Only float64 and int32 columns are present, so only those are considered.
# Each one is narrowed to the first candidate dtype whose representable range
# strictly contains the column's observed [min, max]; otherwise it is left as is.
for c in train_file.columns:
    column = train_file[c]
    min_val, max_val = column.min(), column.max()
    if column.dtype == 'float64':
        limits_of, narrower = np.finfo, (np.float16, np.float32)
    elif column.dtype == 'int32':
        limits_of, narrower = np.iinfo, (np.int8, np.int16)
    else:
        continue
    for target in narrower:
        limits = limits_of(target)
        if limits.min < min_val and max_val < limits.max:
            train_file[c] = column.astype(target)
            break


# Work on a copy restricted to 200 < date <= 300; the index is renumbered
# after the first date filter only.
train = (
    train_file.copy()
    .query('date > 200')
    .reset_index(drop = True)
    .query('date <= 300')
)
# Discard rows whose weight is exactly zero.
train = train[train['weight'] != 0]


CPU times: user 55.5 s, sys: 40.8 s, total: 1min 36s
Wall time: 1min 32s


In [5]:
print(f'There are {train_file.isnull().sum().sum()} NAN values in the train data')
features = [f'feature_{i}' for i in range(130)]

# Constant used in place of missing values: for every feature, a value
# slightly below the observed minimum (minimum minus 1% of the min-max range).
feature_frame = train_file[features]
val_range = feature_frame.max() - feature_frame.min()
filler = pd.Series(feature_frame.min()-0.01*val_range, index=features)


# Feature-column dtypes of the downcast full training file, used to keep the
# dtypes of other frames consistent with it.
dtype_dict = train_file[features].dtypes.to_dict()
def consistent_dtype(df):
    """Return ``df`` with the feature columns cast to the dtypes in ``dtype_dict``."""
    recast = df.astype(dtype_dict)
    return recast

def fill_missing(df):
    """Replace NaNs in the feature columns of ``df`` with the per-column ``filler``.

    Non-NaN entries pass through ``np.nan_to_num``; NaN positions receive the
    matching ``filler`` value. ``df`` is updated in place and returned.
    """
    feature_values = df[features]
    nan_mask = np.isnan(feature_values)
    df[features] = np.nan_to_num(feature_values) + filler*nan_mask
    return df  

# Impute missing feature values in the filtered training rows.
train = fill_missing(train)
# Cast the feature columns back to the dtypes recorded in dtype_dict.
# NOTE(review): reset_index() without drop=True keeps the previous row labels
# as a new `index` column (it shows up in the info() summary); consider
# reset_index(drop=True) if that column is not wanted.
train = consistent_dtype(train).reset_index()

# Sanity check: no missing values may remain anywhere in the frame.
assert train.isnull().sum().sum() == 0
train.info()

There are 6762701 NAN values in the train data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374360 entries, 0 to 374359
Columns: 139 entries, index to ts_id
dtypes: float16(135), int16(1), int32(1), int64(1), int8(1)
memory usage: 101.8 MB


In [6]:
# Fix the TensorFlow and NumPy random seeds.
SEED = 123

tf.random.set_seed(SEED)
np.random.seed(SEED)

# Binary flag: 1 where resp_4 exceeds resp_1 by more than 0.1, else 0.
train['action'] = ((train["resp_4"].values - (train['resp_1'].values)) > 0.1).astype(int)

# Every column whose name contains "feature".
features = [c for c in train.columns if "feature" in c]

# Per-column mean over all feature columns except the first one.
f_mean = np.mean(train[features[1:]].values,axis=0)

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

# Model inputs: the feature columns only.
X = train.loc[:, train.columns.str.contains('feature')]
# NOTE(review): y_train is not passed to model.fit in the visible cells (Y is used instead).
y_train = (train.loc[:, 'action'])

# Multi-label target: one column per entry of resp_cols, 1 where that response is > 0.
Y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T


# Negative-side slope used by the LeakyReLU layers in build_model.
leaky_relu_alpha =0.05
# NOTE(review): bare expression - the layer built here is not assigned to anything;
# it only produces the cell's displayed output.
LeakyReLU(alpha=leaky_relu_alpha)

<tensorflow.python.keras.layers.advanced_activations.LeakyReLU at 0x7f6cbca65f90>

In [7]:
# Add a trailing length-1 axis so each sample has shape (n_features, 1);
# build_model passes x_train.shape[1:] to Conv1D as input_shape.
x_train = X.values.reshape(-1, X.shape[1], 1)  # reshaping for convnet
#x_test = x_test.values.reshape(-1, x_test.shape[1], 1)  # reshaping for convnet

In [8]:
def build_model():
    """Build and compile the 1-D convolutional multi-label classifier.

    Layers, in order: Conv1D(180 filters, kernel size 2) -> BatchNormalization
    -> LeakyReLU -> MaxPooling1D(2) -> Dropout(0.15) -> Flatten -> Dense(180)
    -> LeakyReLU -> Dropout(0.15) -> Dense(5) -> sigmoid.

    The function takes no arguments; it reads two notebook globals:
    ``x_train`` (only for the per-sample input shape) and
    ``leaky_relu_alpha`` (slope of both LeakyReLU layers).

    Returns
    -------
    keras.models.Sequential
        Model compiled with Adam (learning rate 1e-3), binary cross-entropy
        with label smoothing 0.095, and the metrics AUC (named 'auc') and
        accuracy.
    """
    model = keras.models.Sequential()

    # Convolutional front end.
    model.add(Conv1D(180, 2, input_shape=x_train.shape[1:]))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=leaky_relu_alpha))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.15))
    model.add(Flatten())

    # Dense head with five independent sigmoid outputs.
    model.add(Dense(180))
    model.add(LeakyReLU(alpha=leaky_relu_alpha))
    model.add(Dropout(0.15))
    model.add(Dense(5))
    model.add(Activation("sigmoid"))

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
                  loss=BinaryCrossentropy(label_smoothing=0.095),
                  metrics=[tf.keras.metrics.AUC(name = 'auc'), "accuracy"])

    return model

In [9]:
# Instantiate a freshly compiled network.
model = build_model()

In [10]:
# Fit on all prepared rows against the five-column target matrix Y.
# No validation data is passed, so only training metrics are reported.
model.fit(x=x_train,
          y=Y,
          epochs=10,
          batch_size=1024)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6cb60c4a10>

In [11]:
from tqdm import tqdm
import janestreet

f = np.median  # aggregates the five sigmoid outputs into one score
th = 0.5000    # act when the aggregated score reaches this threshold

# Training rows had their NaNs replaced by `filler` (see fill_missing), so the
# same per-column constants are used here; imputing with the column means
# (`f_mean`) instead would feed the network a missing-value encoding it never
# saw during training.
fill_values = filler[features].values

env = janestreet.make_env()
for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        missing = np.isnan(x_tt)
        if missing.any():
            # fill_values broadcasts across rows: one constant per feature column.
            x_tt = np.where(missing, fill_values, x_tt)
        # Same (samples, n_features, 1) layout as the training input.
        pred = model(x_tt.reshape(-1, x_tt.shape[1], 1), training = False).numpy()
        pred = f(pred)
        pred_df.action = np.where(pred >= th, 1, 0).astype(int)
    else:
        # Rows with zero weight are never acted on.
        pred_df.action = 0
    env.predict(pred_df)


15219it [04:17, 59.21it/s]
