In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
import tensorflow as tf
from tensorflow.python.data import Dataset
import keras
from keras.utils import to_categorical
from keras import models
from keras import layers

from sklearn.model_selection import KFold

## Load and preprocess data 

In [20]:
df_train = pd.read_csv('train.csv', index_col=0)
df_test = pd.read_csv('test.csv', index_col=0)

In [21]:
# identify and replace missing values
print(df_train.isnull().sum())
print(df_test.isnull().sum())

print(df_train['opened_position_qty '].mean())
print(df_train['opened_position_qty '].median())

print(df_train['closed_position_qty'].mean())
print(df_train['closed_position_qty'].median())

# replace missing values with median (less sensitive to outliers)
df_train['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_test['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_train['closed_position_qty'].fillna(df_train['closed_position_qty'].median(),inplace=True)
df_test['closed_position_qty'].fillna(df_train['closed_position_qty'].median(), inplace=True)

last_price                   0
mid                          0
opened_position_qty     172460
closed_position_qty     172460
transacted_qty               0
d_open_interest              0
bid1                         0
bid2                         0
bid3                         0
bid4                         0
bid5                         0
ask1                         0
ask2                         0
ask3                         0
ask4                         0
ask5                         0
bid1vol                      0
bid2vol                      0
bid3vol                      0
bid4vol                      0
bid5vol                      0
ask1vol                      0
ask2vol                      0
ask3vol                      0
ask4vol                      0
ask5vol                      0
y                            0
dtype: int64
last_price                  0
mid                         0
opened_position_qty     53656
closed_position_qty     53656
transacted_qty              0


In [22]:
# Normalize data
from sklearn import preprocessing

x_train = df_train[df_train.columns[:26]]
y_train = df_train['y']
x_test = df_test

# Normalize training data by subtracting mean and scaling to unit variance
std_scale = preprocessing.StandardScaler().fit(x_train)
x_train_norm = std_scale.transform(x_train)
x_train = pd.DataFrame(x_train_norm, index=x_train.index, columns=x_train.columns)

# Normalize testing data by using mean and SD of training set
x_test_norm = std_scale.transform(x_test)
x_test = pd.DataFrame(x_test_norm, index=x_test.index, columns=x_test.columns) 

# Convert to numpy array
x_train = x_train.rename_axis('ID').values
y_train = y_train.rename_axis('ID').values
x_test = x_test.rename_axis('ID').values

## Build and train model

In [24]:
model = keras.Sequential([
    keras.layers.Dense(64, activation=tf.nn.relu, input_shape=(x_train.shape[1],)),
    keras.layers.Dense(50, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(60, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(80, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(90, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(70, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    #keras.layers.Dense(2, activation=  'softmax')
])

#    keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation="relu", padding="same"),
#    keras.layers.MaxPooling2D(pool_size=(2, 2), padding='same'),
#    keras.layers.Conv2D(70, (3, 3), strides=(1, 1), activation="relu", padding="same"),
#    keras.layers.MaxPooling2D(pool_size=(2, 2), padding='same'),
#    keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation="relu", padding="same"),
#    keras.layers.Flatten(),

In [25]:
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [26]:
num_fold = 5
num_epoch = 25
batch = 32
index = np.array(range(len(x_train)))
train_error = 0
val_error = 0
kf = KFold(n_splits=num_fold)
ind_split = [0, 0]
i = 1
for train_index, test_index in kf.split(index):
    print("Fold " + str(i))
    x_train_val, x_test_val = x_train[train_index], x_train[test_index]
    y_train_val, y_test_val = y_train[train_index], y_train[test_index]
    train_model = model.fit(x_train_val, y_train_val, epochs= num_epoch, batch_size = batch)
    eval_train = model.evaluate(x_train_val, y_train_val)
    eval_test = model.evaluate(x_test_val, y_test_val)
    if (eval_test[1] > ind_split[0]):
        ind_split = [eval_test[1], train_index]
    train_error += eval_train[1] / num_fold
    val_error += eval_test[1] / num_fold
    i += 1
    
# train the best model again
train_model = model.fit(x_train[ind_split[1]], y_train[ind_split[1]], epochs= num_epoch, batch_size = batch)
print("In Sample Accuracy: " + str(train_error))
print("Out of Sample Accuracy: " + str(val_error))


Fold 1
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Fold 2
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
 18080/473904 [>.............................] - ETA: 34s - loss: 4.2485 - accuracy: 0.6412

KeyboardInterrupt: 

In [None]:
 train_model = model.fit(x_train, y_train, epochs= num_epoch, batch_size = batch)

## Save results

In [16]:
df_test['Predicted'] = model.predict_proba(x_test)[:,1]
df_test[['Predicted']].to_csv('submission7.csv')