In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import tensorflow as tf
from tensorflow.python.data import Dataset
import keras
from keras.utils import to_categorical
from keras import models
from keras import layers

from sklearn.model_selection import KFold

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load and preprocess data 

In [3]:
df_train = pd.read_csv('train.csv', index_col=0)
df_test = pd.read_csv('test.csv', index_col=0)

In [4]:
# identify and replace missing values
print(df_train.isnull().sum())
print(df_test.isnull().sum())

print(df_train['opened_position_qty '].mean())
print(df_train['opened_position_qty '].median())

print(df_train['closed_position_qty'].mean())
print(df_train['closed_position_qty'].median())

# replace missing values with median (less sensitive to outliers)
df_train['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_test['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_train['closed_position_qty'].fillna(df_train['closed_position_qty'].median(),inplace=True)
df_test['closed_position_qty'].fillna(df_train['closed_position_qty'].median(), inplace=True)

last_price                   0
mid                          0
opened_position_qty     172460
closed_position_qty     172460
transacted_qty               0
d_open_interest              0
bid1                         0
bid2                         0
bid3                         0
bid4                         0
bid5                         0
ask1                         0
ask2                         0
ask3                         0
ask4                         0
ask5                         0
bid1vol                      0
bid2vol                      0
bid3vol                      0
bid4vol                      0
bid5vol                      0
ask1vol                      0
ask2vol                      0
ask3vol                      0
ask4vol                      0
ask5vol                      0
y                            0
dtype: int64
last_price                  0
mid                         0
opened_position_qty     53656
closed_position_qty     53656
transacted_qty              0


In [5]:
# Normalize data
from sklearn import preprocessing

x_train = df_train[df_train.columns[:26]]
y_train = df_train['y']
x_test = df_test

# Normalize training data by subtracting mean and scaling to unit variance
std_scale = preprocessing.StandardScaler().fit(x_train)
x_train_norm = std_scale.transform(x_train)
x_train = pd.DataFrame(x_train_norm, index=x_train.index, columns=x_train.columns)

# Normalize testing data by using mean and SD of training set
x_test_norm = std_scale.transform(x_test)
x_test = pd.DataFrame(x_test_norm, index=x_test.index, columns=x_test.columns) 
print(x_train.iloc[np.array([1, 2, 3, 4, 6])])
#print(df_train.loc[df_train['id'] < 10]

    last_price       mid  opened_position_qty   closed_position_qty  \
id                                                                    
1    -1.501678 -1.494346              2.464481            21.097156   
2    -1.487008 -1.483343              2.987055            33.581616   
3    -1.489453 -1.494346              0.896758            14.409052   
4    -1.496788 -1.498013              0.896758            16.192547   
6    -1.491898 -1.483343              0.374183            15.746673   

    transacted_qty  d_open_interest      bid1      bid2      bid3      bid4  \
id                                                                            
1        13.913985       -18.265229 -1.496986 -1.496637 -1.498903 -1.501215   
2        21.583751       -29.330098 -1.487205 -1.486856 -1.489121 -1.488987   
3         9.153441       -12.732794 -1.496986 -1.496637 -1.498903 -1.501215   
4        10.211339       -14.860654 -1.499431 -1.501527 -1.503794 -1.503661   
6         9.682390       -14

## Build and train model

In [6]:
model = keras.Sequential([
    keras.layers.Dense(64, activation=tf.nn.relu, input_shape=(x_train.shape[1],)),
    keras.layers.Dense(50, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(60, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(90, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(70, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(50, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(20, activation=tf.nn.relu),
    keras.layers.Dropout(0.1, noise_shape=None, seed=None),
    keras.layers.Dense(2, activation=  'softmax')
])

In [7]:
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

ValueError: Could not interpret optimizer identifier: <tensorflow.python.keras.optimizers.Adam object at 0x00000220DE628B70>

In [None]:
num_fold = 10
index = np.array(range(len(x_train)))
train_error = 0
val_error = 0
kf = KFold(n_splits=num_fold)
for train_index, test_index in kf.split(index):
    x_train_val, x_test_val = x_train.iloc[train_index], x_train.iloc[test_index]
    y_train_val, y_test_val = y_train.iloc[train_index], y_train.iloc[test_index]
    train_model = model.fit(x_train_val, y_train_val, epochs= 10, batch_size = 128)
    eval_train = model.evaluate(x_train_val, y_train_val)
    eval_test = model.evaluate(x_test_val, y_test_val)
    train_error += eval_train[1] / num_fold
    val_error += eval_test[1] / num_fold
print("In Sample Accuracy: " + str(train_error))
print("Out of Sample Accuracy: " + str(val_error))

## Save results

In [None]:
df_test['Predicted'] = model.predict_proba(x_test)[:,1]
df_test[['Predicted']].to_csv('submission2.csv')