In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [4]:
df_num_corr = df_train.corr()['y'][1:-1]
df_num_corr.sort_values(ascending=True, inplace=False)

ask1vol                -0.126212
ask2vol                -0.025560
d_open_interest        -0.015503
ask5vol                -0.009476
ask3vol                -0.008274
ask4vol                -0.007621
bid5vol                 0.003877
bid4vol                 0.007519
bid3vol                 0.009437
bid1                    0.018901
mid                     0.018940
ask1                    0.018978
bid2                    0.018984
bid5                    0.018989
bid3                    0.018990
bid4                    0.018991
ask2                    0.019092
ask3                    0.019109
ask4                    0.019119
ask5                    0.019128
last_price              0.019276
bid2vol                 0.029456
opened_position_qty     0.032599
closed_position_qty     0.046323
transacted_qty          0.056453
bid1vol                 0.086308
Name: y, dtype: float64

In [31]:
df_clean = df_train.drop(columns=['id', 
                                  'bid3vol', 'bid4vol', 'bid5vol', 
                                  'ask3vol', 'ask4vol', 'ask5vol', 
                                  'mid', 
                                  'opened_position_qty ', 'closed_position_qty', 
                                  'ask3', 'ask4', 'ask5', 
                                  'bid3', 'bid4', 'bid5'])

In [34]:
df_clean

Unnamed: 0,last_price,transacted_qty,d_open_interest,bid1,bid2,ask1,ask2,bid1vol,bid2vol,ask1vol,ask2vol,y
0,3842.4,103.0,0,3842.4,3842.0,3842.8,3843.4,8,1,6,1,1
1,3842.8,55.0,-43,3843.0,3842.8,3843.8,3844.0,7,6,1,4,0
2,3844.0,84.0,-69,3843.8,3843.6,3844.8,3845.0,3,1,1,16,0
3,3843.8,37.0,-30,3843.0,3842.8,3843.8,3844.0,10,13,2,7,1
4,3843.2,41.0,-35,3842.8,3842.4,3843.4,3844.0,14,12,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
592375,4110.2,2.0,1,4110.2,4110.0,4110.4,4111.8,2,2,2,3,1
592376,4109.4,11.0,1,4109.2,4109.0,4111.8,4112.0,3,2,3,9,0
592377,4109.4,0.0,0,4109.2,4109.0,4111.8,4112.0,3,2,3,9,0
592378,4109.4,0.0,0,4109.2,4109.0,4111.8,4112.0,3,2,3,9,0


In [35]:
cols_to_norm = ['last_price', 'bid1', 'bid2', 'ask1', 'ask2']
df_clean[cols_to_norm] = df_clean[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [36]:
df_clean

Unnamed: 0,last_price,transacted_qty,d_open_interest,bid1,bid2,ask1,ask2,bid1vol,bid2vol,ask1vol,ask2vol,y
0,0.100066,103.0,0,0.100658,0.100000,0.100066,0.101382,8,1,6,1,1
1,0.101382,55.0,-43,0.102632,0.102632,0.103357,0.103357,7,6,1,4,0
2,0.105332,84.0,-69,0.105263,0.105263,0.106649,0.106649,3,1,1,16,0
3,0.104674,37.0,-30,0.102632,0.102632,0.103357,0.103357,10,13,2,7,1
4,0.102699,41.0,-35,0.101974,0.101316,0.102041,0.103357,14,12,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
592375,0.981567,2.0,1,0.981579,0.981579,0.980908,0.984858,2,2,2,3,1
592376,0.978934,11.0,1,0.978289,0.978289,0.985517,0.985517,3,2,3,9,0
592377,0.978934,0.0,0,0.978289,0.978289,0.985517,0.985517,3,2,3,9,0
592378,0.978934,0.0,0,0.978289,0.978289,0.985517,0.985517,3,2,3,9,0


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [51]:
X_train = np.asarray(df_clean.iloc[:, 0:11])
y_train = np.asarray(df_clean['y'])

In [53]:
clf = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=100)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [61]:
y_train_pred = clf.predict(X_train)

In [63]:
error = 0
len_data = len(y_train)
for i in range(len_data): 
    if y_train[i] != y_train_pred[i]: 
        error += 1
print("training error: " + str(error / len_data))

training error: 0.3569583713157095


In [54]:
df_test_clean = df_test.drop(columns=['id', 
                                      'bid3vol', 'bid4vol', 'bid5vol', 
                                      'ask3vol', 'ask4vol', 'ask5vol', 
                                      'mid', 
                                      'opened_position_qty ', 'closed_position_qty', 
                                      'ask3', 'ask4', 'ask5', 
                                      'bid3', 'bid4', 'bid5'])
df_test_clean[cols_to_norm] = df_test_clean[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [56]:
X_test = np.asarray(df_test_clean.iloc[:, 0:11])

In [59]:
test_pred = clf.predict_proba(X_test)

In [60]:
df_pred = 

array([[0.55449892, 0.44550108],
       [0.70599796, 0.29400204],
       [0.66169951, 0.33830049],
       ...,
       [0.60507912, 0.39492088],
       [0.64682855, 0.35317145],
       [0.56175316, 0.43824684]])