In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier

## Load and preprocess data 

In [2]:
# Read the training and test splits; the first CSV column of each file
# becomes the DataFrame index.
df_train, df_test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Identify and replace missing values.

# Per-column count of missing entries in each frame.
print(df_train.isnull().sum())
print(df_test.isnull().sum())

# Compare mean and median of the two columns that contain missing values.
# NOTE(review): the key 'opened_position_qty ' carries a trailing space;
# presumably that matches the CSV header exactly -- confirm before renaming.
print(df_train['opened_position_qty '].mean())
print(df_train['opened_position_qty '].median())

print(df_train['closed_position_qty'].mean())
print(df_train['closed_position_qty'].median())

# Replace missing values with the median (less sensitive to outliers).
# The median is always taken from the training frame and reused for the test
# frame, so no statistic is derived from the test data.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place call on a column selection
# is a chained assignment that does not update the parent frame under pandas
# Copy-on-Write.
for col in ('opened_position_qty ', 'closed_position_qty'):
    train_median = df_train[col].median()
    df_train[col] = df_train[col].fillna(train_median)
    df_test[col] = df_test[col].fillna(train_median)

last_price                   0
mid                          0
opened_position_qty     172460
closed_position_qty     172460
transacted_qty               0
d_open_interest              0
bid1                         0
bid2                         0
bid3                         0
bid4                         0
bid5                         0
ask1                         0
ask2                         0
ask3                         0
ask4                         0
ask5                         0
bid1vol                      0
bid2vol                      0
bid3vol                      0
bid4vol                      0
bid5vol                      0
ask1vol                      0
ask2vol                      0
ask3vol                      0
ask4vol                      0
ask5vol                      0
y                            0
dtype: int64
last_price                  0
mid                         0
opened_position_qty     53656
closed_position_qty     53656
transacted_qty              0


In [4]:
# Normalize data
from sklearn import preprocessing

# Every column except the target 'y' is a feature. Selecting by name replaces
# the positional slice `columns[:26]`, which silently depended on the frame
# having exactly 26 feature columns followed by 'y'.
feature_cols = df_train.columns.drop('y')
x_train = df_train[feature_cols]
y_train = df_train['y']

# Take the same feature columns, in the same order, from the test frame.
# This keeps the cell re-runnable even if extra columns (e.g. a prediction
# column) have been added to df_test by a later cell.
x_test = df_test[feature_cols]

# Normalize training data by subtracting mean and scaling to unit variance
std_scale = preprocessing.StandardScaler().fit(x_train)
x_train_norm = std_scale.transform(x_train)
x_train = pd.DataFrame(x_train_norm, index=x_train.index, columns=x_train.columns)

# Normalize testing data by using mean and SD of training set
x_test_norm = std_scale.transform(x_test)
x_test = pd.DataFrame(x_test_norm, index=x_test.index, columns=x_test.columns)

## Build and train model

In [5]:
n_estimators = 100
# Fixed seed so that the fitted forest, the printed metrics and the saved
# predictions are reproducible across kernel restarts.
random_state = 42
clf = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion='gini',
    random_state=random_state,
)
clf.fit(x_train, y_train)

# Accuracy on the same data the forest was fitted on. This is a sanity check
# only; it is not an estimate of performance on unseen data.
training_accuracy = clf.score(x_train, y_train)
print(training_accuracy)
# One importance value per feature column of x_train.
print(clf.feature_importances_)

0.9957814240858908
[0.04516642 0.04056989 0.01664618 0.01629452 0.02332884 0.02423074
 0.03421817 0.03317846 0.03329418 0.03370888 0.03467229 0.03455739
 0.03333027 0.0333681  0.03385852 0.03479672 0.04097467 0.04642467
 0.05151106 0.05191283 0.05234978 0.05545786 0.04404413 0.05067642
 0.05038778 0.05104123]


## Save results

In [6]:
# Build the submission as its own frame instead of adding a column to
# df_test, so the raw test frame keeps only feature columns and earlier
# cells can be re-run safely.
# Column 1 of predict_proba corresponds to clf.classes_[1]
# (presumably the label 1 -- confirm against the values of y).
submission = pd.DataFrame(
    {'Predicted': clf.predict_proba(x_test)[:, 1]},
    index=df_test.index,
)
submission.to_csv('submission_model2.csv')