In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier

## Load and preprocess data 

In [2]:
df_train = pd.read_csv('train.csv', index_col=0)
df_test = pd.read_csv('test.csv', index_col=0)

In [3]:
# identify and replace missing values
print(df_train.isnull().sum())
print(df_test.isnull().sum())

print(df_train['opened_position_qty '].mean())
print(df_train['opened_position_qty '].median())

print(df_train['closed_position_qty'].mean())
print(df_train['closed_position_qty'].median())

# replace missing values with median (less sensitive to outliers)
df_train['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_test['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_train['closed_position_qty'].fillna(df_train['closed_position_qty'].median(),inplace=True)
df_test['closed_position_qty'].fillna(df_train['closed_position_qty'].median(), inplace=True)

last_price                   0
mid                          0
opened_position_qty     172460
closed_position_qty     172460
transacted_qty               0
d_open_interest              0
bid1                         0
bid2                         0
bid3                         0
bid4                         0
bid5                         0
ask1                         0
ask2                         0
ask3                         0
ask4                         0
ask5                         0
bid1vol                      0
bid2vol                      0
bid3vol                      0
bid4vol                      0
bid5vol                      0
ask1vol                      0
ask2vol                      0
ask3vol                      0
ask4vol                      0
ask5vol                      0
y                            0
dtype: int64
last_price                  0
mid                         0
opened_position_qty     53656
closed_position_qty     53656
transacted_qty              0


In [4]:
# Normalize data
from sklearn import preprocessing

x_train = df_train[df_train.columns[:26]]
y_train = df_train['y']
x_test = df_test

# Normalize training data by subtracting mean and scaling to unit variance
std_scale = preprocessing.StandardScaler().fit(x_train)
x_train_norm = std_scale.transform(x_train)
x_train = pd.DataFrame(x_train_norm, index=x_train.index, columns=x_train.columns)

# Normalize testing data by using mean and SD of training set
x_test_norm = std_scale.transform(x_test)
x_test = pd.DataFrame(x_test_norm, index=x_test.index, columns=x_test.columns) 

In [5]:
# Split training and validation data 
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)

## Build and train model

In [6]:
from sklearn.metrics import roc_curve, auc
n_estimators = 64

clf = RandomForestClassifier(n_estimators=n_estimators, criterion='gini', n_jobs=-1)
clf.fit(x_train, y_train)
train_pred = clf.predict(x_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('train auc: {}'.format(roc_auc))
y_pred = clf.predict(x_val)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('val auc: {}'.format(roc_auc))

train auc: 0.9961475799638176
val auc: 0.5702681303445631


In [8]:
# evaluate training classification accuracy and feature importance
training_accuracy = clf.score(x_train, y_train)
print(training_accuracy)
print(clf.feature_importances_)

0.9967250751206995
[0.04396884 0.04020864 0.01709851 0.01628781 0.02381975 0.02455625
 0.03413517 0.03329046 0.03341814 0.03398751 0.03457772 0.03455912
 0.0335642  0.03359612 0.03410573 0.03502343 0.04066847 0.04604067
 0.05151931 0.05195    0.052411   0.05549846 0.04362665 0.05061445
 0.05039597 0.05107761]


## Save results

In [10]:
df_test['Predicted'] = clf.predict_proba(x_test)[:,1]
df_test[['Predicted']].to_csv('submission_model2.csv')