In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score


from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_validate


# Neural Network 
from sklearn.feature_selection import RFECV    # for feature selection 

# Load in training and test data

In [71]:
# Read the raw training and test tables from the local `data/` directory.
df = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

# Re-assign each frame's column labels from a plain Python list of the
# labels it already has (the labels themselves are left untouched).
df.columns = df.columns.tolist()
df_test.columns = df_test.columns.tolist()

# Adding Additional Variables
Result: These indicator variables definitely worsen the performance of Logistic Regression. 

In [51]:
# Add the (total bid volume / total ask volume) ratio to TRAIN and TEST.
def _with_bid_ask_ratio_all(frame):
    """Return a copy of `frame` with a `bid_ask_ratio_all` column appended.

    The ratio is the sum of the five bid volumes (bid1vol..bid5vol) divided by
    the sum of the five ask volumes (ask1vol..ask5vol), computed row by row.
    `assign` appends the new column as the LAST column of the returned frame
    (i.e. after `y` in the training frame), which matters for any positional
    column selection done later.
    """
    total_bid_volume = (frame.bid1vol + frame.bid2vol + frame.bid3vol
                        + frame.bid4vol + frame.bid5vol)
    total_ask_volume = (frame.ask1vol + frame.ask2vol + frame.ask3vol
                        + frame.ask4vol + frame.ask5vol)
    return frame.assign(bid_ask_ratio_all=total_bid_volume / total_ask_volume)


df_test = _with_bid_ask_ratio_all(df_test)
df = _with_bid_ask_ratio_all(df)


# --- Disabled experiments (kept for reference, currently not applied) ---

# Ratio of best bid (bid1) volume to best ask (ask1) volume:
#df_test = df_test.assign(bid_ask_ratio_best = lambda x: x.bid1vol / x.ask1vol)
#df = df.assign(bid_ask_ratio_best = lambda x: x.bid1vol / x.ask1vol)

# Difference between the price of the most recent order and the mid price
# (average of best bid and best ask):
#df_test = df_test.assign(diff_price = lambda x: (x.last_price) - (x.mid))
#df = df.assign(diff_price = lambda x: (x.last_price) - (x.mid))

# Difference between best bid price and best ask price:
#df_test = df_test.assign(diff_best_bid_ask = lambda x: (x.bid1) - (x.ask1))
#df = df.assign(diff_best_bid_ask = lambda x: (x.bid1) - (x.ask1))


In [58]:
# Drop every per-level volume column (bid1vol..bid5vol, ask1vol..ask5vol) from
# both frames, in place.  Note that these are the VOLUME columns, not the
# bid2..bid5 / ask2..ask5 price levels.
volume_columns = (['bid%dvol' % level for level in range(1, 6)]
                  + ['ask%dvol' % level for level in range(1, 6)])
for frame in (df, df_test):
    frame.drop(columns=volume_columns, inplace=True)

# Dropping Highly Correlated Variables 

In [72]:

# Two groups of columns are removed, in place, from both the training and the
# test frame:
#   1. the position-quantity columns (the trailing space in
#      'opened_position_qty ' is part of the label as spelled in this notebook);
#   2. the deeper price levels bid2..bid5 / ask2..ask5, which are heavily
#      correlated with bid1 / ask1.
position_qty_columns = ['closed_position_qty', 'opened_position_qty ']
deeper_price_columns = ['bid2', 'bid3', 'bid4', 'bid5',
                        'ask2', 'ask3', 'ask4', 'ask5']

for column_group in (position_qty_columns, deeper_price_columns):
    for frame in (df, df_test):
        frame.drop(columns=column_group, inplace=True)



In [73]:
# Preview the first rows of the training frame after the column drops above.
df.head()

Unnamed: 0,id,last_price,mid,transacted_qty,d_open_interest,bid1,ask1,bid1vol,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,103.0,0,3842.4,3842.8,8,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,55.0,-43,3843.0,3843.8,7,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,84.0,-69,3843.8,3844.8,3,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,37.0,-30,3843.0,3843.8,10,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,41.0,-35,3842.8,3843.4,14,12,2,2,4,1,3,1,11,15,1


# Separate data into X and y, Normalize 

In [74]:
# Split the training frame into features and label.
# Features are selected by NAME (every column except `y`) rather than by
# position, and the test frame is indexed with the same list.  This keeps the
# two matrices column-aligned and keeps `y` out of the features even when
# engineered columns have been appended after `y`.
feature_cols = [c for c in df.columns if c != 'y']
assert feature_cols[0] == 'id', "expected `id` to be the first column"

X = np.array(df[feature_cols], dtype=float)
y_train = np.array(df['y'])
X_test = np.array(df_test[feature_cols], dtype=float)

# The first column is the row id: set it aside so it is never used as a feature.
X_ids, X = X[:, 0], X[:, 1:]
X_test_ids, X_test = X_test[:, 0], X_test[:, 1:]

# Standardise both sets with the TRAINING statistics (computed once, before
# either matrix is modified).
train_mean = X.mean(axis=0)
train_std = X.std(axis=0)
# A constant training column has zero spread; divide by 1 instead so it
# becomes all zeros after centring rather than inf/NaN.
train_std[train_std == 0] = 1.0

X_test = (X_test - train_mean) / train_std
X = (X - train_mean) / train_std

# Logistic Regression (regularization strength chosen by cross-validation)

In [75]:
# Logistic regression whose regularization strength is selected by 5-fold CV.
#
# An earlier, manual approach used a hand-picked lambda from a geometric grid
# (0.00001 * 5**k -> 0.00001, 0.00005, 0.00025, ...).  Only the value below
# is kept; it is NOT used by the cross-validated fit.
lam = 0.0005  # penalty intended to counter strong correlation between features
# C is the inverse of the regularization strength (C = 1 / lambda).

# Manual alternatives that were tried:
#base = np.repeat(5, 5)
#power = np.arange(0, 5)
#lambdas = np.power(base, power)
#final_lambdas = 0.00001 * lambdas
#clf = LogisticRegression(penalty='l2', C=(1/lam)).fit(X, y_train)

# No explicit `Cs` is passed, so LogisticRegressionCV searches its default
# grid of C values (per the scikit-learn docs: 10 values, log-spaced between
# 1e-4 and 1e4).
clf = LogisticRegressionCV(cv=5, random_state=0)
clf.fit(X, y_train)

# Column 1 of `predict_proba` holds the probability of class 1.
y_pred = clf.predict_proba(X)[:, 1]            # training set
y_pred_test = clf.predict_proba(X_test)[:, 1]  # test set

# Cross-validated accuracy, log loss and AUC-ROC (5-fold, on the training data)

In [76]:
# Display label -> scikit-learn scorer name.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Alternative estimator that was tried here:
#clf_test = RandomForestClassifier(max_depth=5, min_samples_split=2,
#                                  random_state=0, n_estimators=10,
#                                  max_features='sqrt', criterion='gini')

# 5-fold cross-validation of `clf` on the training data, one score array per
# scorer (stored under the key 'test_<scorer name>').
results = cross_validate(clf, X, y_train, cv=5,
                         scoring=list(scoring.values()),
                         return_train_score=False)

print('\nK-fold cross-validation results: ')
model_name = clf.__class__.__name__
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    mean_score = fold_scores.mean()
    if scorer == 'neg_log_loss':
        # The scorer reports the negated loss; flip the sign for display.
        mean_score = -mean_score
    print(model_name + " average %s: %.3f (+/-%.3f)"
          % (label, mean_score, fold_scores.std()))


K-fold cross-validation results: 
LogisticRegressionCV average accuracy: 0.651 (+/-0.001)
LogisticRegressionCV average log_loss: 0.628 (+/-0.002)
LogisticRegressionCV average auc: 0.639 (+/-0.006)


Best logistic regression model: no regularization, all columns except closed_position_qty
and opened_position_qty, [ask2, ... ask5], [bid2, ... bid5] with 5-fold stratified 
cross-validation achieves a score of 0.60549. 

# Artificial Neural Network 

In [3]:
import pandas as pd

# Preparing Dataset for ANN training 

In [39]:
# Start the ANN section from freshly loaded data, discarding the column drops
# and normalisation applied for the logistic-regression experiments.
df, df_test = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")

# Re-assign the column labels from a plain list of the existing labels.
df.columns = [label for label in df.columns.values]
df_test.columns = [label for label in df_test.columns.values]

In [40]:
# Using Betty's preprocessing code and final features (from a day ago)

def add_ann_features(frame):
    """Return a copy of `frame` with missing quantities filled and indicators added.

    Missing values in 'opened_position_qty ' / 'closed_position_qty' are filled
    with (transacted_qty + d_open_interest) / 2 and
    (transacted_qty - d_open_interest) / 2 respectively, so the two fill values
    sum to `transacted_qty` and differ by `d_open_interest`.

    Indicator columns appended (in this order):
      * bid_ask_ratio_all  - sum of the five bid volumes / sum of the five ask volumes
      * bid_ask_ratio_best - bid1vol / ask1vol
      * diff_price         - last_price - mid
      * diff_best_bid_ask  - bid1 - ask1

    NOTE(review): the ratios divide by ask volumes; if those can be zero the
    result is inf and the standardisation below would yield NaN -- TODO confirm
    the data never contains zero ask volume.
    """
    filled = frame.fillna(value={
        # The trailing space is part of this column's label.
        'opened_position_qty ': (frame['transacted_qty'] + frame['d_open_interest']) / 2,
        'closed_position_qty': (frame['transacted_qty'] - frame['d_open_interest']) / 2,
    })
    return filled.assign(
        bid_ask_ratio_all=lambda x: (x.bid1vol + x.bid2vol + x.bid3vol + x.bid4vol + x.bid5vol)
                                    / (x.ask1vol + x.ask2vol + x.ask3vol + x.ask4vol + x.ask5vol),
        bid_ask_ratio_best=lambda x: x.bid1vol / x.ask1vol,
        diff_price=lambda x: x.last_price - x.mid,
        diff_best_bid_ask=lambda x: x.bid1 - x.ask1,
    )


# Apply the identical preprocessing to TRAIN and TEST.
df = add_ann_features(df)
df_test = add_ann_features(df_test)

# Final feature set fed to the network (7 columns).
cols = ['last_price',
        'transacted_qty', 'd_open_interest',
        'bid_ask_ratio_all', 'bid_ask_ratio_best',
        'diff_price', 'diff_best_bid_ask']

X = np.asarray(df[cols], dtype=float)
y_train = np.asarray(df['y'])
X_test = np.asarray(df_test[cols], dtype=float)  # grabbing test set

# Standardise both sets with the TRAINING statistics (computed once, before
# either matrix is modified).
train_mean = X.mean(axis=0)
train_std = X.std(axis=0)
# A constant training column has zero spread; divide by 1 instead so it
# becomes all zeros after centring rather than inf/NaN.
train_std[train_std == 0] = 1.0

X_test = (X_test - train_mean) / train_std
X = (X - train_mean) / train_std

In [10]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [49]:
# Small fully connected network: 7 input features -> 5 -> 3 -> 1.
model = Sequential([
    Dense(5, input_dim=7, activation='relu'),  # first hidden layer; takes the 7 input features
    Dense(3, activation='relu'),
    Dense(1, activation='sigmoid'),            # output: probability of the mid price going up
])

# Idea to revisit: maybe keep more features?  This is a non-linear model, so
# correlation between inputs shouldn't matter too much.

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [50]:
# Train for 15 passes over the training data in mini-batches of 64 rows,
# then score the test features with the fitted network.
model.fit(x=X, y=y_train, batch_size=64, epochs=15)
y_pred = model.predict(x=X_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [51]:
# Network outputs for the training features (scored against y_train below).
y_pred_train = model.predict(X)

# Outputting Results to CSV

In [17]:
# AUC-ROC of the network on the data it was trained on (this is a training
# score, not a held-out estimate).  The bare `y_pred[:10]` that used to precede
# this line was never displayed -- only a cell's last expression is -- so it
# has been removed.
roc_auc_score(y_train, y_pred_train)

0.6558090619590976

In [47]:
# Build the submission table: one row per test id with its predicted value.
X_test_ids = list(df_test['id'].astype(int))   # ids written as integers
y_pred_flat = y_pred.flatten()                  # model output flattened to 1-D

predictions = {'id': X_test_ids, 'Predicted': y_pred_flat}
df_output = pd.DataFrame(predictions, columns=['id', 'Predicted'])

# `id` stays a regular column (a former un-assigned `set_index('id')` call had
# no effect and was dropped); `index=False` keeps the row index out of the file.
df_output.to_csv('ANN_attempt_2.csv', index=False)

In [37]:
# Keep only the submission columns, in submission order.
# NOTE(review): this rebinds `cols`, the same name the ANN preprocessing cell
# uses for its feature list -- re-run that cell before reusing `cols` for features.
cols = ['id', 'Predicted']
df_output = df_output.loc[:, cols]

In [30]:
# Show the test predictions as a flat 1-D array.
y_pred.flatten()

array([0.67061865, 0.61808383, 0.48727542, ..., 0.65179896, 0.621622  ,
       0.66336346], dtype=float32)

In [48]:
# Preview the first rows of the submission table.
df_output.head()

Unnamed: 0,id,Predicted
0,592380,0.670619
1,592381,0.618084
2,592382,0.487275
3,592383,0.512512
4,592384,0.572973


In [44]:
# Collect the `id` column of the test frame into a Python list.
X_test_ids = list(df_test['id'])

In [46]:
# Sanity check: show the first ten test ids.
X_test_ids[:10]

[592380,
 592381,
 592382,
 592383,
 592384,
 592385,
 592386,
 592387,
 592388,
 592389]