# Deep Neural Network
This notebook is used to fine-tune, train, and test a deep neural network model on the Ubiqant dataset. Preliminary experiments showed that neural networks are superior to other machine learning methods in terms of both performance and time complexity. Thus, our comprehensive investigation of forecasting returns on the Ubiqant dataset will be confined to neural network models. In what follows, we find the optimal deep neural network architecture, train and test this model, and analyze the results using a variety of different techniques.

## Ubiquant Data

In [1]:
# import base libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
# import DNN libraries
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
from tensorflow.keras import Sequential 
from scipy.stats import pearsonr # pearson correlation coef

In [3]:
data = pd.read_csv('train.csv')
print(data.shape)
data.head()

(3141410, 304)


Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0_1,0,1,-0.300875,0.932573,0.113691,-0.402206,0.378386,-0.203938,-0.413469,...,0.366028,-1.09562,0.200075,0.819155,0.941183,-0.086764,-1.087009,-1.044826,-0.287605,0.321566
1,0_2,0,2,-0.23104,0.810802,-0.514115,0.742368,-0.616673,-0.194255,1.77121,...,-0.154193,0.912726,-0.734579,0.819155,0.941183,-0.387617,-1.087009,-0.929529,-0.97406,-0.343624
2,0_6,0,6,0.568807,0.393974,0.615937,0.567806,-0.607963,0.068883,-1.083155,...,-0.13802,0.912726,-0.551904,-1.220772,-1.060166,-0.219097,-1.087009,-0.612428,-0.113944,0.243608
3,0_7,0,7,-1.06478,-2.343535,-0.01187,1.874606,-0.606346,-0.586827,-0.815737,...,0.382201,0.912726,-0.266359,-1.220772,0.941183,-0.609113,0.104928,-0.783423,1.15173,-0.773309
4,0_8,0,8,-0.53194,0.842057,-0.262993,2.33003,-0.583422,-0.618392,-0.742814,...,-0.170365,0.912726,-0.741355,-1.220772,0.941183,-0.588445,0.104928,0.753279,1.345611,-0.737624


In [4]:
train_data = data.iloc[:2500000] # use first 2,500,000 obs. for training
test_data = data.iloc[2500000:] # use last 500,000+ obs for testing

train_target = train_data.pop('target')
test_target = test_data.pop('target')

In [5]:
# remove id columns from datasets
train_row_id = train_data.pop('row_id')
train_time_id = train_data.pop('time_id')
train_investment_id = train_data.pop('investment_id')

test_row_id = test_data.pop('row_id')
test_time_id = test_data.pop('time_id')
test_investment_id = test_data.pop('investment_id')

In [6]:
# create DNN builder function
def create_dnn(nodes,layers,lr,drop,activ):
    model = Sequential()
    
    for i in list(range(1,layers+1)):
        if i == 1:
            model.add(Dense(nodes, input_dim = (train_data.shape[1]), activation = activ))
            model.add(BatchNormalization())
            model.add(Dropout(drop))
        else:
            model.add(Dense(i/2, activation=activ))
            model.add(BatchNormalization())
            model.add(Dropout(drop))

    model.add(BatchNormalization())
    model.add(Dense(1))
    model.compile(loss = 'mse', optimizer = tf.keras.optimizers.Adam(learning_rate=lr))
    return model

In [7]:
results = {}
for drop in [0.45,0.3,0.15]:
    for activation_fun in ['swish','tanh','relu']:
        model = create_dnn(16,3,0.1,drop,activation_fun)
        model.fit(train_data, pd.DataFrame(train_target), epochs = 50, batch_size = 4096,verbose = 1)
        key = str(drop)+' '+str(activation_fun)
        results[key] = model

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [12]:
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [23]:
pearson = {}
mae = {}
mse = {}
for item in results.items():
    key = item[0]
    model = item[1]
    preds = model.predict(test_data)
    pearson[key] = pearsonr(test_target,preds)
    mae[key] = mean_absolute_error(test_target,preds)
    mse[key] = mean_squared_error(test_target,preds)

In [24]:
sorted_pearson = sorted(pearson.items(), key=lambda x: x[1], reverse=True)
sorted_pearson

[('0.3 swish', (array([0.1205636033367799], dtype=object), 0.0)),
 ('0.3 relu', (array([0.11821852482087802], dtype=object), 0.0)),
 ('0.45 relu', (array([0.11806124970943291], dtype=object), 0.0)),
 ('0.15 relu', (array([0.11762674636332636], dtype=object), 0.0)),
 ('0.45 swish', (array([0.11507220117795056], dtype=object), 0.0)),
 ('0.15 swish', (array([0.1089537970621983], dtype=object), 0.0)),
 ('0.45 tanh', (array([0.04759390610796114], dtype=object), 2.94071e-318)),
 ('0.15 tanh',
  (array([0.04648636692273842], dtype=object), 1.0555779132361456e-303)),
 ('0.3 tanh',
  (array([0.04561743253645969], dtype=object), 1.5930987379189036e-292))]

In [25]:
sorted_mse = sorted(mse.items(), key=lambda x: x[1])
sorted_mse

[('0.45 relu', 0.807820207888261),
 ('0.3 relu', 0.8079580554062133),
 ('0.15 relu', 0.809453066618867),
 ('0.3 swish', 0.8101982975951656),
 ('0.45 swish', 0.8108955858633804),
 ('0.15 swish', 0.8131894334015445),
 ('0.15 tanh', 0.8169551197308408),
 ('0.45 tanh', 0.8176862004897137),
 ('0.3 tanh', 0.8185730361397463)]

In [26]:
sorted_mae = sorted(mae.items(), key=lambda x: x[1])
sorted_mae

[('0.3 swish', 0.5970076500477591),
 ('0.45 relu', 0.5980923041328688),
 ('0.45 swish', 0.5993718938573356),
 ('0.15 relu', 0.5996067685850592),
 ('0.45 tanh', 0.6004804206873355),
 ('0.3 relu', 0.6013351434275263),
 ('0.15 tanh', 0.6013544921752566),
 ('0.3 tanh', 0.6025660014061256),
 ('0.15 swish', 0.6052752879390493)]

# Investigate Best Ubiquant Model

In [88]:
# get predictions for best model
model = results['0.3 swish']
preds = model.predict(test_data).tolist()

In [97]:
# turn predictions into list
preds_fixed = []
for i in preds:
    preds_fixed.append(i[0])

In [104]:
# separate predictions into two categories: (1) if it is an overestimate OR (2) if it is an underestimate 
pred_greater_actual = []
target_greater_actual = []
pred_less_actual = []
target_less_actual = []


for index, item in enumerate(preds_fixed):
    if item>=test_target[index]:
        pred_greater_actual.append(item)
        target_greater_actual.append(test_target[index])
    else:
        pred_less_actual.append(item)
        target_less_actual.append(test_target[index])

In [116]:
pearson_pred_greater = pearsonr(pred_greater_actual,target_greater_actual)
mse_pred_greater = mean_squared_error(pred_greater_actual,target_greater_actual)
mae_pred_greater = mean_absolute_error(pred_greater_actual,target_greater_actual)
print("Pearson: ",pearson_pred_greater)
print("MSE: ",mse_pred_greater)
print("MAE: ",mae_pred_greater)

Pearson:  (0.27844552070643014, 0.0)
MSE:  0.6377391636095127
MAE:  0.550685504198288


In [117]:
pearson_pred_less = pearsonr(pred_less_actual,target_less_actual)
mse_pred_less = mean_squared_error(pred_less_actual,target_less_actual)
mae_pred_less = mean_absolute_error(pred_less_actual,target_less_actual)
print("Pearson: ",pearson_pred_less)
print("MSE: ",mse_pred_less)
print("MAE: ",mae_pred_less)

Pearson:  (0.01895344180810981, 2.1794047423556098e-26)
MSE:  0.9895185025496863
MAE:  0.6451726645395484
