In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils as utils
import torch.utils.data as data
import torch.optim as optim

This dataset contains an anonymized set of features, feature_{0...129}, representing real stock market data. Each row in the dataset represents a trading opportunity, for which you will be predicting an action value: 1 to make the trade and 0 to pass on it. Each trade has an associated weight and resp, which together represents a return on the trade. The date column is an integer which represents the day of the trade, while ts_id represents a time ordering. In addition to anonymized feature values, you are provided with metadata about the features in features.csv.

In the training set, train.csv, you are provided a resp value, as well as several other resp_{1,2,3,4} values that represent returns over different time horizons. These variables are not included in the test set. Trades with weight = 0 were intentionally included in the dataset for completeness, although such trades will not contribute towards the scoring evaluation.

This is a code competition that relies on a time-series API to ensure models do not peek forward in time. To use the API, follow the instructions on the Evaluation page. When you submit your notebook, it will be rerun on an unseen test:

    During the model training phase of the competition, this unseen test set is comprised of approximately 1 million rows of historical data.
    During the live forecasting phase, the test set will use periodically updated live market data.

Note that during the second (forecasting) phase of the competition, the notebook time limits will scale with the number of trades presented in the test set.

### Dataset Exploration

In [21]:
train_data = pd.read_csv('../data/jane_street_train.csv')
train_data

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.000000,0.009916,0.014079,0.008773,0.001390,0.006270,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.178850,1.777472,-0.915458,2.831612,-1.417010,2.297459,-1.304614,1.898684,1
2,0,0.000000,0.025134,0.027607,0.033406,0.034380,0.023970,-1,0.812780,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.000000,-0.004730,-0.003273,-0.000461,-0.000476,-0.003200,-1,1.174378,0.344640,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,,0.344850,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390486,499,0.000000,0.000142,0.000142,0.005829,0.020342,0.015396,1,-1.649365,-1.169996,...,-1.896874,-1.260055,1.947725,-1.994399,-1.685163,-2.866165,-0.216130,-1.892048,0.901585,2390486
2390487,499,0.000000,0.000012,0.000012,-0.000935,-0.006326,-0.004718,1,2.432943,5.284504,...,-0.936553,1.064936,3.119762,-0.419796,-0.208975,-0.146749,0.730166,0.648452,2.068737,2390487
2390488,499,0.000000,0.000499,0.000499,0.007605,0.024907,0.016591,1,-0.622475,-0.963682,...,-2.956745,-0.640334,-2.279663,-0.950259,-4.388417,-1.669922,-3.288939,-1.336142,-2.814239,2390488
2390489,499,0.283405,-0.000156,-0.000156,-0.001375,-0.003702,-0.002004,-1,-1.463757,-1.107228,...,-2.035894,-1.780962,0.881246,-2.202140,-1.912601,-3.341684,-0.571188,-2.185795,0.627452,2390489


In [22]:
train_data.columns

Index(['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp',
       'feature_0', 'feature_1', 'feature_2',
       ...
       'feature_121', 'feature_122', 'feature_123', 'feature_124',
       'feature_125', 'feature_126', 'feature_127', 'feature_128',
       'feature_129', 'ts_id'],
      dtype='object', length=138)

In [23]:
len(train_data)

2390491

In [6]:
test_data = pd.read_csv('../data/jane_street_example_test.csv')
test_data

Unnamed: 0,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,date,ts_id
0,0.000000,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,,...,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0,0
1,16.673515,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,,...,-1.178850,1.777472,-0.915458,2.831612,-1.417010,2.297459,-1.304614,1.898684,0,1
2,0.000000,-1,0.812780,-0.256156,0.806463,0.400221,-0.614188,-0.354800,,,...,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,0,2
3,0.000000,-1,1.174378,0.344640,0.066872,0.009357,-1.006373,-0.676458,,,...,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,0,3
4,0.138531,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.143780,,,...,0.344850,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15214,0.000000,1,-1.213240,-1.557117,0.530793,0.382429,0.316811,0.240976,0.741902,0.680989,...,1.655182,2.551488,0.525934,1.242721,1.977483,2.563083,1.857149,2.424928,2,15214
15215,0.000000,1,-0.413328,-0.642504,0.429951,0.333967,-0.728263,-0.637617,0.204294,0.138052,...,6.393191,22.159397,-0.101824,3.804838,1.780150,7.504901,4.702145,15.376130,2,15215
15216,0.000000,1,-1.378947,-1.702976,0.548763,0.396754,0.328203,0.249898,0.784458,0.730435,...,1.740141,2.685696,0.527251,1.245219,1.981606,2.567519,1.876328,2.450874,2,15216
15217,0.000000,1,-0.324708,-1.089962,-0.873900,-0.544143,-1.265208,-0.844335,2.302628,1.978776,...,5.243907,11.789678,0.310616,2.660067,3.052869,6.399390,5.396259,10.972647,2,15217


In [7]:
test_data.columns

Index(['weight', 'feature_0', 'feature_1', 'feature_2', 'feature_3',
       'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8',
       ...
       'feature_122', 'feature_123', 'feature_124', 'feature_125',
       'feature_126', 'feature_127', 'feature_128', 'feature_129', 'date',
       'ts_id'],
      dtype='object', length=133)

In [8]:
len(train_data.query("date == 0"))

5587

In [9]:
feature_columns = [column for column in train_data.columns.to_list() if 'feature' in column]

### Training

In [2]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

In [3]:
num_feature_columns = 130

In [4]:
train_file = '../data/jane_street_train.csv'
test_file = '../data/jane_street_example_test.csv'

In [5]:
from abc import ABC, abstractmethod

class StatsCollector(ABC):
    @abstractmethod
    def train_gather(self, epoch, batch_idx, batch_size, dataset_size, loss):
        pass
    @abstractmethod
    def eval_gather(self, train_epoch, loss, accuracy):
        pass
    @abstractmethod
    def eval_pred_gather(self, train_epoch, predictions, targets):
        pass
    @abstractmethod
    def train_reset(self):
        pass
    @abstractmethod
    def eval_reset(self):
        pass

class EpochLossCollector(StatsCollector):
    def __init__(self, batch_size, frequency, report_frequency = 0, eval_report_frequency = 0):
        self.batch_size = batch_size
        self.frequency = frequency
        self.report_frequency = report_frequency
        self.eval_report_frequency = eval_report_frequency
        self.losses = []
        self.eval_losses = []
        self.eval_predictions_targets = []
    def train_gather(self, epoch, batch_idx, dataset_size, loss):
        if epoch % self.frequency == 0 and (batch_idx + 1) * self.batch_size >= dataset_size:
            self.losses.append((epoch, loss))
        if self.report_frequency > 0 and epoch % self.report_frequency == 0 and (batch_idx + 1) * self.batch_size >= dataset_size:
            print("Epoch {}, {}/{}: {}".format(epoch, batch_idx * self.batch_size, dataset_size, loss))
    def eval_gather(self, train_epoch, loss, accuracy):
        self.eval_losses.append((train_epoch, loss, accuracy))
        if self.eval_report_frequency > 0 and train_epoch % self.eval_report_frequency == 0:
            print("Epoch {}: loss {} accuracy {}".format(train_epoch, loss, accuracy))
    def eval_pred_gather(self, train_epoch, predictions, targets):
        self.eval_predictions_targets.append((train_epoch, predictions, targets))
    def train_reset(self):
        self.losses = []
    def eval_reset(self):
        self.eval_losses = []
        self.eval_predictions_targets = []

In [6]:
class JSClassificationDataset(data.Dataset):
    def __init__(self, filename, transform=None, target_transform=None):
        data = pd.read_csv(filename)
        data = data.fillna(0.)
        feature_columns = [column for column in data.columns if 'feature' in column]
        target_column = 'resp'
        data['y'] = data[target_column] > 0.0
        self.x = torch.tensor(data[feature_columns].values, dtype=torch.float32)
        self.y = torch.tensor(data['y'].values, dtype=torch.float32)
        self.y = self.y.reshape([self.y.shape[0], 1])
        self.transform = transform
        self.target_transform = target_transform
    def __getitem__(self, index):
        x = self.x[index]
        y = self.y[index]
        if self.transform is not None:
            x = self.transform(x)
        if self.target_transform is not None:
            y = self.target_transform(y)
        return x, y
    def __len__(self):
        return len(self.x)

In [7]:
def train(model, device, loader, optimizer, loss, epoch, collector):
    model.train()
    for batch_idx, (data, target) in enumerate(loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        l = loss(output, target)
        l.backward()
        optimizer.step()
        collector.train_gather(epoch, batch_idx, len(loader.dataset), l.item())

In [8]:
def classification_validate(model, device, loader, loss, train_epoch, collector):
    model.eval()
    total_loss = 0.
    total_corrects = 0.
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            total_loss += loss(output, target).item()
            pred = (output > 0.5).type(torch.float32)
            collector.eval_pred_gather(train_epoch, pred, target)
            corrects = pred.eq(target.view_as(pred)).sum().item()
            total_corrects += corrects
    total_loss /= len(loader.dataset)
    accuracy = total_corrects / len(loader.dataset)
    collector.eval_gather(train_epoch, total_loss, accuracy)

In [9]:
default_batch_size = 128
loader_args = {'batch_size' : default_batch_size, 'shuffle' : True}
if use_cuda:
    loader_args.update({'pin_memory' : True, 'num_workers' : 0})

### MLP Classification

In [30]:
class MLPClassifier1(nn.Module):
    def __init__(self, isize):
        super(MLPClassifier1, self).__init__()
        self.fc1 = nn.Linear(isize, 1000)
        self.relu1 = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(1000, 1)
        self.sigmoid2 = nn.Sigmoid()
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.sigmoid2(x)
        return x

In [31]:
mlp_modelfile = '../models/js_mlp_model1.pt'
mlp_model = MLPClassifier1(num_feature_columns)

In [32]:
mlp_model = mlp_model.to(device)

In [33]:
trainset = JSClassificationDataset(train_file)
train_loader = data.DataLoader(trainset, **loader_args)

In [34]:
learning_rate = 0.001
total_epochs = 10
optimizer = optim.Adam(mlp_model.parameters(), lr=learning_rate)
loss = nn.BCELoss()
collector = EpochLossCollector(default_batch_size, 1, 1)

In [35]:
for epoch in range(total_epochs):
    train(mlp_model, device, train_loader, optimizer, loss, epoch, collector)

Epoch 0, 2390400/2390491: 0.6939966678619385
Epoch 1, 2390400/2390491: 0.7136209011077881
Epoch 2, 2390400/2390491: 0.7101637721061707
Epoch 3, 2390400/2390491: 0.6985130310058594
Epoch 4, 2390400/2390491: 0.6766823530197144
Epoch 5, 2390400/2390491: 0.674554169178009
Epoch 6, 2390400/2390491: 0.6836362481117249
Epoch 7, 2390400/2390491: 0.6665523648262024
Epoch 8, 2390400/2390491: 0.6906943321228027
Epoch 9, 2390400/2390491: 0.6835135221481323


In [36]:
classification_validate(mlp_model, device, train_loader, loss, total_epochs, collector)

In [37]:
collector.eval_losses

[(10, 0.005382537147133978, 0.5323722197657301)]

In [39]:
torch.save(mlp_model, mlp_modelfile)

### Resnet Classification

attemping to overfit training

In [20]:
class ResnetLayerV1(nn.Module):
    def __init__(self, isize, a1, a2):
        super(ResnetLayerV1, self).__init__()
        self.fc1 = nn.Linear(isize, isize)
        self.fc2 = nn.Linear(isize, isize)
        self.a1 = a1
        self.a2 = a2
    def forward(self, x):
        s = x
        x = self.fc1(x)
        x = self.a1(x)
        x = self.fc2(x)
        x = x + s
        x = self.a2(x)
        return x

In [11]:
class ResnetLayerV2(nn.Module):
    def __init__(self, isize, a1, a2):
        super(ResnetLayerV2, self).__init__()
        self.fc1 = nn.Linear(isize, isize)
        self.fc2 = nn.Linear(isize, isize)
        self.a1 = a1
        self.a2 = a2
    def forward(self, x):
        s = x
        x = self.a1(x)
        x = self.fc1(x)
        x = self.a2(x)
        x = self.fc2(x)
        x = x + s
        return x

In [21]:
class ResnetClassifier1(nn.Module):
    def __init__(self, isize):
        super(ResnetClassifier1, self).__init__()
        rsize = isize * 3
        self.layers = nn.Sequential(
            nn.Linear(isize, rsize),
            nn.ReLU(inplace=True),
            ResnetLayerV1(rsize, nn.ReLU(inplace=True), nn.ReLU(inplace=True)),
            ResnetLayerV1(rsize, nn.ReLU(inplace=True), nn.ReLU(inplace=True)),
            ResnetLayerV1(rsize, nn.ReLU(inplace=True), nn.ReLU(inplace=True)),
            nn.Linear(rsize, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [22]:
resnet_modelfile = '../models/js_resnet_model1.pt'
resnet_model = ResnetClassifier1(num_feature_columns)

In [23]:
resnet_model = resnet_model.to(device)

In [15]:
trainset = JSClassificationDataset(train_file)
train_loader = data.DataLoader(trainset, **loader_args)

In [24]:
learning_rate = 0.0001
total_epochs = 10
optimizer = optim.Adam(resnet_model.parameters(recurse=True), lr=learning_rate)
loss = nn.BCELoss()
collector = EpochLossCollector(default_batch_size, 1, 1)

In [25]:
for epoch in range(total_epochs):
    train(resnet_model, device, train_loader, optimizer, loss, epoch, collector)

Epoch 0, 2390400/2390491: 0.6810844540596008
Epoch 1, 2390400/2390491: 0.6916476488113403
Epoch 2, 2390400/2390491: 0.705119252204895
Epoch 3, 2390400/2390491: 0.617253303527832
Epoch 4, 2390400/2390491: 0.6355941891670227
Epoch 5, 2390400/2390491: 0.6592046618461609
Epoch 6, 2390400/2390491: 0.6671040654182434
Epoch 7, 2390400/2390491: 0.6510292291641235
Epoch 8, 2390400/2390491: 0.633138120174408
Epoch 9, 2390400/2390491: 0.6383679509162903


In [26]:
classification_validate(resnet_model, device, train_loader, loss, total_epochs, collector)

In [27]:
collector.eval_losses

[(10, 0.004602857312198716, 0.6418012031837811)]

In [None]:
torch.save(resnet_model.dict(), resnet_modelfile)