In [5]:
# imports
import os
import numpy as np
import math
import time
import matplotlib.pyplot as plt
import multiprocessing
from os import walk
import keras
import pickle
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# imports
import os
import cv2
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Using TensorFlow backend.


In [6]:
RESULTS_DIR = './test_results/'
label_map = {'Hold': 0, 'Buy': 1, 'Sell': 2}
label_map_reverse = {0:'Hold', 1:'Buy', 2:'Sell'}

In [7]:
def get_paths(filepath):
    dataset_paths = []
    labels_paths = []
    stock_names = []
    for (dirpath, dirnames, filenames) in walk(filepath):
        dataset_paths = [filepath + '/' + name for name in dirnames]
        labels_paths = [filepath + '/' + name for name in filenames if name != '.DS_Store']
        stock_names = [name.split('_')[0] for name in filenames if name != '.DS_Store']
        break
    dataset_paths.sort()
    labels_paths.sort()
    stock_names.sort()

    return dataset_paths, labels_paths, stock_names

In [8]:
dataset_paths, labels_paths, stock_names = get_paths('./data')

In [9]:
def read_data(d_path, l_path, s_name):
    
    with open(d_path+'/data.pkl', 'rb') as f:
        dataset = pickle.load(f)
    
    labels = np.zeros(len(dataset))
    prices = np.zeros(len(dataset))
    dates = np.empty(len(dataset), dtype='datetime64[s]')
    with open(l_path, 'r') as f:
        for i, line in enumerate(f):
            info = line.strip().split(', ')
            dates[i] = np.datetime64(info[1])
            prices[i] = info[2]
            labels[i] = label_map[info[3]]

    return [dataset, dates, prices, labels]

In [10]:
# do the following for each stock

for (d_path, l_path, s_name) in zip(dataset_paths, labels_paths, stock_names):
    print(d_path)
    print(l_path)
    print(s_name)
    data = read_data(d_path, l_path, s_name)
    break

./data/AAPL_data
./data/AAPL_labels.txt
AAPL


In [11]:
type(data)

list

In [12]:
def data_augmentation(X_data, Y_data):
    proportion = np.array([0.34, 0.33, 0.33])
    labels_ind = [np.where(Y_data == x)[0] for x in range(3)]
    labels_tot = math.ceil(max([labels_ind[x].shape[0] / proportion[x] for x in range(len(labels_ind))]))

    labels_num = (proportion * labels_tot).astype(int)
    labels = np.arange(labels_num.sum())
    
    l = Y_data.shape[0]
    for i in range(labels_num.shape[0]):
        additional_ind = np.random.choice(labels_ind[i], labels_num[i] - labels_ind[i].shape[0])
        r = l + additional_ind.shape[0]
        labels[l:r] = additional_ind
        l = r

    return X_data[labels], Y_data[labels]

In [13]:
def data_processing(dataset, labels, augmentation):
    if augmentation:
        dataset, labels = data_augmentation(dataset, labels)
    
    X_data = dataset if len(dataset.shape) == 3 else dataset.reshape(dataset.shape[0], dataset.shape[1], 1)
    Y_data = keras.utils.to_categorical(labels)
    input_shape = X_data.shape[1:]

    return X_data, Y_data, input_shape

In [14]:
def data_extraction(data, start_year, end_year, augmentation):
    r = np.argwhere((data[1] >= np.datetime64(str(start_year))) & (data[1] < np.datetime64(str(end_year))))
    s = r[0][0]
    e = r[-1][0]

    dataset = data[0][s:e+1]
    dates = data[1][s:e+1]
    prices = data[2][s:e+1]
    labels = data[3][s:e+1]

    return data_processing(dataset, labels, augmentation), dates, prices

In [15]:
(X_train, Y_train, input_shape), _, _ = data_extraction(data, (2002), 2007, True)

In [16]:
X_train.shape

(3273, 19, 1)

In [17]:
class Dataset(torch.utils.data.dataset.Dataset):
    __Xs = None
    __ys = None
    
    def __init__(self, data, start_year, end_year, augment):
        (self.__Xs, self.__ys, input_shape), date, price = data_extraction(data, start_year, end_year, augment)
        
    def __getitem__(self, index):
        feature = self.__Xs[index]
        # transpose feature vector
        feature = feature.T
        feature = torch.from_numpy(np.asarray(feature))
        label = torch.from_numpy(np.asarray(self.__ys[index]))
        
        return feature, label
    
    def __len__(self):
        return self.__Xs.shape[0]

In [18]:
# do the following for each stock and year range

batch_size = 128

transform = transforms.Compose(
    [transforms.ToTensor(),])

trainset = Dataset(data, 2002, 2007, True)#, transform=transform)
validationset = Dataset(data, 2007, 2008, False)#, transform=transform)

train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=1)
validation_loader = torch.utils.data.DataLoader(validationset, batch_size=batch_size, shuffle=True, num_workers=1)

In [19]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [33]:
class Net(nn.Module):
    def __init__(self, num_classes=3):
        super(Net, self).__init__()
        
        self.features = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            
#             nn.MaxPool2d(kernel_size=2, stride=2),
            
            nn.Dropout(0.25),
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(1216, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(128, num_classes),
            nn.Softmax()
        )
        
        

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        print(x.shape)
        x = self.classifier(x)
        return x
    


net = Net().double()
if torch.cuda.is_available():
    print("Running on GPU")
    net = net.cuda()

In [34]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [36]:
epochs = 1

training_losses = []
validation_losses = []

for epoch in range(epochs):  # loop over the dataset multiple times

    training_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        labels = labels.long()
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()

        optimizer.zero_grad()

        outputs = net(inputs)
        if torch.cuda.is_available():
            outputs = outputs.cuda()
        loss = criterion(outputs, labels)
        loss.backward() # compute gradients
        optimizer.step() # update weights

        training_loss += loss.item()
    training_losses.append(training_losses)
        
    validation_loss = 0.0
    for i, data in enumerate(validation_loader, 0):
        inputs, labels = data
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
        
        outputs = net(inputs)
        if torch.cuda.is_available():
            outputs = outputs.cuda()
        loss = criterion(outputs, labels)
        
        validation_loss += loss.item()
    validation_losses.append(validation_loss)
    
    print('epoch %d/%d \t training loss: %.3f \t validation_loss: %.3f' %
              (epoch + 1, epochs, training_loss, validation_loss))

print('Finished Training')

# torch.save(net, 'ConvNet.pt')
# print("Saved model in ConvNet.pt")

torch.Size([128, 1216])


  input = module(input)


RuntimeError: multi-target not supported at /Users/soumith/miniconda2/conda-bld/pytorch_1532623076075/work/aten/src/THNN/generic/ClassNLLCriterion.c:21