In [2]:
import datetime
from typing import Optional, NamedTuple

import common
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Plan:

1. try to use the same data as before - to bredict buy / sell - so I will have classification problem instead of regression problem. 
2. try LSTM + CNN
3. try to add data and see what happens. 



In [3]:
from functools import reduce


def get_nearest_stocks(ticker):
    with open('most_correlated_stocks.json') as f:
        stocks_map = json.load(f)
    return stocks_map[ticker].keys()

wanted_stocks = ['V','AAPL', 'GOOGL', 'JNJ', 'AMZN', 'XOM', 'JPM', 'KO', 'SPY', 'MSFT']
for stock in wanted_stocks:
    train_data = []
    val_data = []
    test_data = []
    for near_stock in [stock] + list(get_nearest_stocks(stock)):
        data = common.StockData(near_stock)

        relevant_df = data.training[['datetime', 'close']]
        relevant_df = relevant_df.set_index('datetime', drop=True)
        relevant_df.columns = [near_stock]
        train_data.append(relevant_df)

        relevant_df = data.validation[['datetime', 'close']]
        relevant_df = relevant_df.set_index('datetime', drop=True)
        relevant_df.columns = [near_stock]
        val_data.append(relevant_df)
        
        relevant_df = data.test[['datetime', 'close']]
        relevant_df = relevant_df.set_index('datetime', drop=True)
        relevant_df.columns = [near_stock]
        test_data.append(relevant_df)

    train_data = reduce(lambda  left,right: pd.merge(left,right, how='outer', left_index=True, right_index=True), train_data)
    val_data = reduce(lambda  left,right: pd.merge(left,right, how='outer', left_index=True, right_index=True), val_data)
    test_data = reduce(lambda  left,right: pd.merge(left,right, how='outer', left_index=True, right_index=True), test_data)
    train_data.to_csv(f'most_common_stocks/training/{stock}.csv')
    val_data.to_csv(f'most_common_stocks/validation/{stock}.csv')   
    test_data.to_csv(f'most_common_stocks/test/{stock}.csv')   
    print(f"{stock} Done")
    
    

V Done
AAPL Done
GOOGL Done
JNJ Done
AMZN Done
XOM Done
JPM Done
KO Done
SPY Done
MSFT Done


In [12]:
data[0].training

Unnamed: 0,datetime,open,high,low,close,volume
0,2017-01-26 16:30:00,125.570,125.690,125.570,125.660,0.3360
1,2017-01-26 16:31:00,125.670,125.670,125.610,125.630,0.2796
2,2017-01-26 16:32:00,125.630,125.690,125.600,125.690,0.2770
3,2017-01-26 16:33:00,125.700,125.790,125.700,125.780,0.3564
4,2017-01-26 16:34:00,125.790,125.810,125.750,125.770,0.3098
...,...,...,...,...,...,...
203938,2019-02-28 22:55:00,173.118,173.118,172.938,172.988,1.8648
203939,2019-02-28 22:56:00,172.988,173.048,172.907,172.988,1.8967
203940,2019-02-28 22:57:00,172.988,172.998,172.888,172.918,1.9429
203941,2019-02-28 22:58:00,172.917,173.058,172.907,173.038,1.9219


In [2]:
apple_data = common.get_dukas_data("AAPL")
apple_data

Unnamed: 0,Local time,Open,High,Low,Close,Volume
0,01.01.2019 00:00:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
1,01.01.2019 00:01:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
2,01.01.2019 00:02:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
3,01.01.2019 00:03:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
4,01.01.2019 00:04:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
...,...,...,...,...,...,...
1736755,31.12.2018 23:55:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
1736756,31.12.2018 23:56:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
1736757,31.12.2018 23:57:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0
1736758,31.12.2018 23:58:00.000 GMT+0200,157.718,157.718,157.718,157.718,0.0


In [3]:
def preprocess_data_from_dukas(df):
    
    # rename columns
    new_cols_map = {col: col.lower() for col in df}
    new_cols_map['Local time'] = "datetime"
    df = df.rename(columns=new_cols_map)

    # convert datetime type (string to datetime)
    df['datetime'] = df['datetime'].str.replace(r"\:00\.000 GMT\+0\d00","")
    df['datetime'] = pd.to_datetime(df['datetime'], format='%d.%m.%Y %H:%M')
    
    df = df.set_index(['datetime']).between_time("16:30", "23:00").reset_index()
    df = df.sort_values(by=['datetime'])
    
    # drop inactive days
    df['day_of_week'] = df['datetime'].dt.dayofweek
    mask = (df['day_of_week'] != 5) & (df['day_of_week'] != 6)
    df = df[mask]
    del df['day_of_week']
    df = df.reset_index(drop=True)

    return df

apple_data = preprocess_data_from_dukas(apple_data)
apple_data


Unnamed: 0,datetime,open,high,low,close,volume
0,2017-01-26 16:30:00,121.680,121.894,121.660,121.710,0.4459
1,2017-01-26 16:31:00,121.720,121.894,121.702,121.891,0.3697
2,2017-01-26 16:32:00,121.851,122.090,121.840,122.060,0.4814
3,2017-01-26 16:33:00,122.070,122.240,122.053,122.240,0.6113
4,2017-01-26 16:34:00,122.240,122.430,122.150,122.213,0.6310
...,...,...,...,...,...,...
337484,2020-05-22 22:56:00,318.317,318.538,318.238,318.438,3.4204
337485,2020-05-22 22:57:00,318.448,318.678,318.388,318.677,3.6803
337486,2020-05-22 22:58:00,318.678,318.918,318.638,318.818,3.8403
337487,2020-05-22 22:59:00,318.818,319.168,318.688,318.688,3.5204


In [4]:
apple_data['target'] = np.where(apple_data['close'] < apple_data['close'].shift(-50), 1, 0)
print("Long / Short precentage: " + str(apple_data['target'].sum() / len(apple_data)))
apple_data

Long / Short precentage: 0.5024341534094444


Unnamed: 0,datetime,open,high,low,close,volume,target
0,2017-01-26 16:30:00,121.680,121.894,121.660,121.710,0.4459,1
1,2017-01-26 16:31:00,121.720,121.894,121.702,121.891,0.3697,0
2,2017-01-26 16:32:00,121.851,122.090,121.840,122.060,0.4814,0
3,2017-01-26 16:33:00,122.070,122.240,122.053,122.240,0.6113,0
4,2017-01-26 16:34:00,122.240,122.430,122.150,122.213,0.6310,0
...,...,...,...,...,...,...,...
337484,2020-05-22 22:56:00,318.317,318.538,318.238,318.438,3.4204,0
337485,2020-05-22 22:57:00,318.448,318.678,318.388,318.677,3.6803,0
337486,2020-05-22 22:58:00,318.678,318.918,318.638,318.818,3.8403,0
337487,2020-05-22 22:59:00,318.818,319.168,318.688,318.688,3.5204,0


In [5]:
# i will start using only close & volume prices
apple_close_price = apple_data[['close', 'volume']].values
apple_targets = apple_data['target'].values.reshape(-1, 1)
from keras.utils import to_categorical
apple_targets = to_categorical(apple_targets)
apple_targets

Using TensorFlow backend.


array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [6]:
from dataclasses import dataclass

@dataclass
class SingleDataSet:
    X: np.array
    y: np.array
    
    def generator(self, shuffle=False):
        shifted_y = np.insert(self.y, 0, 0, axis=0)[:-1]  # TimeseriesGenerator uses stride=1, but we want stride=0
        return TimeseriesGenerator(self.X, shifted_y, length=500, sampling_rate=1, stride=1, 
                                   batch_size=128, shuffle=shuffle)


class FullDataSet(NamedTuple):
    training: SingleDataSet
    validation: SingleDataSet
    test: SingleDataSet

    x_scaler: Optional[MinMaxScaler] = None
    y_scaler: Optional[MinMaxScaler] = None

    def normelize(self):
        x_scaler = MinMaxScaler()
        y_scaler = MinMaxScaler()
        x_scaler.fit(self.training.X)
        y_scaler.fit(self.training.y)
        datasets = []
        for dataset in [self.training, self.validation, self.test]:
            new_X = x_scaler.transform(dataset.X)
            new_y = y_scaler.transform(dataset.y)
            datasets.append(SingleDataSet(new_X, new_y))
        return FullDataSet(*datasets, x_scaler, y_scaler)


def train_val_test_split(samples, features) -> FullDataSet:
    X_train, X_test, y_train, y_test = train_test_split(
        samples, features, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, shuffle=False)
    train = SingleDataSet(X_train, y_train)
    validation = SingleDataSet(X_val, y_val)
    test = SingleDataSet(X_test, y_test)
    return FullDataSet(train, validation, test)


dataset = train_val_test_split(apple_close_price, apple_targets)


In [7]:
normelize_dataset = dataset.normelize()


In [8]:

def get_callbacks(model_name: str):
    # define the checkpoint
    filepath="weights_improvement_" + model_name + "_{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

    # tensorboard
    log_dir = f'logs/fit/{model_name}/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}'
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                   patience=100, min_delta=0.0001)

    return [checkpoint, tensorboard_callback, es]



In [9]:
def pnl_using_price(curr_price, predicted_price, true_price):    
    predicted_roc = predicted_price / curr_price - 1
    true_roc = true_price / curr_price - 1
    return rate_of_change_PnL(predicted_roc, true_roc)


# from common.py - will return a vector of profits/losses (percentage wise) for every trade
def rate_of_change_PnL(pred_roc, actual_roc):
    """ calculate the profit/loss of every prediction """
    return (pred_roc > 0) * actual_roc - (pred_roc < 0) * actual_roc


def mean(l: list) -> float:
    return sum(l) / len(l)


def calculate_model_pnl(model, dataset: FullDataSet, data_to_use: str) -> float:
    data_generator = getattr(dataset, data_to_use).generator()
    # checking how well the model did:
    pnls = []
    for batch in data_generator:
        samples, targets = batch
        predictions = model.predict(samples)
        for sample, target, prediction in zip(samples, targets, predictions):
            original_last_price_point = dataset.x_scaler.inverse_transform(sample)[:, 0][-1]
            original_prediction = dataset.y_scaler.inverse_transform(prediction.reshape(1, 1))[0][0]
            original_target = dataset.y_scaler.inverse_transform(target.reshape(1, 1))[0][0]
            pnls.append(pnl_using_price(original_last_price_point, original_prediction, original_target))
        
    return mean(pnls)


In [10]:
train_generator = normelize_dataset.training.generator(shuffle=True)
n_timesteps = train_generator[0][0][0].shape[0]
n_features = train_generator[0][0][0].shape[1]
n_outputs = train_generator[0][1][0].shape[0]

In [11]:
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_timesteps,n_features)))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(n_outputs, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 498, 64)           448       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 496, 64)           12352     
_________________________________________________________________
dropout (Dropout)            (None, 496, 64)           0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 248, 64)           0         
_________________________________________________________________
flatten (Flatten)            (None, 15872)             0         
_________________________________________________________________
dense (Dense)                (None, 100)               1587300   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 2

In [12]:
train_generator = normelize_dataset.training.generator(shuffle=True)

model.fit(
    train_generator,epochs=200, steps_per_epoch=200,
    validation_data=normelize_dataset.validation.generator(), validation_steps=50,
    callbacks=get_callbacks("CNN_categorical_200_epochs"))


Train for 200 steps, validate for 50 steps
Epoch 1/200
Epoch 00001: loss improved from inf to 0.69508, saving model to weights_improvement_CNN_categorical_200_epochs_01-0.6951.hdf5
Epoch 2/200
Epoch 00002: loss improved from 0.69508 to 0.69279, saving model to weights_improvement_CNN_categorical_200_epochs_02-0.6928.hdf5
Epoch 3/200
Epoch 00003: loss improved from 0.69279 to 0.69106, saving model to weights_improvement_CNN_categorical_200_epochs_03-0.6911.hdf5
Epoch 4/200
Epoch 00004: loss improved from 0.69106 to 0.68406, saving model to weights_improvement_CNN_categorical_200_epochs_04-0.6841.hdf5
Epoch 5/200
Epoch 00005: loss improved from 0.68406 to 0.67773, saving model to weights_improvement_CNN_categorical_200_epochs_05-0.6777.hdf5
Epoch 6/200
Epoch 00006: loss improved from 0.67773 to 0.67452, saving model to weights_improvement_CNN_categorical_200_epochs_06-0.6745.hdf5
Epoch 7/200
Epoch 00007: loss improved from 0.67452 to 0.67342, saving model to weights_improvement_CNN_categ

Epoch 00024: loss did not improve from 0.63604
Epoch 25/200
Epoch 00025: loss improved from 0.63604 to 0.63445, saving model to weights_improvement_CNN_categorical_200_epochs_25-0.6344.hdf5
Epoch 26/200
Epoch 00026: loss improved from 0.63445 to 0.63169, saving model to weights_improvement_CNN_categorical_200_epochs_26-0.6317.hdf5
Epoch 27/200
Epoch 00027: loss improved from 0.63169 to 0.63051, saving model to weights_improvement_CNN_categorical_200_epochs_27-0.6305.hdf5
Epoch 28/200
Epoch 00028: loss improved from 0.63051 to 0.62034, saving model to weights_improvement_CNN_categorical_200_epochs_28-0.6203.hdf5
Epoch 29/200
Epoch 00029: loss did not improve from 0.62034
Epoch 30/200
Epoch 00030: loss improved from 0.62034 to 0.61794, saving model to weights_improvement_CNN_categorical_200_epochs_30-0.6179.hdf5
Epoch 31/200
Epoch 00031: loss improved from 0.61794 to 0.61455, saving model to weights_improvement_CNN_categorical_200_epochs_31-0.6145.hdf5
Epoch 32/200
Epoch 00032: loss impr

Epoch 73/200
Epoch 00073: loss improved from 0.47667 to 0.47008, saving model to weights_improvement_CNN_categorical_200_epochs_73-0.4701.hdf5
Epoch 74/200
Epoch 00074: loss improved from 0.47008 to 0.46661, saving model to weights_improvement_CNN_categorical_200_epochs_74-0.4666.hdf5
Epoch 75/200
Epoch 00075: loss did not improve from 0.46661
Epoch 76/200
Epoch 00076: loss improved from 0.46661 to 0.46152, saving model to weights_improvement_CNN_categorical_200_epochs_76-0.4615.hdf5
Epoch 77/200
Epoch 00077: loss improved from 0.46152 to 0.45931, saving model to weights_improvement_CNN_categorical_200_epochs_77-0.4593.hdf5
Epoch 78/200
Epoch 00078: loss did not improve from 0.45931
Epoch 79/200
Epoch 00079: loss improved from 0.45931 to 0.45575, saving model to weights_improvement_CNN_categorical_200_epochs_79-0.4558.hdf5
Epoch 80/200
Epoch 00080: loss improved from 0.45575 to 0.45026, saving model to weights_improvement_CNN_categorical_200_epochs_80-0.4503.hdf5
Epoch 81/200
Epoch 000

Epoch 98/200
Epoch 00098: loss did not improve from 0.40990
Epoch 99/200
Epoch 00099: loss improved from 0.40990 to 0.40931, saving model to weights_improvement_CNN_categorical_200_epochs_99-0.4093.hdf5
Epoch 100/200
Epoch 00100: loss did not improve from 0.40931
Epoch 101/200
Epoch 00101: loss improved from 0.40931 to 0.40874, saving model to weights_improvement_CNN_categorical_200_epochs_101-0.4087.hdf5
Epoch 00101: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fbde126fc50>

In [11]:
model = load_model("weights_improvement_CNN_categorical_200_epochs_101-0.4087.hdf5")

In [14]:
first_batch = normelize_dataset.validation.generator()[0]
loss, accuracy = model.evaluate(normelize_dataset.validation.generator(), verbose=1)
loss, accuracy
# normelize_dataset.validation.generator()
# model.evaluate(first_batch)



(1.9144682519215672, 0.5080749)

In [12]:
predictions = model.predict(normelize_dataset.validation.generator())
predictions

array([[1.6551429e-01, 8.3448571e-01],
       [3.7632060e-01, 6.2367940e-01],
       [3.6860329e-01, 6.3139671e-01],
       ...,
       [2.1844049e-19, 1.0000000e+00],
       [1.5735119e-19, 1.0000000e+00],
       [6.8533295e-18, 1.0000000e+00]], dtype=float32)

In [13]:
prices_normelized = []
targets = []
for batch_of_x, batch_of_y in normelize_dataset.validation.generator():
    for x, y in zip(batch_of_x, batch_of_y):
        last_price = x[-1][0]
        prices_normelized.append(last_price)
        targets.append(np.argmax(y, axis=-1))
        
prices_normelized = np.array(prices_normelized)
targets = np.array(targets)



In [14]:
original_prices = dataset.validation.X[:, 0][499:-1]
original_target = np.argmax(normelize_dataset.validation.y[499:-1], axis=-1)


In [15]:
prices_and_dummy_volumes = normelize_dataset.x_scaler.inverse_transform(np.column_stack([prices_normelized, np.zeros(prices_normelized.shape[0])]))
prices = prices_and_dummy_volumes[:, 0]
prices

array([154.118, 153.927, 154.017, ..., 219.328, 219.068, 218.958])

In [16]:
prices_and_predictions = pd.DataFrame(
    {'close': prices,
     'original_close': original_prices,
     'prediction': np.argmax(predictions, axis=-1),
     'target': targets,
     'original_target': original_target
    })
prices_and_predictions

Unnamed: 0,close,original_close,prediction,target,original_target
0,154.118,154.118,1,0,0
1,153.927,153.927,1,0,0
2,154.017,154.017,1,0,0
3,153.968,153.968,1,0,0
4,153.978,153.978,1,0,0
...,...,...,...,...,...
66993,219.478,219.478,1,0,0
66994,219.498,219.498,1,0,0
66995,219.328,219.328,1,0,0
66996,219.068,219.068,1,0,0


In [17]:
prices_and_predictions['target_calculated'] = np.where(prices_and_predictions['close'] < prices_and_predictions['close'].shift(-50), 1, 0)
prices_and_predictions

Unnamed: 0,close,original_close,prediction,target,original_target,target_calculated
0,154.118,154.118,1,0,0,0
1,153.927,153.927,1,0,0,0
2,154.017,154.017,1,0,0,0
3,153.968,153.968,1,0,0,0
4,153.978,153.978,1,0,0,0
...,...,...,...,...,...,...
66993,219.478,219.478,1,0,0,0
66994,219.498,219.498,1,0,0,0
66995,219.328,219.328,1,0,0,0
66996,219.068,219.068,1,0,0,0


In [26]:
prices_and_predictions['future_price'] = prices_and_predictions['close'].shift(-50)
prices_and_predictions[['close', 'target', 'prediction', 'future_price']]

Unnamed: 0,close,target,prediction,future_price
0,154.118,0,1,152.798
1,153.927,0,1,152.727
2,154.017,0,1,152.738
3,153.968,0,1,152.708
4,153.978,0,1,152.848
...,...,...,...,...
66993,219.478,0,1,
66994,219.498,0,1,
66995,219.328,0,1,
66996,219.068,0,1,


In [30]:
prices_and_predictions['tmp'] = prices_and_predictions['target'] == prices_and_predictions['prediction']
prices_and_predictions['roc'] = prices_and_predictions['future_price'] / prices_and_predictions['close'] - 1
prices_and_predictions['tmp'][prices_and_predictions['tmp']==0] = -1

prices_and_predictions['pnl'] = prices_and_predictions['tmp'] * prices_and_predictions['roc'].abs()


prices_and_predictions['true_pnl'] = prices_and_predictions['pnl'].abs()
prices_and_predictions.head(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,close,original_close,prediction,target,original_target,target_calculated,future_price,tmp,roc,pnl,true_pnl
0,154.118,154.118,1,0,0,0,152.798,-1.0,-0.008565,-0.008565,0.008565
1,153.927,153.927,1,0,0,0,152.727,-1.0,-0.007796,-0.007796,0.007796
2,154.017,154.017,1,0,0,0,152.738,-1.0,-0.008304,-0.008304,0.008304
3,153.968,153.968,1,0,0,0,152.708,-1.0,-0.008184,-0.008184,0.008184
4,153.978,153.978,1,0,0,0,152.848,-1.0,-0.007339,-0.007339,0.007339
5,154.018,154.018,1,0,0,0,152.828,-1.0,-0.007726,-0.007726,0.007726
6,153.818,153.818,1,0,0,0,152.878,-1.0,-0.006111,-0.006111,0.006111
7,153.707,153.707,1,0,0,0,152.857,-1.0,-0.00553,-0.00553,0.00553
8,153.818,153.818,1,0,0,0,152.847,-1.0,-0.006313,-0.006313,0.006313
9,153.867,153.867,1,0,0,0,152.817,-1.0,-0.006824,-0.006824,0.006824


In [41]:
prices_and_predictions[['pnl', 'true_pnl']].describe()

Unnamed: 0,pnl,true_pnl
count,66948.0,66948.0
mean,4.1e-05,0.003401
std,0.006155,0.005131
min,-0.056876,0.0
25%,-0.001901,0.000795
50%,0.0,0.001947
75%,0.001993,0.00392
max,0.060842,0.060842
