<a href="https://colab.research.google.com/github/ahsank/StockML/blob/main/breakout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -q yahoo_fin

In [2]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from yahoo_fin import stock_info as si
from collections import deque

import os
import numpy as np
import pandas as pd
import random
import time
from tensorflow.keras.layers import LSTM
import matplotlib.pyplot as plt


In [42]:
def loaddataBO(alldf, steps=12, shuffle=True, skip=6, lookup=60,
               split_by_date=False, target="scale_target", test_size=0.2,
               features = ["pct_close", "pct_vol"]):
    xdata, ydata = [], []
    alldf = alldf[alldf.close > 0]
    alldf = alldf[alldf.vol > 1000]
    tickers = alldf.ticker.unique()
    last_seq = []
    alldf.loc[:, 'pct_vol'] = alldf['vol'].pct_change()
    alldf.loc[:, 'pct_close'] = alldf['close'].pct_change()
    alldf.loc[:, 'mean_close'] = alldf['close'].rolling(steps).mean()
    alldf.dropna(inplace=True)
    newtickers = []
    for ticker in tickers:
        last_row = alldf[alldf.ticker == ticker].tail(steps)[features]
        assert not last_row.isnull().values.any()
        if len(last_row) > 0:
            last_seq.append(np.array(last_row))
            newtickers.append(ticker)
    tickers = newtickers
    last_seq = np.array(last_seq)
    alldf.loc[:, 'target'] = alldf.loc[:, 'close'].rolling(lookup).max().shift(-lookup)
    alldf.dropna(inplace=True)
    alldf.loc[:, 'pct_target'] = alldf['target']/alldf['mean_close']
    alldf.dropna(inplace=True)

    scale = 5.0
    alldf['scale_target'] = alldf['pct_target'].apply(lambda x:  min(x,scale)*2.0/scale - 1)

#    alldf.dropna(inplace=True)
    alldf['timestamp'] = pd.to_datetime(alldf['timestamp'])
    for ticker in tickers:
        lasttime = None
        tickerdf = alldf[alldf.ticker == ticker]
        predx = tickerdf.tail(steps)[features]

        for i in range(len(tickerdf)-steps-lookup):
            if lasttime is None:
                lasttime = tickerdf.iloc[i].timestamp

            if tickerdf.iloc[i].timestamp < lasttime:
                continue
            else:
                lasttime = lasttime + relativedelta(months=skip)

            # start_row = i
            # end_row = i+steps
            # xrow = tickerdf.iloc[end_row-1, :]
            # tsi = xrow.timestamp
            # closei = xrow.close
            # checktime = tsi + relativedelta(months=lookup)
            # mean_value = tickerdf.iloc[start_row:end_row]['close'].mean()
            # rate = closei / mean_value
            rate = tickerdf.iloc[i+steps-1][target]
            seq = tickerdf.iloc[i:i+steps][features]
            xdata.append(np.array(seq))
            ydata.append(rate)
    xdata = np.array(xdata)
    ydata = np.array(ydata)
    if split_by_date:
        train_samples = int((1-test_size) * len(xdata))
        xtrain = xdata[:train_samples]
        ytrain = ydata[:train_samples]
        xtest = xdata[train_samples:]
        ytest = ydata[train_samples:]
    else:
        xtrain, xtest, ytrain, ytest = train_test_split(xdata, ydata, test_size=test_size, shuffle=shuffle)
    xtrain = xtrain[:, :, :len(features)].astype(np.float32)
    last_seq = last_seq[:, :, :len(features)].astype(np.float32)
    xtest = xtest[:, :, :len(features)].astype(np.float32)
    return xtrain, ytrain, xtest, ytest, (tickers, last_seq), alldf


In [61]:
def create_model(sequence_length, n_features, units=64, cell=LSTM,
                 n_layers=2, dropout=0.4,
                 loss="huber_loss", optimizer="adam", bidirectional=False):
    model = Sequential()
    for i in range(n_layers):
        if i == 0:
            # first layer
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=True), batch_input_shape=(None, sequence_length, n_features)))
            else:
                model.add(cell(units, return_sequences=True, batch_input_shape=(None, sequence_length, n_features)))
        elif i == n_layers - 1:
            # last layer
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=False)))
            else:
                model.add(cell(units, return_sequences=False))
        else:
            # hidden layers
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=True)))
            else:
                model.add(cell(units, return_sequences=True))
        # add dropout after each layer
        model.add(Dropout(dropout))
    model.add(Dense(1, activation="linear"))
    model.compile(loss=loss, metrics=["mean_absolute_error"], optimizer=optimizer)
    return model


In [5]:
def mytrain(model_name, model, xtrain,
            ytrain, xtest, ytest, epochs, batch_size=64):
    checkpointer = ModelCheckpoint("test.h5", save_weights_only=True,
                                   save_best_only = True, verbose=1)
    tensorboard = TensorBoard(log_dir=os.path.join("logs", model_name))
    earlystopping = EarlyStopping(monitor='loss', patience=5)
    model.fit(xtrain, ytrain, batch_size=batch_size, epochs=epochs,
              validation_data=(xtest, ytest),
              callbacks = [checkpointer, tensorboard, earlystopping],
              verbose=1)


In [6]:
def do_op(alldf=None, model=None, epoch=10):
    if alldf is None:
        alldf = pd.read_csv("alldf.csv")
    xtrain, ytrain, xtest, ytest, pred_pair = loaddataBO(alldf)
    if model is None:
        model = create_model(12, 2)
    mytrain("testmodel", model, xtrain, ytrain, xtest, ytest, epoch)
    result = {}
    tickers, last_seq = pred_pair
    preds = model.predict(last_seq)
    result = pd.DataFrame({'ticker': tickers, 'pred': preds[:, 0]})
    return result. alldf


In [43]:
alldf = pd.read_csv("alldf.csv")


In [44]:
xtrain, ytrain, xtest, ytest, pred_pair, newdf = loaddataBO(alldf)

In [62]:
model = create_model(12, 2)

In [48]:
ytest


array([-0.72458633,  0.05329758,  1.        , ..., -0.40994979,
       -0.30115314,  1.        ])

In [None]:
mytrain("testmodel", model, xtrain, ytrain, xtest, ytest, 50)

In [65]:
tickers, last_seq = pred_pair
preds = model.predict(last_seq)
result = pd.DataFrame({'ticker': tickers, 'pred': preds[:, 0]})



In [66]:
result[result.pred > 0.5].sort_values(by=['pred'], ascending=False).head(50)


Unnamed: 0,ticker,pred
1446,RIGL,1.043295
2703,GSAT,0.998527
189,AENZ,0.99822
939,ERNA,0.961528
1104,UHG,0.958399
2649,GETY,0.904892
2352,CYH,0.901413
106,NCTY,0.894018
1557,CTSO,0.885607
3150,CCO,0.883837
