In [32]:
import numpy as np
import pandas as pd
from preprocessing.wrangling import get_indi_df, get_labels, slide_and_flatten
from preprocessing.extract_features import get_all_ta_features, get_wavelet_coeffs
from evaluation.eval import sliding_window_cv_regression, batch_test_swcv_regression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from numpy.lib.stride_tricks import sliding_window_view
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline
import datetime
import os
import csv
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from math import prod

In [33]:
def add_closing_price(y, cls_price):
    return y + cls_price

In [34]:
class PersistanceModel:
    def __init__(self, persist_colname='Close'):
        self.persist_colname = persist_colname

    def __repr__(self):
        return "PersistanceModel(persist_colname={})".format(self.persist_colname)

    def fit(self, Xtr, ytr):
        pass

    def predict(self, Xts):
        return Xts.loc[:, self.persist_colname]


In [35]:
class Decoder(nn.Module):
    def __init__(self, latent_dims, op_dim):
        super(Decoder, self).__init__()
        self.linear1 = nn.Linear(latent_dims, op_dim//2)
        self.linear2 = nn.Linear(op_dim//2, op_dim)

    def forward(self, z):
        z = F.relu(self.linear1(z))
        z = torch.sigmoid(self.linear2(z))
        # return z.reshape((-1, 1, 28, 28))
        return z

In [36]:
class VariationalEncoder(nn.Module):
    def __init__(self, ip_dim, latent_dims):
        assert ip_dim//2 >= latent_dims, "Ensure ip_dim//2 () >= latent_dims ()".format(ip_dim//2, latent_dims)
        super(VariationalEncoder, self).__init__()
        self.linear1 = nn.Linear(ip_dim, ip_dim//2)
        self.linear2 = nn.Linear(ip_dim//2, latent_dims)
        self.linear3 = nn.Linear(ip_dim//2, latent_dims)

        self.N = torch.distributions.Normal(0, 1)
        # self.N.loc = self.N.loc.cuda() # hack to get sampling on the GPU
        # self.N.scale = self.N.scale.cuda()
        self.kl = 0

    def forward(self, x):
        x = torch.flatten(x, start_dim=1)
        print('xmax', torch.max(x), 'xmin', torch.min(x))
        print('pfltn', x)
        x = self.linear1(x)
        print('pl1', x)
        x = F.relu(x)
        print('prelu', x)
        mu =  self.linear2(x)
        sigma = torch.exp(self.linear3(x))
        z = mu + sigma*self.N.sample(mu.shape)
        print('x', x, 'mu', mu, 'sigma', sigma, 'z', z)
        self.kl = (sigma**2 + mu**2 - torch.log(sigma) - 1/2).sum()
        return z

In [37]:
class VariationalAutoencoder(nn.Module):
    def __init__(self, ip_dim, latent_dims):
        super(VariationalAutoencoder, self).__init__()
        self.encoder = VariationalEncoder(ip_dim, latent_dims)
        self.decoder = Decoder(latent_dims, ip_dim)

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

In [38]:
def train_vae(autoencoder, data, epochs=20):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    opt = torch.optim.Adam(autoencoder.parameters())
    for epoch in range(epochs):
        x = data  # Full batch gradient descent
    # for x in data:
        x = x.to(device) # GPU
        opt.zero_grad()
        print(x)
        x_hat = autoencoder(x)
        print(x, x_hat)
        loss = ((x.reshape(x_hat.shape) - x_hat)**2).sum() + autoencoder.encoder.kl
        print(loss)
        loss.backward()
        opt.step()
    return autoencoder

In [39]:
def sliding_window_cv_torch(swdf, y, model, optimizer, loss_fn, n_tr, n_ts=1, scorers=[], comment="", post_processor=None):
    assert len(swdf) == len(y), "Length of X ([]) must match that of y ([]).".format(len(X), len(y))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    y_pred = []
    y_target = []
    agg_results = {}
    if post_processor is not None:
        post_processor_f, post_processor_args = post_processor[0], post_processor[1]
        

    for i_tr_start in range(0, len(swdf)-(n_tr+n_ts)):
        # The last i_ts_end should be len(X).
        # i_ts_end = i_ts_start + n_ts
        # Now, i_tr_end = i_ts_start
        # So, i_tr_start = i_ts_start - n_tr
        # But, i_ts_start = i_ts_end - n_ts
        # Thus, i_tr_start = i_ts_end - n_tr - n_ts
        # Hence, last i_tr_start = len(X) - (n_tr + n_ts)

        i_tr_end = i_ts_start = i_tr_start + n_tr 
        i_ts_end = i_ts_start + n_ts 

        Xtr, Xts = swdf[i_tr_start:i_tr_end, :, :], swdf[i_ts_start:i_ts_end, :, :]
        ytr, yts = y[i_tr_start:i_tr_end].to_numpy(), y[i_ts_start:i_ts_end].to_numpy()
        Xtr, Xts, ytr, yts = torch.Tensor(Xtr), torch.Tensor(Xts), torch.Tensor(ytr), torch.Tensor(yts)
        Xtr, Xts, ytr, yts = Xtr.float().to(device), Xts.float().to(device), ytr.float().to(device), yts.float().to(device)
        model.to(device)
        
        epochs = 150
        for e in range(epochs):
            model.train()
            pred = model(Xtr)
            loss = loss_fn(pred, ytr)
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            pred = model(Xts)
            mape = torch.mean((torch.abs((yts - pred) / yts)) * 100)
            if device == "cuda":
                pred = pred.detach()
                yts = yts.detach()

            if len(pred.shape) == 0:
                pred = pred.unsqueeze(0)
            if pred.shape[0] == 1:
                y_pred.append(pred.item())
                y_target.append(yts.item())
            else:
                y_pred, yts = list(y_pred), list(yts)
                y_pred.extend(y_pred)
                y_target.extend(yts)

    # print(len(y_target), len(y_pred))
    # print(y_pred, y_target)
    if len(y_pred) > 1:
        y_pred = np.squeeze(y_pred)

    if post_processor is not None:
        y_pred = post_processor_f(y_pred, **post_processor_args)
        y_target = post_processor_f(y_target, **post_processor_args)

    print("y_target, y_pred", y_target, y_pred)
    agg_results['time'] = datetime.datetime.now()
    agg_results['model'] = str(model)
    agg_results['comment'] = comment
    for scorer in scorers:
        agg_results[scorer.__name__] = scorer(y_target, y_pred)

    return agg_results





In [40]:
class LSTMBaseline(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTMBaseline, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.dense = nn.Linear(hidden_size, 1)
    
    def forward(self, swdf):
        _, hncn = self.lstm(swdf)
        hn, cn = hncn
        hn = hn.squeeze()
        op = self.dense(hn)
        return op.squeeze()
    

In [42]:
list_dir = 'data_collection/stocks_list'
list_prefix = "ind_nifty"
list_suffix = "list.csv"
save_dir = 'data_collection/ohlcv_data'
save_prefix = "ohlcv_"
save_suffix = ".csv"
resultfile = "results/baseline_lstm_vae.csv"
cap_n_stocks = 10

skip_till = "HEROMOTOCO.NS"  # Set to None if nothing is to be skipped. 
results = []
start = skip_till is None

use_vae = True
vae_latent_dims = 2 # Must be > half of features.

len_window = 10
n_tr = 60
n_ts = 1

for f in os.listdir(list_dir):
    if f.startswith(list_prefix) and f.endswith(list_suffix):
        savefile = os.path.join(save_dir, save_prefix+f[9:-8]+save_suffix)
        listfile = os.path.join(list_dir, f)
        p = pd.read_csv(listfile)
        symbols = list(p['Symbol'].values + '.NS')
        if cap_n_stocks <= 0:
            break
        for symbol in symbols:
            if not start:
                if symbol != skip_till:
                    continue 
                else:
                    start = True 
                    continue  # skip_till is skip inclusive i.e. stock = skip_till will be skipped.

            cap_n_stocks -= 1
            if cap_n_stocks <= 0:
                break

            df = get_indi_df(symbol, ohlcvfile=savefile, start_date="2017-01-01")
            # df = get_all_ta_features(df)
            drop_columns = ['Date', 'Adj Close', 'Volume']
            df.drop(drop_columns, axis=1, inplace=True)
            move_dir_target, cls_target = get_labels(df['Close'])
            df = df.iloc[:-1]
            unflattened_df_cls = df['Close']

            min_max_scaler = MinMaxScaler()
            df = min_max_scaler.fit_transform(df)  # Data Leakage! To be fixed.
            if use_vae:
                vae = VariationalAutoencoder(ip_dim=prod(df.shape)//df.shape[0], latent_dims=vae_latent_dims)
                vae = train_vae(vae, torch.Tensor(df), epochs=50)
                df = vae.encoder(torch.Tensor(df)).detach().numpy()
                print(pd.DataFrame(df).head())

            swdf10 = (sliding_window_view(df, (10, df.shape[1]))).squeeze()
            cls_target10 = cls_target.iloc[(10 - 1):-1]
            # df10 = slide_and_flatten(df, window_len=10)
            # df10 = pd.DataFrame(df10, index=df.index[9:])
            # df30 = slide_and_flatten(df, window_len=30)
            # df30 = pd.DataFrame(df30, index=df.index[29:])
            # df60 = slide_and_flatten(df, window_len=60)
            # df60 = pd.DataFrame(df60, index=df.index[59:])
            # print(swdf10.shape, cls_target10.shape)
            #           
            model = LSTMBaseline(swdf10.shape[2], 64)
            optimizer = torch.optim.RMSprop(model.parameters(), lr=0.1)
            loss_fn = nn.MSELoss()
            post_processor = (add_closing_price, {'cls_price':unflattened_df_cls.iloc[(len_window-1):len(unflattened_df_cls)-(n_tr+n_ts)]})
            result = sliding_window_cv_torch(swdf10, cls_target10, model, optimizer, loss_fn, n_tr=n_tr, n_ts=n_ts, 
            scorers=[mean_squared_error,mean_absolute_percentage_error, r2_score], comment="lstm_woattn_ohlcv_{}".format(symbol), post_processor=None)
            results.append(result)
            if resultfile is not None:
                file_exists = os.path.isfile(resultfile)

                with open(resultfile, 'a', newline='') as f:
                    writer = csv.DictWriter(f, fieldnames=results[0].keys(), delimiter=',', lineterminator='\n')

                    if not file_exists:
                        writer.writeheader()  # file doesn't exist yet, write a header

                    writer.writerows(results)

            print("Processing completed for {}".format(symbol))

5156, 240.9499969482422, 236.25, 249.3000030517578, 252.1999969482422, 240.0, 239.85000610351562, 233.89999389648438, 252.1999969482422, 251.8000030517578, 243.6999969482422, 229.60000610351562, 220.39999389648438, 222.10000610351562, 210.85000610351562, 202.9499969482422, 202.14999389648438, 219.14999389648438, 222.25, 218.14999389648438, 226.64999389648438, 240.64999389648438, 250.25, 233.4499969482422, 247.6999969482422, 263.25, 259.8500061035156, 268.75, 267.5, 283.5, 268.6499938964844, 268.0, 263.20001220703125, 271.6499938964844, 290.6000061035156, 311.25, 350.8500061035156, 331.3500061035156, 335.3999938964844, 337.95001220703125, 328.20001220703125, 319.29998779296875, 325.8500061035156, 346.25, 347.1499938964844, 356.95001220703125, 394.70001220703125, 379.29998779296875, 381.1000061035156, 373.95001220703125, 370.1000061035156, 352.3999938964844, 349.3999938964844, 342.5, 352.79998779296875, 350.5, 353.20001220703125, 343.8999938964844, 349.95001220703125, 341.3500061035156, 