In [108]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view as sliding_window_view
import pickle
import datetime


class NSEDataset(Dataset):
    """Sliding-window dataset of a target ticker plus its most (anti-)correlated peers.

    For every window end position the constructor correlates the target feature
    with every candidate column over the trailing ``len_corr_traceback`` rows,
    keeps the ``nP`` most positively and ``nN`` most negatively correlated
    columns, and stores the last ``len_window`` rows of those columns.

    Attributes set by ``__init__``:
        df: candidate features, indexed by date, columns named ``<ticker>_<feature>``.
        mainstream_df: features of the target ticker, same index as ``df``.
        unshifted_target: the target feature column (not trimmed).
        target: ``unshifted_target`` shifted one row into the future.
        swdf: 2-D array with one row per sample. Each row is the concatenation
            of per-column windows (all ``len_window`` values of the first
            selected column, then the second, ...).
        index_data_df: ``Close`` of the market index on the dates present in ``df``.
    """

    def __init__(self, ohlcv_dir, target_ticker, target_ticker_file, len_window, len_corr_traceback, nP, nN,
                 keep_tickers=None, ohlcv_prefix='', ohlcv_sufix='', ohlcv_files=None, start_date=None,
                 end_date=None, target_feat='c', keep_feat='ohlcva',
                 index_file='data_collection/NIFTY 50.csv'):
        """Load the OHLCV files and precompute all correlation windows.

        Args:
            ohlcv_dir: directory holding the OHLCV CSV files.
            target_ticker: column name of the target ticker inside ``target_ticker_file``.
            target_ticker_file: CSV (inside ``ohlcv_dir``) that contains the target ticker.
                The file must have ``Date`` and ``Ticker`` columns; the ``Ticker`` column
                presumably holds the feature name ('Open', 'Close', ...) -- TODO confirm.
            len_window: number of trailing rows returned per sample.
            len_corr_traceback: number of trailing rows used to compute correlations.
            nP, nN: how many most-positively / most-negatively correlated columns to keep.
            keep_tickers: optional iterable of column names to keep among the candidates.
            ohlcv_prefix, ohlcv_sufix: file-name filters applied to ``ohlcv_dir``.
            ohlcv_files: optional iterable restricting which files are merged.
            start_date, end_date: optional ISO dates; rows with ``Date >= start_date``
                and ``Date <= end_date`` are kept.
            target_feat: single-letter key (o/h/l/c/v/a) of the feature to predict.
            keep_feat: string of single-letter keys of the features to keep.
            index_file: path of the market-index CSV (needs ``Date`` and ``Close`` columns).

        Raises:
            ValueError: if ``len_window`` exceeds ``len_corr_traceback`` or if
                ``start_date`` lies after the last available date.
        """
        feat_name_map = {
            'o': 'Open',
            'h': 'High',
            'l': 'Low',
            'c': 'Close',
            'v': 'Volume',
            'a': 'Adj Close'
        }

        # The window is cut out of the correlation slice, so it cannot be longer.
        if len_window > len_corr_traceback:
            raise ValueError("len_window ({}) must not exceed len_corr_traceback ({})".format(
                len_window, len_corr_traceback))

        self.len_window = len_window
        self.len_corr_traceback = len_corr_traceback
        self.nP, self.nN = nP, nN
        self.target_feat = target_feat
        self.keep_feat = keep_feat
        self.start_date, self.end_date = start_date, end_date

        if ohlcv_files is not None:
            ohlcv_files = set(ohlcv_files)

        if keep_tickers is not None:
            keep_tickers = set(keep_tickers)

        df = pd.read_csv(os.path.join(ohlcv_dir, target_ticker_file))
        df['Date'] = pd.to_datetime(df['Date'])
        i_start, i_end = self._date_bounds(df['Date'], start_date, end_date)

        df = df.reset_index(drop=True).set_index(['Date', 'Ticker'])
        df = df.iloc[i_start:i_end]
        self.mainstream_df = df.loc[:, target_ticker]
        self.df = df.drop(target_ticker, axis=1)

        if ohlcv_files is not None and target_ticker_file not in ohlcv_files:
            # NOTE(review): this frame has no 'Date'/'Ticker' index levels, so the
            # merge below looks like it cannot succeed on this path -- confirm intent.
            self.df = pd.DataFrame(columns=df.columns)

        for f in os.listdir(ohlcv_dir):
            if f.startswith(ohlcv_prefix) and f.endswith(ohlcv_sufix) and (ohlcv_files is None or f in ohlcv_files):
                if f == target_ticker_file:
                    continue
                temp_df = pd.read_csv(os.path.join(ohlcv_dir, f))
                temp_df.reset_index(drop=True, inplace=True)
                temp_df['Date'] = pd.to_datetime(temp_df['Date'])
                # Inner join keeps only (Date, Ticker) rows present in every file;
                # columns already present on the left win over the '_y' duplicates.
                self.df = pd.merge(self.df.reset_index(), temp_df, on=['Date', 'Ticker'],
                                   how='inner', suffixes=('', '_y')).set_index(['Date', 'Ticker'])
                self.df = self.df.drop(columns=self.df.filter(regex='_y$').columns.tolist())

        if keep_tickers is not None:
            unwanted = [c for c in self.df.columns if c not in keep_tickers]
            self.df = self.df.drop(columns=unwanted)

        # Long -> wide: one row per date, one '<column>_<Ticker value>' column each.
        self.df = self.df.pivot_table(index='Date', columns='Ticker')
        self.df.columns = self.df.columns.map('_'.join)

        if isinstance(self.mainstream_df, pd.Series):
            self.mainstream_df = pd.DataFrame(self.mainstream_df)

        self.mainstream_df = self.mainstream_df.pivot_table(index='Date', columns='Ticker')
        self.mainstream_df.columns = self.mainstream_df.columns.map('_'.join)

        target_feat_name = "{}_{}".format(target_ticker, feat_name_map[target_feat])

        self.unshifted_target = self.mainstream_df.loc[:, target_feat_name]
        self.target = self.unshifted_target.shift(periods=-1)

        # The last row has no next-step target, so it is dropped from the inputs.
        self.df = self.df.iloc[:-1, :]
        self.mainstream_df = self.mainstream_df.iloc[:-1, :]

        drop_features = set(feat_name_map.keys()).difference(set(keep_feat))
        for feat in drop_features:
            pattern = '_{}$'.format(feat_name_map[feat])
            # Reassign instead of dropping in place: both frames are slices here.
            self.df = self.df.drop(columns=self.df.filter(regex=pattern).columns.tolist())
            self.mainstream_df = self.mainstream_df.drop(
                columns=self.mainstream_df.filter(regex=pattern).columns.tolist())

        # For an exclusive end position i_end, rows [i_end - len_corr_traceback : i_end]
        # are used for the correlation and rows [i_end - len_window : i_end] of the
        # selected columns are stored. The last valid i_end is len(self.df).
        self.swdf = []
        for i_end in range(len_corr_traceback, len(self.df) + 1):
            if i_end % 50 == 0:
                print(i_end, len(self.df))
            self.swdf.append(self.get_high_corr(
                self.unshifted_target.iloc[i_end - len_corr_traceback:i_end],
                self.df.iloc[i_end - len_corr_traceback:i_end, :], len_window, nP, nN))

        # With n rows in self.df, swdf has n - len_corr_traceback + 1 rows of
        # len_window * (nP + nN) values each (when enough columns correlate).
        self.swdf = np.array(self.swdf)

        # Sample i pairs swdf[i] with rows
        # [len_corr_traceback - len_window + i : len_corr_traceback + i]
        # of mainstream_df and index_data_df.

        # The index data used is for a single index.
        self.index_data_df = pd.read_csv(index_file)
        self.index_data_df['Date'] = pd.to_datetime(self.index_data_df['Date'])
        self.index_data_df = self.index_data_df.rename(columns={'SharesTraded': 'Volume'})
        # The stray CSV index column is not always present; ignore it if missing.
        self.index_data_df = self.index_data_df.drop(columns="Unnamed: 0", errors='ignore')

        i_start, i_end = self._date_bounds(self.index_data_df['Date'], start_date, end_date)

        self.index_data_df = self.index_data_df.reset_index(drop=True).set_index(['Date'])
        self.index_data_df = self.index_data_df.iloc[i_start:i_end]
        self.index_data_df = pd.DataFrame(self.index_data_df.loc[:, 'Close'])
        # Keep only the dates that survived in self.df so both frames share positions.
        self.index_data_df = self.index_data_df.reset_index().merge(
            self.df.reset_index()['Date'], how='inner', on='Date').set_index('Date')

    @staticmethod
    def _date_bounds(dates, start_date, end_date):
        """Return positional ``(i_start, i_end)`` selecting start_date <= date <= end_date.

        ``dates`` is a datetime Series assumed to be in ascending order (the first
        matching position is used). A ``None`` bound means "unbounded". An
        ``end_date`` beyond the last row selects through the end of the data.

        Raises:
            ValueError: if no date is on or after ``start_date``.
        """
        i_start, i_end = 0, len(dates)

        if start_date is not None:
            on_or_after = (dates >= datetime.datetime.fromisoformat(start_date)).to_numpy()
            if not on_or_after.any():
                raise ValueError("no data on or after start_date {}".format(start_date))
            i_start = int(on_or_after.argmax())

        if end_date is not None:
            after_end = (dates > datetime.datetime.fromisoformat(end_date)).to_numpy()
            if after_end.any():
                i_end = int(after_end.argmax())

        return i_start, i_end

    def get_high_corr(self, target: pd.Series, candidates: pd.DataFrame, len_window, nP, nN):
        """Return the windows of the columns most (anti-)correlated with ``target``.

        The result is a flat list: the last ``len_window`` values of each of the
        ``nP`` most positively correlated columns, column after column, followed
        by the same for the ``nN`` most negatively correlated columns.
        """
        corr = candidates.corrwith(target)
        p_best = corr.nlargest(nP)
        n_best = corr.nsmallest(nN)
        window = candidates.iloc[-len_window:]
        # melt() stacks the selected columns one after another (column-major).
        newrow = window.iloc[:, candidates.columns.get_indexer(p_best.index)].melt()['value'].tolist()
        newrow.extend(window.iloc[:, candidates.columns.get_indexer(n_best.index)].melt()['value'].tolist())
        return newrow

    def __len__(self):
        """Number of precomputed samples."""
        return len(self.swdf)

    def __getitem__(self, idx):
        """Return ``(peer_window, target_window, index_window)`` for sample ``idx``.

        Each element is a NumPy array with ``len_window`` rows (one per time
        step); ``peer_window`` has one column per selected peer column.
        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is out of range.
        """
        n_samples = len(self)
        if idx < 0:
            # Normalise first: the frame slices below are computed from idx.
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError("index {} is out of range for {} samples".format(idx, n_samples))

        lo = self.len_corr_traceback - self.len_window + idx
        hi = self.len_corr_traceback + idx

        # swdf rows are stored column-after-column, so restore (column, time)
        # first and transpose to get one row per time step.
        peers = np.ascontiguousarray(self.swdf[idx, :].reshape(-1, self.len_window).T)

        return (peers,
                self.mainstream_df.iloc[lo:hi].to_numpy(),
                self.index_data_df.iloc[lo:hi].to_numpy())


def save_NSEDataset(dataset, opfile):
    """Pickle ``dataset`` into the file at ``opfile``, overwriting any existing file."""
    with open(opfile, 'wb') as sink:
        pickle.dump(dataset, sink)

def load_NSEDataset(ipfile):
    """Load and return the object pickled in the file at ``ipfile``.

    The pickle protocol is detected from the file itself, so none is specified.

    NOTE: unpickling executes code embedded in the file; only load files that
    were produced by a trusted source.
    """
    with open(ipfile, 'rb') as f:
        return pickle.load(f)

In [114]:
# Build the ITC.NS dataset (Open feature only, data from 2017-01-01 onwards)
# and cache it on disk so later cells can reload it without recomputing.
dataset = NSEDataset(
    'data_collection/ohlcv_data',
    'ITC.NS',
    'ohlcv_fmcg.csv',
    len_window=10,
    len_corr_traceback=20,
    nP=10,
    nN=10,
    keep_feat='o',
    start_date='2017-01-01',
)
save_NSEDataset(dataset, 'data_collection/pickled_datasets/itc_Jan17_w10_t20_p10_n10_o.pkl')

50 1285
100 1285
150 1285
200 1285
250 1285
300 1285
350 1285
400 1285
450 1285
500 1285
550 1285
600 1285
650 1285
700 1285
750 1285
800 1285
850 1285
900 1285
950 1285
1000 1285
1050 1285
1100 1285
1150 1285
1200 1285
1250 1285


In [115]:
# Reload the dataset from the pickle written by the previous cell.
dataset = load_NSEDataset('data_collection/pickled_datasets/itc_Jan17_w10_t20_p10_n10_o.pkl')

In [116]:
# Sanity check: shapes of the stored frames, then of one sample's three parts.
print(*(frame.shape for frame in (dataset.swdf, dataset.mainstream_df, dataset.index_data_df)))
print(*(part.shape for part in dataset[1]))

(1266, 200) (1285, 1) (1285, 1)
(10, 20) (10, 1) (10, 1)
