# Testing transformers & Decomposition on 1 Stock

In [3]:
import pandas as pd
import numpy as np
import pickle
from datetime import timedelta, datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from PyEMD import CEEMDAN
from sklearn.decomposition import PCA
import plotly.express as px
from tqdm import tqdm
from datetime import datetime
import sys, os

In [4]:
absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

#Path of parent directory (moves outside of repository)
parentDirectory = os.path.dirname(fileDirectory)

In [5]:
train = 0.8
validation = 0.1
test = 0.1

# Part 1

In [6]:
def stock_decomp(dfile, tick, annum, minmax= False, standard=False, logged=False, export=False):
    """ 
    PARAMS
    dfile = string of data file path
    tick = string stock ticker name
    annum = int year of interest
    minmax = Boolean to scale or not 
    standard = Boolean to scale or not 
    logged = Boolean to log transform or not
    export = Boolean to export or not 
    
    RETURN:
    dictionary keys ['decomposed_ticker_features_series', 'scalers']
    """
    
    # grab only the data needed
    ticker_dataframe = pd.read_pickle(parentDirectory+ dfile)
    coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
    ticker_dataframe['reportperiod'] = pd.to_datetime(ticker_dataframe['reportperiod'])
    ticker_dataframe = ticker_dataframe[ticker_dataframe.ticker == tick][coi] #& (ticker_dataframe['reportperiod'].dt.year == annum)][coi]
    ticker_dataframe['Target_Close_2_NXT_High'] = ticker_dataframe['High'].shift(-1) / ticker_dataframe['Close']
    ticker_dataframe['Target_Close_2_NXT_High'].fillna(0,inplace=True) # cannot have NaN in CEEMDAN
    coi.append('Target_Close_2_NXT_High')
    
    # decomposing
    decomposed_ticker_features_series = {}
    scalers = {}
    ceemdan = CEEMDAN(parallel=True, processes=10)
    
    for ticker in ticker_dataframe.ticker.unique():
        print(f'[{ticker}] Decomposing...')
        decomposed_ticker_features_series[ticker] = {}
        scalers[ticker] = {}
        
        if logged:
            for c in coi[2:]:
                ticker_dataframe[c] = np.log(ticker_dataframe[c])
         
        for column in coi[2:]:  
            decomposed_ticker_features_series[ticker][column] ={}      
            try:
                
                series = ticker_dataframe[column].values.reshape(-1,1)

                
                # decompose
                ticker_feature_time_series = np.frombuffer(series)
                ticker_feature_time_series_imfs = ceemdan(ticker_feature_time_series, max_imf=10)
                #print(ticker_feature_time_series_imfs)
                #print(f'[{ticker}][{column}] Decomposition finished; restructuring')
                
                # iterating every IMF 
                for i, imf_series in enumerate(ticker_feature_time_series_imfs):
                    
                    if i < len(ticker_feature_time_series_imfs)-1: # last one is residual
                        decomposed_ticker_features_series[ticker][column][f'IMF{i+1}'] = imf_series
                    else:
                        decomposed_ticker_features_series[ticker][column][f'Rsd'] = imf_series
                print(f'Finished Decomposing [{ticker}][{column}]')
            except:
                print(f'ERROR ticker[{ticker}][{column}]')
                decomposed_ticker_features_series[ticker][column] = 'ERROR'
                
            finally:
                continue
    


    for ticker in decomposed_ticker_features_series.keys():

            scalers[ticker] = {}
            
            for column in decomposed_ticker_features_series[ticker].keys():
                for imf in decomposed_ticker_features_series[ticker][column]:
                    #print(f'{column} {imf} {t[ticker][column][imf].shape[0]}')  
                    cut = round(train*decomposed_ticker_features_series[ticker][column][imf].shape[0])
                    train_series = decomposed_ticker_features_series[ticker][column][imf][:cut].reshape(-1,1)
                
                if standard:
                    scaler = StandardScaler()
                    scaler.fit(train_series)
                    scalers[ticker][column] = scaler
                    decomposed_ticker_features_series[ticker][column][imf] = scaler.transform(decomposed_ticker_features_series[ticker][column][imf].reshape(-1,1)).flatten()
                
                if minmax:
                    scaler = MinMaxScaler()
                    scaler.fit(train_series)
                    scalers[ticker][column] = scaler
                    decomposed_ticker_features_series[ticker][column][imf] = scaler.transform(decomposed_ticker_features_series[ticker][column][imf].reshape(-1,1)).flatten()
    
    tmp_dict = {}
    tmp_dict['decomposed_ticker_features_series'] = decomposed_ticker_features_series
    tmp_dict['scalers'] = scalers
    tmp_dict['dates'] = ticker_dataframe.reportperiod.dt.strftime("%m/%d/%Y").values
            
    if export:
        
        # get date stamp
        dateTimeObj = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
        
        if standard:
            s = 'stdscl'
        if minmax:
            s = 'minmaxscl'
        else:
            s = 'no_scale'
        if logged:
            l = 'logged'
        else:
            l = 'orig'
            
            
        # save file
        with open(parentDirectory +  f'/data/DNN_testing_outputs/{tick}__{annum}_decomposed_ticker_features_series_{l}_{s}_{dateTimeObj}.pkl', 'wb') as f:
            pickle.dump(tmp_dict, f)
        with open(parentDirectory + f'/data/DNN_testing_outputs/{tick}__{annum}_scalers_{l}_{s}_{dateTimeObj}.pkl', 'wb') as f:
            pickle.dump(scalers, f)
        print('Export Locations:')
        print(f'/data/DNN_testing_outputs/{tick}__{annum}_decomposed_ticker_features_series_{l}_{s}_{dateTimeObj}.pkl')
        print(f'/data/DNN_testing_outputs/{tick}__{annum}_scalers_{l}_{s}_{dateTimeObj}.pkl')
    
            
    return tmp_dict
    

In [7]:
# let's test normalizing before and after
# Let's test removing close and also changing high and low to relative deltas
# relative deltas being low to next day high or high to next day high spread
# arguably given a big enough data we should be able to catch the relative ranges 
# and therefore mitigate leakage and afford training on un seen

In [6]:
test_dict = stock_decomp(dfile= '/data/raw_data_2021-11-12_12.31.45.pkl', tick=['MSFT', 'HD', 'UNH', 'XOM', 'ADSK', 'WAT'], minmax= True, annum='all', export=True)

[MSFT] Decomposing...
ERROR ticker[MSFT][Open]
ERROR ticker[MSFT][High]


In [8]:
for t in ['MSFT', 'HD', 'UNH', 'XOM', 'ADSK', 'WAT']:
    test_dict = stock_decomp(dfile= '/data/raw_data_2021-11-12_12.31.45.pkl', tick=t, minmax= True, annum='all', export=True)
    

[MSFT] Decomposing...
Finished Decomposing [MSFT][Open]
Finished Decomposing [MSFT][High]
Finished Decomposing [MSFT][Low]
Finished Decomposing [MSFT][Close]
Finished Decomposing [MSFT][Volume]
Finished Decomposing [MSFT][Target_Close_2_NXT_High]
Export Locations:
/data/DNN_testing_outputs/MSFT__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.23.49.pkl
/data/DNN_testing_outputs/MSFT__all_scalers_orig_minmaxscl_2021-12-10_21.23.49.pkl
[HD] Decomposing...
Finished Decomposing [HD][Open]
Finished Decomposing [HD][High]
Finished Decomposing [HD][Low]
Finished Decomposing [HD][Close]
Finished Decomposing [HD][Volume]
Finished Decomposing [HD][Target_Close_2_NXT_High]
Export Locations:
/data/DNN_testing_outputs/HD__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.26.04.pkl
/data/DNN_testing_outputs/HD__all_scalers_orig_minmaxscl_2021-12-10_21.26.04.pkl
[UNH] Decomposing...
Finished Decomposing [UNH][Open]
Finished Decomposing [UNH][High]
Finished Decompos

In [17]:
# Printing structure
tick = list(test_dict['decomposed_ticker_features_series'].keys())[0]
print(f"TICKER: {tick} {test_dict['decomposed_ticker_features_series'][tick].keys()}")
for k in test_dict['decomposed_ticker_features_series'][tick].keys():
    print(f"FEATURE: {k} \n{test_dict['decomposed_ticker_features_series'][tick][k].keys()}")
    

TICKER: MSFT dict_keys(['Open', 'High', 'Low', 'Close', 'Volume', 'Return_High_2_Nxt_Close'])
FEATURE: Open 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: High 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'Rsd'])
FEATURE: Low 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: Close 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: Volume 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: Return_High_2_Nxt_Close 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])


# Part 2
### Restart notebook for Tensor

In [6]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np
import pickle

from scipy.interpolate import CubicSpline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error 
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, LSTM, LeakyReLU
from tensorflow.keras.callbacks import ModelCheckpoint
from tqdm import tqdm
import sys, os

import plotly.express as px
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

experiment_time = datetime.now().strftime("%H_%M_%S_%m_%d_%Y")

features_in_order = ['Open', 'High', 'Low', 'Volume', 'Close', 'Target_Close_2_NXT_High'] # target feature must be the last one here
target_feature = 'Target_Close_2_NXT_High'

train = 0.8
validation = 0.1
test = 0.1

absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

#Path of parent directory (moves outside of repository)
parentDirectory = os.path.dirname(fileDirectory)

class ManyToOneTimeSeriesGenerator(TimeseriesGenerator):
  def __getitem__(self, idx):
    x, y = super().__getitem__(idx)
    last_element_index = y.shape[1]-1
    return x, y[:,last_element_index].reshape(1,-1)

class SplineModel():
    def __init__(self,time_series_generator):
        self.name = "SplineModel"
        self.gen = time_series_generator
    
    def predict(self, x_window, verbose=0):
        result = []
        x_window = np.squeeze(x_window, axis=0)
        last_element_index = x_window.shape[1]-1
        series = x_window[:,last_element_index].reshape(-1)
        cs = CubicSpline(np.arange(len(series)), series)
        next_value = cs(len(series)+1)
        result += [next_value]

        return np.array(result).reshape(1,-1) # 1,-1

In [2]:
with open(parentDirectory +  f'/data/DNN_testing_outputs/UNH__2019_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_14.57.15.pkl', 'rb') as f:
    test_dict = pickle.load(f)

In [3]:
# Data organization
max_window_size = 10
windows_sizes_for_imf_level = {
    'IMF1': 2,
    'IMF2': 2,
    'IMF3': 3,
    'IMF4': 3,
    'IMF5': 4,
    'IMF6': 4,
    'IMF7': 5,
    'IMF8': 5,
    'Rsd': 6,
    'DEFAULT': 4
}
target_feature_max_imf_level = {}

# Coupling together the IMFs of the same level for different features to create exogenous input
# The number of imfs for each feature decomposition may differ, thus some of the last imfs may not match in number of features
series = {}
for ticker in test_dict['decomposed_ticker_features_series']:
    
    series[ticker] = {}
    target_feature_max_imf_level[ticker] ={}


    for feature in test_dict['decomposed_ticker_features_series'][ticker]:
        
        imfs = pd.DataFrame.from_dict(test_dict['decomposed_ticker_features_series'][ticker][feature])
        
        for imf in imfs:
            if imf not in series[ticker]:
                series[ticker][imf] = []
            _series = imfs[imf].values
            _series = _series.reshape((len(_series),1)) # reshaping to get into column format
            series[ticker][imf] += [_series]
            if feature == target_feature:
                target_feature_max_imf_level[ticker] = imf


dataset = {}
# # horizontal stack
for ticker in series:
    dataset[ticker] = {}
    for imf_level in series[ticker]:
        dataset[ticker][imf_level] = np.hstack(tuple(series[ticker][imf_level]))




In [4]:
train = 0.8
validation = 0.1
test = 0.1

train_dataset = {}
train_dataset['dates'] = test_dict['dates'][:round(train*dataset[ticker][imf_level].shape[0])]
validation_dataset = {}
validation_dataset['dates'] = test_dict['dates'][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0])]
test_dataset = {}
test_dataset['dates'] =  test_dict['dates'][round((train+validation)*dataset[ticker][imf_level].shape[0]):]

for ticker in dataset:
    
    train_dataset[ticker] = {}
    validation_dataset[ticker] = {}
    test_dataset[ticker] = {}
    
    for imf_level in dataset[ticker]:
                
        # splitting data sets according to rates
        train_dataset[ticker][imf_level] = dataset[ticker][imf_level][:round(train*dataset[ticker][imf_level].shape[0]),:]
        validation_dataset[ticker][imf_level] = dataset[ticker][imf_level][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0]),:]
        test_dataset[ticker][imf_level] = dataset[ticker][imf_level][round((train+validation)*dataset[ticker][imf_level].shape[0]):,:]

In [10]:
def etl(file):
    
    with open(parentDirectory +  file, 'rb') as f:
        test_dict = pickle.load(f)
    
    
        # Data organization
    max_window_size = 10
    windows_sizes_for_imf_level = {
        'IMF1': 2,
        'IMF2': 2,
        'IMF3': 3,
        'IMF4': 3,
        'IMF5': 4,
        'IMF6': 4,
        'IMF7': 5,
        'IMF8': 5,
        'Rsd': 6,
        'DEFAULT': 4
    }
    target_feature_max_imf_level = {}

    # Coupling together the IMFs of the same level for different features to create exogenous input
    # The number of imfs for each feature decomposition may differ, thus some of the last imfs may not match in number of features
    series = {}
    for ticker in test_dict['decomposed_ticker_features_series']:
        
        series[ticker] = {}
        target_feature_max_imf_level[ticker] ={}


        for feature in test_dict['decomposed_ticker_features_series'][ticker]:
            
            imfs = pd.DataFrame.from_dict(test_dict['decomposed_ticker_features_series'][ticker][feature])
            
            for imf in imfs:
                if imf not in series[ticker]:
                    series[ticker][imf] = []
                _series = imfs[imf].values
                _series = _series.reshape((len(_series),1)) # reshaping to get into column format
                series[ticker][imf] += [_series]
                if feature == target_feature:
                    target_feature_max_imf_level[ticker] = imf


    dataset = {}
    # # horizontal stack
    for ticker in series:
        dataset[ticker] = {}
        for imf_level in series[ticker]:
            dataset[ticker][imf_level] = np.hstack(tuple(series[ticker][imf_level]))
            
            
    train = 0.8
    validation = 0.1
    test = 0.1

    train_dataset = {}
    train_dataset['dates'] = test_dict['dates'][:round(train*dataset[ticker][imf_level].shape[0])]
    validation_dataset = {}
    validation_dataset['dates'] = test_dict['dates'][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0])]
    test_dataset = {}
    test_dataset['dates'] =  test_dict['dates'][round((train+validation)*dataset[ticker][imf_level].shape[0]):]

    for ticker in dataset:
        
        train_dataset[ticker] = {}
        validation_dataset[ticker] = {}
        test_dataset[ticker] = {}
        
        for imf_level in dataset[ticker]:
                    
            # splitting data sets according to rates
            train_dataset[ticker][imf_level] = dataset[ticker][imf_level][:round(train*dataset[ticker][imf_level].shape[0]),:]
            validation_dataset[ticker][imf_level] = dataset[ticker][imf_level][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0]),:]
            test_dataset[ticker][imf_level] = dataset[ticker][imf_level][round((train+validation)*dataset[ticker][imf_level].shape[0]):,:]
    
    def generator(dataset):
        # data set split rates
        # create generators
        # NOTE STANDARD SCALER was FIT on .75 split so leakage if TRAIN VALIDATE GOES PAST


        train_generators = {}
        validation_generators = {}
        test_generators = {}

        for ticker in dataset:

            train_generators[ticker] = {}
            validation_generators[ticker] = {}
            test_generators[ticker] = {}
            

            for imf_level in dataset[ticker]:

                if imf_level in windows_sizes_for_imf_level:
                    window_size = windows_sizes_for_imf_level[imf_level]
                else: 
                    window_size = windows_sizes_for_imf_level['DEFAULT']
                # windowing
                train_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(train_dataset[ticker][imf_level], train_dataset[ticker][imf_level], length=window_size, batch_size=1)
                validation_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(validation_dataset[ticker][imf_level], validation_dataset[ticker][imf_level], length=window_size, batch_size=1)
                test_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(test_dataset[ticker][imf_level], test_dataset[ticker][imf_level], length=window_size, batch_size=1)
    
        return train_generators,validation_generators,test_generators

    train_generators,validation_generators,test_generators = generator(dataset)
    
    # Model Training

    models = {}

    model_epochs = {
        'IMF1': 2500,
        'IMF2': 2000,
        'IMF3': 1500,
        'IMF4': 1500,
        'IMF5': 1500,
        'IMF6': 1200,
        'IMF7': 1200,
        'IMF8': 1000,
        'Rsd': 1000,
        'DEFAULT': 1000
    }

    imfs_to_predict_with_neural = ['IMF1','IMF2'] # set to ['IMF1'] , ['IMF1', 'IMF2'], ['IMF1','IMF2','IMF3'] and so on



    for ticker in train_generators:
        models[ticker] = {}

        reached_max_imf_of_target_feature = False
        for imf_level in train_generators[ticker]:
            if imf_level in imfs_to_predict_with_neural:
                print(f'Training model [{ticker}][{imf_level}]')
                if reached_max_imf_of_target_feature is True:
                    break # no need to predict further if target feature doesn't contain greater IMF levels

                if target_feature_max_imf_level[ticker] == imf_level:
                    reached_max_imf_of_target_feature = True
                # Prediction model
                model = Sequential()
                current_dataset = train_dataset[ticker][imf_level]
                n_features = current_dataset.shape[1]
                cur_tmp_gen = train_generators[ticker][imf_level]
                cur_tmp_val_gen = validation_generators[ticker][imf_level]

                if imf_level in windows_sizes_for_imf_level:
                    window_size = windows_sizes_for_imf_level[imf_level]
                else: 
                    window_size = windows_sizes_for_imf_level['DEFAULT']

                model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(window_size, n_features)))
                model.add(LSTM(64, activation='tanh', input_shape=(window_size, 128)))
                model.add(Dense(16))
                model.add(LeakyReLU())
                model.add(Dense(4))
                model.add(LeakyReLU())
                model.add(Dense(1)) # 1 target feature only
                model.compile(optimizer='adam', loss='mse')

                number_of_epochs = model_epochs[imf_level]
                checkpoint_path = fileDirectory + "/data/DNN_tmp/" +f"{ticker}_" +f"ltsm_spline.h5"
                
                callbacks = [ ModelCheckpoint(checkpoint_path, monitor='loss', mode="max", verbose=0,save_best_only=True, save_weights_only=False, save_freq=250)]

                # fit model
                #model.fit_generator(cur_tmp_gen, steps_per_epoch=1, epochs=number_of_epochs, verbose=0)
                model.fit(cur_tmp_gen, validation_data=cur_tmp_val_gen, steps_per_epoch=10, epochs=number_of_epochs, verbose=0, callbacks=callbacks)

                models[ticker][imf_level] = model
            else:
                # Spline prediction model
                cur_tmp_gen = train_generators[ticker][imf_level]
                model = SplineModel(cur_tmp_gen)
                models[ticker][imf_level] = model
                
    for t in models:
        for IM in models[t]:
            if IM in ['IMF1','IMF2']:
                models[t][IM].save(os.path.join(parentDirectory+f"/data/tf_LSTM_{t}__{IM}network.h5"))
                print(f'exported {t}__{IM} LSTM')
            else:
                with open(parentDirectory+f'/data/LSTM_{t}__{IM}_spline.pkl', 'wb') as fi:
                    pickle.dump(models[t][IM], fi)
                    print(f'exported {t}__{IM} spline')
            

    return

    

In [11]:
fl = ['/data/DNN_testing_outputs/MSFT__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.23.49.pkl',
      '/data/DNN_testing_outputs/HD__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.26.04.pkl',
      '/data/DNN_testing_outputs/UNH__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.28.24.pkl',
      '/data/DNN_testing_outputs/XOM__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.30.32.pkl',
      '/data/DNN_testing_outputs/ADSK__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.33.02.pkl',
      '/data/DNN_testing_outputs/WAT__all_decomposed_ticker_features_series_orig_minmaxscl_2021-12-10_21.35.31.pkl']

In [12]:
for f in fl:
    etl(f)

Training model [MSFT][IMF1]
Training model [MSFT][IMF2]
exported MSFT__IMF1 LSTM
exported MSFT__IMF2 LSTM
exported MSFT__IMF3 spline
exported MSFT__IMF4 spline
exported MSFT__IMF5 spline
exported MSFT__IMF6 spline
exported MSFT__IMF7 spline
exported MSFT__IMF8 spline
exported MSFT__IMF9 spline
exported MSFT__Rsd spline
exported MSFT__IMF10 spline
Training model [HD][IMF1]
Training model [HD][IMF2]
exported HD__IMF1 LSTM
exported HD__IMF2 LSTM
exported HD__IMF3 spline
exported HD__IMF4 spline
exported HD__IMF5 spline
exported HD__IMF6 spline
exported HD__IMF7 spline
exported HD__IMF8 spline
exported HD__Rsd spline
exported HD__IMF9 spline
exported HD__IMF10 spline
Training model [UNH][IMF1]
Training model [UNH][IMF2]
exported UNH__IMF1 LSTM
exported UNH__IMF2 LSTM
exported UNH__IMF3 spline
exported UNH__IMF4 spline
exported UNH__IMF5 spline
exported UNH__IMF6 spline
exported UNH__IMF7 spline
exported UNH__Rsd spline
exported UNH__IMF8 spline
exported UNH__IMF9 spline
exported UNH__IMF10

In [5]:
# check size first

print(f"train shape {len(train_dataset['dates'])}")
print(f"validation shape {len(validation_dataset['dates'])}")
print(f"test shape {len(test_dataset['dates'])}")
print(len(train_dataset['dates']) + len(validation_dataset['dates']) + len(test_dataset['dates']))

train shape 202
validation shape 25
test shape 25
252


In [6]:
def generator(dataset):
    # data set split rates
    # create generators
    # NOTE STANDARD SCALER was FIT on .75 split so leakage if TRAIN VALIDATE GOES PAST


    train_generators = {}
    validation_generators = {}
    test_generators = {}

    for ticker in dataset:

        train_generators[ticker] = {}
        validation_generators[ticker] = {}
        test_generators[ticker] = {}
        

        for imf_level in dataset[ticker]:

            if imf_level in windows_sizes_for_imf_level:
                window_size = windows_sizes_for_imf_level[imf_level]
            else: 
                window_size = windows_sizes_for_imf_level['DEFAULT']
            # windowing
            train_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(train_dataset[ticker][imf_level], train_dataset[ticker][imf_level], length=window_size, batch_size=1)
            validation_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(validation_dataset[ticker][imf_level], validation_dataset[ticker][imf_level], length=window_size, batch_size=1)
            test_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(test_dataset[ticker][imf_level], test_dataset[ticker][imf_level], length=window_size, batch_size=1)
    
    return train_generators,validation_generators,test_generators

In [7]:
train_generators,validation_generators,test_generators = generator(dataset)

In [8]:
# Model Training

models = {}

model_epochs = {
    'IMF1': 2500,
    'IMF2': 2000,
    'IMF3': 1500,
    'IMF4': 1500,
    'IMF5': 1500,
    'IMF6': 1200,
    'IMF7': 1200,
    'IMF8': 1000,
    'Rsd': 1000,
    'DEFAULT': 1000
}

imfs_to_predict_with_neural = ['IMF1','IMF2'] # set to ['IMF1'] , ['IMF1', 'IMF2'], ['IMF1','IMF2','IMF3'] and so on



for ticker in train_generators:
    models[ticker] = {}

    reached_max_imf_of_target_feature = False
    for imf_level in train_generators[ticker]:
        if imf_level in imfs_to_predict_with_neural:
            print(f'Training model [{ticker}][{imf_level}]')
            if reached_max_imf_of_target_feature is True:
                break # no need to predict further if target feature doesn't contain greater IMF levels

            if target_feature_max_imf_level[ticker] == imf_level:
                reached_max_imf_of_target_feature = True
            # Prediction model
            model = Sequential()
            current_dataset = train_dataset[ticker][imf_level]
            n_features = current_dataset.shape[1]
            cur_tmp_gen = train_generators[ticker][imf_level]
            cur_tmp_val_gen = validation_generators[ticker][imf_level]

            if imf_level in windows_sizes_for_imf_level:
                window_size = windows_sizes_for_imf_level[imf_level]
            else: 
                window_size = windows_sizes_for_imf_level['DEFAULT']

            model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(window_size, n_features)))
            model.add(LSTM(64, activation='tanh', input_shape=(window_size, 128)))
            model.add(Dense(16))
            model.add(LeakyReLU())
            model.add(Dense(4))
            model.add(LeakyReLU())
            model.add(Dense(1)) # 1 target feature only
            model.compile(optimizer='adam', loss='mse')

            number_of_epochs = model_epochs[imf_level]
            checkpoint_path = fileDirectory + "/data/DNN_tmp/" +f"{ticker}_" +f"ltsm_spline.h5"
            
            callbacks = [ ModelCheckpoint(checkpoint_path, monitor='loss', mode="max", verbose=0,save_best_only=True, save_weights_only=False, save_freq=250)]

            # fit model
            #model.fit_generator(cur_tmp_gen, steps_per_epoch=1, epochs=number_of_epochs, verbose=0)
            model.fit(cur_tmp_gen, validation_data=cur_tmp_val_gen, steps_per_epoch=10, epochs=number_of_epochs, verbose=0, callbacks=callbacks)

            models[ticker][imf_level] = model
        else:
            # Spline prediction model
            cur_tmp_gen = train_generators[ticker][imf_level]
            model = SplineModel(cur_tmp_gen)
            models[ticker][imf_level] = model
            


Training model [UNH][IMF1]
Training model [UNH][IMF2]


In [9]:
# reset generators
train_generators,validation_generators,test_generators = generator(dataset)

In [10]:
# predicting

results = {}

for ticker in models:
    results[ticker] = {}

    # initializing results dictionary
    for feature in features_in_order:
        if feature != target_feature:
            continue
        results[ticker][feature] = {}
        for imf_level in models[ticker]:
            results[ticker][feature][imf_level] = {
                'real_train': [],
                'predicted_train': [],
                'real_validation': [],
                'predicted_validation': [],
                'real_test': [],
                'predicted_test': [],
                'x_axis_train': [],
                'x_axis_validation': [],
                'x_axis_test': []
            }

    for imf_level in models[ticker]:
        model = models[ticker][imf_level]
        
        print(f'Predicting: [{ticker}][{imf_level}]')

        cur_train_gen = train_generators[ticker][imf_level]
        cur_validation_gen = validation_generators[ticker][imf_level]
        cur_test_gen = test_generators[ticker][imf_level]

        # predicting train
        day_counter = 0
        for i in range(len(cur_train_gen)):
            x, y = cur_train_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_train'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_train'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_train'] += [day_counter]
            day_counter += 1

        # predicting validation
        for i in range(len(cur_validation_gen)):
            x, y = cur_validation_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_validation'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_validation'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_validation'] += [day_counter]
            day_counter += 1

        # predicting test
        for i in range(len(cur_test_gen)):
            x, y = cur_test_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_test'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_test'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_test'] += [day_counter]
            day_counter += 1


Predicting: [UNH][IMF1]
Predicting: [UNH][IMF2]
Predicting: [UNH][IMF3]
Predicting: [UNH][IMF4]
Predicting: [UNH][IMF5]
Predicting: [UNH][Rsd]
Predicting: [UNH][IMF6]


In [11]:
# the below is helping me discern where the timeshift happens given that these features have varying windows

In [12]:
# organizing imf prediction results, concatenating train, validation and test
concatenated_results = {}

for ticker in results:
    concatenated_results[ticker] = {}
    for feature in results[ticker]:
        concatenated_results[ticker][feature] = {}
        for imf_level in results[ticker][feature]:
            
            df_result = pd.DataFrame.from_dict(results[ticker][feature][imf_level], orient='index').T
            df_train = df_result[['real_train','predicted_train','x_axis_train']].set_index('x_axis_train').dropna(axis=0)
            df_train.index.name = 'x'
            df_validation = df_result[['real_validation','predicted_validation','x_axis_validation']].set_index('x_axis_validation').dropna(axis=0)
            df_validation.index.name = 'x'
            df_test = df_result[['real_test','predicted_test','x_axis_test']].set_index('x_axis_test').dropna(axis=0)
            df_test.index.name = 'x'

            df_concatenated = pd.concat([df_train,df_validation,df_test], axis=1)

            concatenated_results[ticker][feature][imf_level] = df_concatenated

In [13]:
# iplot layout
space =  {
            'legend' : {'bgcolor':'#1A1A1C','font':{'color':'#D9D9D9',"size":12}},
            'paper_bgcolor' : '#1A1A1C',
            'plot_bgcolor' : '#1A1A1C',
            "title" : {"font":{"color":"#D9D9D9"},"x":0.5},
            'yaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'xaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'titlefont' : {'color':'#D9D9D9'}
        }

In [14]:
# plotting partial result
plot_ticker = 'UNH'
plot_feature = target_feature
plot_imf = 'IMF1'

concatenated_results[plot_ticker][plot_feature][plot_imf].iplot(title=f'{plot_ticker} {plot_feature} {plot_imf}', asFigure=True, layout=space)

In [15]:
# recomposing prediction by arithmetically adding the IMF curves

final_prediction_results = {}
max_window_size = 10

for ticker in concatenated_results:
    final_prediction_results[ticker] = {}
    for feature in concatenated_results[ticker]:
        addition_train = None
        addition_validation = None
        addition_test = None

        addition_real_train = None
        addition_real_validation = None
        addition_real_test = None

        # recomposing predictions
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_test is None:
                addition_test = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
                cur_length = addition_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_test = addition_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_test = np.add(addition_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_train is None:
                addition_train = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
                cur_length = addition_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_train = addition_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_train = np.add(addition_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_validation is None:
                addition_validation = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
                cur_length = addition_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_validation = addition_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_validation = np.add(addition_validation,np_array_to_be_added)
        ###################
        # recomposing real#
        ###################
        
        # Note this is a method adopted from the research
        # However, it is peculiar that this is a reconstruction instead of a merging.
        # Verified: When converted back to Close the real values don't match; use actual values
        
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_real_test is None:
                addition_real_test = concatenated_results[ticker][feature][imf_level]['real_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_test'].values
                cur_length = addition_real_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_test = addition_real_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_test = np.add(addition_real_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_real_train is None:
                addition_real_train = concatenated_results[ticker][feature][imf_level]['real_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_train'].values
                cur_length = addition_real_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_train = addition_real_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_train = np.add(addition_real_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_real_validation is None:
                addition_real_validation = concatenated_results[ticker][feature][imf_level]['real_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_validation'].values
                cur_length = addition_real_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_validation = addition_real_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_validation = np.add(addition_real_validation,np_array_to_be_added)
        
        scaler = test_dict['scalers'][ticker][feature]

        final_prediction_results[ticker][feature] = {
            'train_predicted': scaler.inverse_transform(addition_train.reshape(-1,1)).reshape(-1),
            'validation_predicted': scaler.inverse_transform(addition_validation.reshape(-1,1)).reshape(-1),
            'test_predicted': scaler.inverse_transform(addition_test.reshape(-1,1)).reshape(-1),
            'train_real': scaler.inverse_transform(addition_real_train.reshape(-1,1)).reshape(-1),
            'validation_real': scaler.inverse_transform(addition_real_validation.reshape(-1,1)).reshape(-1),
            'test_real': scaler.inverse_transform(addition_real_test.reshape(-1,1)).reshape(-1)
        }


In [17]:
np_array_to_be_added = concatenated_results['MSFT']['Target_Close_2_NXT_High']['IMF1']['real_test'].values
cur_length = addition_real_test.shape[0]
next_np_array_length = np_array_to_be_added.shape[0]
if cur_length < next_np_array_length:
    if next_np_array_length-cur_length < max_window_size:
        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]

In [18]:
# plotting final result
plot_ticker = 'UNH'
plot_feature = target_feature

pd.DataFrame.from_dict(final_prediction_results[plot_ticker][plot_feature]).iplot(title=f'{plot_ticker} {plot_feature} {plot_imf}', layout=space)

In [19]:
# the below suggests that the data would be off by 18 ( 3*6) in affordance of the RSD window
# this adds up 253 - 18 = 235 ( total length of predictions)

for k,v in windows_sizes_for_imf_level.items():
    print(f"window size for {k} = {v}")

print(f"train shape {len(train_dataset['dates'])}")
print(f"train generator")
for k in train_generators['MSFT'].keys():
    print(f"k = {k}, len = {len(train_generators['MSFT'][k])}")
    
print(f"validation shape {len(validation_dataset['dates'])}")
print(f"validation generator")
for k in validation_generators['MSFT'].keys():
    print(f"k = {k}, len = {len(validation_generators['MSFT'][k])}")
    
print(f"test shape {len(test_dataset['dates'])}")
print(f"test generator")
for k in test_generators['MSFT'].keys():
    print(f"k = {k}, len = {len(test_generators['MSFT'][k])}")


window size for IMF1 = 2
window size for IMF2 = 2
window size for IMF3 = 3
window size for IMF4 = 3
window size for IMF5 = 4
window size for IMF6 = 4
window size for IMF7 = 5
window size for IMF8 = 5
window size for Rsd = 6
window size for DEFAULT = 4
train shape 202
train generator
k = IMF1, len = 200
k = IMF2, len = 200
k = IMF3, len = 199
k = IMF4, len = 199
k = Rsd, len = 196
k = IMF5, len = 198
k = IMF6, len = 198
validation shape 25
validation generator
k = IMF1, len = 23
k = IMF2, len = 23
k = IMF3, len = 22
k = IMF4, len = 22
k = Rsd, len = 19
k = IMF5, len = 21
k = IMF6, len = 21
test shape 25
test generator
k = IMF1, len = 23
k = IMF2, len = 23
k = IMF3, len = 22
k = IMF4, len = 22
k = Rsd, len = 19
k = IMF5, len = 21
k = IMF6, len = 21


In [16]:
# Verifying that 'Real' are as expected
ticker_dataframe = pd.read_pickle(parentDirectory+ '/data/raw_data_2021-11-12_12.31.45.pkl')
coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
ticker_dataframe['reportperiod'] = pd.to_datetime(ticker_dataframe['reportperiod'])
ticker_dataframe = ticker_dataframe[(ticker_dataframe.ticker == 'UNH') & (ticker_dataframe['reportperiod'].dt.year == 2019)][coi]

# THIS WOULD CALCULATE THE TRUE VALUE AS OPPOSED TO THE REAL REPRESENTED IN THE SUMMATION
#ticker_dataframe['Target_Close_2_NXT_High'] = ticker_dataframe['High'].shift(periods=-1, axis=0) / ticker_dataframe['Close']
#ticker_dataframe['Target_Close_2_NXT_High'].fillna(0,inplace=True)


In [17]:
df = pd.DataFrame.from_dict(final_prediction_results['UNH']['Target_Close_2_NXT_High'])
# data is shifted forward 18 days to accommodate residual window
flt_dates = ticker_dataframe.iloc[18:,:]

# set indexes for joining
df = df.set_index(flt_dates['reportperiod'])
flt_dates.set_index('reportperiod', inplace=True)

In [18]:
recompiled = df.join(flt_dates)

In [19]:
# Need to shift high back to run apply function
recompiled['Back_Shifted_Real_High'] = recompiled['High'].shift(-1)

In [20]:
# these are not matching, perhaps the residuals should be added?
# ticker_dataframe['High'].shift(periods=-1, axis=0) / ticker_dataframe['Close']

for col in ['train_predicted','validation_predicted','test_predicted']:
    recompiled[col] = recompiled[col] * recompiled['Close']

for col in ['train_real','validation_real','test_real']: 
    recompiled[col] = recompiled.apply(lambda x: np.nan if np.isnan(x[col]) else x['Back_Shifted_Real_High'], axis=1)
#recompiled['Predicted_NXT_High'] = recompiled['Target_Close_2_NXT_High'] * recompiled['Close']
recompiled.head()

Unnamed: 0_level_0,train_predicted,validation_predicted,test_predicted,train_real,validation_real,test_real,ticker,Open,High,Low,Close,Volume,Back_Shifted_Real_High
reportperiod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-01-29,270.120467,,,272.440002,,,UNH,266.0,268.450012,265.440002,267.339996,2173000.0,272.440002
2019-01-30,273.167935,,,271.73999,,,UNH,269.690002,272.440002,267.190002,270.369995,3659000.0,271.73999
2019-01-31,272.993327,,,269.390015,,,UNH,270.480011,271.73999,268.98999,270.200012,4055100.0,269.390015
2019-02-01,271.496211,,,268.690002,,,UNH,268.470001,269.390015,266.029999,268.720001,3946800.0,268.690002
2019-02-04,270.977368,,,272.089996,,,UNH,268.309998,268.690002,264.269989,268.209991,3398900.0,272.089996


Unnamed: 0_level_0,train_predicted,validation_predicted,test_predicted,train_real,validation_real,test_real,ticker,Open,High,Low,Close,Volume,Back_Shifted_Real_High
reportperiod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-12-08,,,206.137358,,,215.229996,MSFT,213.970001,216.949997,212.889999,216.009995,23284100.0,215.229996
2020-12-09,,,201.565004,,,213.080002,MSFT,215.160004,215.229996,211.210007,211.800003,32440600.0,213.080002
2020-12-10,,,199.851945,,,213.320007,MSFT,211.770004,213.080002,210.360001,210.520004,26733300.0,213.320007
2020-12-11,,,202.020954,,,216.210007,MSFT,210.050003,213.320007,209.110001,213.259995,30979400.0,216.210007
2020-12-14,,,202.500638,,,215.419998,MSFT,213.100006,216.210007,212.880005,214.199997,28798400.0,215.419998
2020-12-15,,,202.071838,,,220.110001,MSFT,215.169998,215.419998,212.240005,214.130005,27000600.0,220.110001
2020-12-16,,,206.534182,,,220.889999,MSFT,214.75,220.110001,214.720001,219.279999,35023300.0,220.889999
2020-12-17,,,206.169073,,,219.690002,MSFT,219.869995,220.889999,217.919998,219.419998,32515800.0,219.690002
2020-12-18,,,204.774496,,,224.0,MSFT,218.589996,219.690002,216.020004,218.589996,63354900.0,224.0
2020-12-21,,,207.797577,,,225.630005,MSFT,217.550003,224.0,217.279999,222.589996,37181900.0,225.630005


In [21]:
recompiled_flt = recompiled[['train_predicted', 'validation_predicted', 'test_predicted',
       'train_real', 'validation_real', 'test_real']]

In [22]:
# plotting final result
plot_ticker = 'UNH'
plot_feature = target_feature

recompiled_flt.iplot(title=f'{plot_ticker} {plot_feature} {plot_imf}', layout=space)

### It seems odd that the Test phase would be so off given that it doesn't have values outside of the previously seen values

In [27]:
adj_close_accuracies = {}
accuracies_detailed = {}

for ticker in final_prediction_results:
    adj_close_accuracies[ticker] = {}
    accuracies_detailed[ticker] = {}
    for feature in final_prediction_results[ticker]:

        y_train = final_prediction_results[ticker][feature]['train_predicted'][~np.isnan(final_prediction_results[ticker][feature]['train_predicted'])]
        yhat_train = final_prediction_results[ticker][feature]['train_real'][~np.isnan(final_prediction_results[ticker][feature]['train_real'])]

        y_validation = final_prediction_results[ticker][feature]['validation_predicted'][~np.isnan(final_prediction_results[ticker][feature]['validation_predicted'])]
        yhat_validation = final_prediction_results[ticker][feature]['validation_real'][~np.isnan(final_prediction_results[ticker][feature]['validation_real'])]

        y_test = final_prediction_results[ticker][feature]['test_predicted'][~np.isnan(final_prediction_results[ticker][feature]['test_predicted'])]
        yhat_test = final_prediction_results[ticker][feature]['test_real'][~np.isnan(final_prediction_results[ticker][feature]['test_real'])]
        accuracies_detailed[ticker][feature] = {
            'mse':{
                'train':mean_squared_error(y_train,yhat_train),
                'validation':mean_squared_error(y_validation,yhat_validation),
                'test':mean_squared_error(y_test,yhat_test),
            },
            'mape':{
                'train':np.mean(np.abs((y_train - yhat_train) / y_train)) * 100,
                'validation':np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100,
                'test':np.mean(np.abs((y_test - yhat_test) / y_test)) * 100,
            }
        }

        if feature == 'Target_Close_2_NXT_High':
            adj_close_accuracies[ticker] = {
                'mse': mean_squared_error(y_test,yhat_test),
                'mape': np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100
            }

# pd.DataFrame.from_dict(accuracies_detailed[plot_ticker][plot_feature])
df_close_accuracies = pd.DataFrame.from_dict(adj_close_accuracies).T
df_close_accuracies.to_csv(parentDirectory + f"/data/DNN_metrics/{'_'.join(imfs_to_predict_with_neural)}_full_spline_{experiment_time}.csv", sep=',', encoding='utf-8')
df_close_accuracies

Unnamed: 0,mape,mse
MSFT,0.003027,2.719878e-08


In [23]:
adj_close_accuracies = {}
accuracies_detailed = {}

for ticker in final_prediction_results:
    adj_close_accuracies[ticker] = {}
    accuracies_detailed[ticker] = {}
    for feature in final_prediction_results[ticker]:

        y_train = final_prediction_results[ticker][feature]['train_predicted'][~np.isnan(final_prediction_results[ticker][feature]['train_predicted'])]
        yhat_train = final_prediction_results[ticker][feature]['train_real'][~np.isnan(final_prediction_results[ticker][feature]['train_real'])]

        y_validation = final_prediction_results[ticker][feature]['validation_predicted'][~np.isnan(final_prediction_results[ticker][feature]['validation_predicted'])]
        yhat_validation = final_prediction_results[ticker][feature]['validation_real'][~np.isnan(final_prediction_results[ticker][feature]['validation_real'])]

        y_test = final_prediction_results[ticker][feature]['test_predicted'][~np.isnan(final_prediction_results[ticker][feature]['test_predicted'])]
        yhat_test = final_prediction_results[ticker][feature]['test_real'][~np.isnan(final_prediction_results[ticker][feature]['test_real'])]
        accuracies_detailed[ticker][feature] = {
            'mse':{
                'train':mean_squared_error(y_train,yhat_train),
                'validation':mean_squared_error(y_validation,yhat_validation),
                'test':mean_squared_error(y_test,yhat_test),
            },
            'mape':{
                'train':np.mean(np.abs((y_train - yhat_train) / y_train)) * 100,
                'validation':np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100,
                'test':np.mean(np.abs((y_test - yhat_test) / y_test)) * 100,
            }
        }

        if feature == 'Target_Close_2_NXT_High':
            adj_close_accuracies[ticker] = {
                'mse': mean_squared_error(y_test,yhat_test),
                'mape': np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100
            }

# pd.DataFrame.from_dict(accuracies_detailed[plot_ticker][plot_feature])
df_close_accuracies = pd.DataFrame.from_dict(adj_close_accuracies).T
df_close_accuracies.to_csv(parentDirectory + f"/data/DNN_metrics/{plot_ticker}_2019_{'_'.join(imfs_to_predict_with_neural)}_full_spline_{experiment_time}.csv", sep=',', encoding='utf-8')
df_close_accuracies

Unnamed: 0,mape,mse
UNH,0.04796,1.4e-05
