In [1]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np
import pickle

from scipy.interpolate import CubicSpline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error 
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, LSTM, LeakyReLU
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tqdm import tqdm
import sys, os
import re

In [2]:
absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

#Path of parent directory (moves outside of repository)
parentDirectory = os.path.dirname(fileDirectory)

class ManyToOneTimeSeriesGenerator(TimeseriesGenerator):
  def __getitem__(self, idx):
    x, y = super().__getitem__(idx)
    last_element_index = y.shape[1]-1
    return x, y[:,last_element_index].reshape(1,-1)

class SplineModel():
    def __init__(self,time_series_generator):
        self.name = "SplineModel"
        self.gen = time_series_generator
    
    def predict(self, x_window, verbose=0):
        result = []
        x_window = np.squeeze(x_window, axis=0)
        last_element_index = x_window.shape[1]-1
        series = x_window[:,last_element_index].reshape(-1)
        cs = CubicSpline(np.arange(len(series)), series)
        next_value = cs(len(series)+1)
        result += [next_value]

        return np.array(result).reshape(1,-1) # 1,-1

In [4]:
pkl_list = os.listdir('/Users/chris/Documents/GitHub/financial_forecasting_analysis/data/CEEMDAN_datasets/')
pkl_list.remove('.DS_Store')

In [5]:
# this loops through the individual exports to recombine the files into one
dataset = {}
scalers = {}
dates =  {}
for file in pkl_list:
    with open(parentDirectory + '/data/CEEMDAN_datasets/' + file, 'rb') as f:
        test_dict = pickle.load(f)
        ticker = list(test_dict['decomposed_ticker_features_series'].keys())[0]
        dataset[ticker] = test_dict['decomposed_ticker_features_series'][ticker]
        scalers[ticker] = test_dict['scalers'][ticker]
        dates[ticker] = test_dict['dates']
        
    

In [6]:
# read in models
stock_dir = os.listdir(parentDirectory + '/data/CEEMDAN_trained_models')
if '.DS_Store' in stock_dir:
            stock_dir.remove('.DS_Store')
models ={}
for ticker in stock_dir:
        models[ticker] = {}
        tmp_dir = os.listdir(parentDirectory+ f'/data/CEEMDAN_trained_models/'+ f'{ticker}/')
        if '.DS_Store' in tmp_dir:
            tmp_dir.remove('.DS_Store')
        for IMF in tmp_dir:
            # get the IMF
            if re.findall("(IMF\d+)", IMF):
                tmp = re.findall("(IMF\d+)", IMF)[0]
            else:
                tmp = re.findall("(Rsd)", IMF)[0]
            if tmp in ['IMF1','IMF2']:
                models[ticker][tmp] = load_model(parentDirectory+ f'/data/CEEMDAN_trained_models/{ticker}/{IMF}')
            else:
                models[ticker][tmp] = pickle.load(open(parentDirectory+ f'/data/CEEMDAN_trained_models/{ticker}/{IMF}','rb'))

In [7]:
master_dict = {}
master_dict['data'] = dataset
master_dict['scalers'] = scalers
master_dict['dates'] = dates
master_dict['models'] = models

In [8]:
# Data organization
features_in_order = ['Open', 'High', 'Low', 'Volume', 'Close', 'Target_Close_2_NXT_High'] # target feature must be the last one here
target_feature = 'Target_Close_2_NXT_High'

train = 0.8
validation = 0.1
test = 0.1

max_window_size = 10
windows_sizes_for_imf_level = {
    'IMF1': 2,
    'IMF2': 2,
    'IMF3': 3,
    'IMF4': 3,
    'IMF5': 4,
    'IMF6': 4,
    'IMF7': 5,
    'IMF8': 5,
    'Rsd': 6,
    'DEFAULT': 4
}
target_feature_max_imf_level = {}

series = {}
for ticker in master_dict['data']:
    
    series[ticker] = {}
    target_feature_max_imf_level[ticker] ={}


    for feature in master_dict['data'][ticker]:
        
        imfs = pd.DataFrame.from_dict(master_dict['data'][ticker][feature])
        
        for imf in imfs:
            if imf not in series[ticker]:
                series[ticker][imf] = []
            _series = imfs[imf].values
            _series = _series.reshape((len(_series),1)) # reshaping to get into column format
            series[ticker][imf] += [_series]
            if feature == target_feature:
                target_feature_max_imf_level[ticker] = imf


dataset = {}
# # horizontal stack
for ticker in series:
    dataset[ticker] = {}
    for imf_level in series[ticker]:
        dataset[ticker][imf_level] = np.hstack(tuple(series[ticker][imf_level]))
            
train_dataset = {}
validation_dataset = {}
test_dataset = {}

for ticker in dataset:
        
        train_dataset[ticker] = {}
        validation_dataset[ticker] = {}
        test_dataset[ticker] = {}
        
        for imf_level in dataset[ticker]:
                    
            # splitting data sets according to rates
            train_dataset[ticker][imf_level] = dataset[ticker][imf_level][:round(train*dataset[ticker][imf_level].shape[0]),:]
            validation_dataset[ticker][imf_level] = dataset[ticker][imf_level][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0]),:]
            test_dataset[ticker][imf_level] = dataset[ticker][imf_level][round((train+validation)*dataset[ticker][imf_level].shape[0]):,:]

In [9]:
def generator(dataset):
    # data set split rates
    # create generators
    # NOTE STANDARD SCALER was FIT on .75 split so leakage if TRAIN VALIDATE GOES PAST
    
    train_dataset = {}
    validation_dataset = {}
    test_dataset = {}
    
    for ticker in dataset:
        
        train_dataset[ticker] = {}
        validation_dataset[ticker] = {}
        test_dataset[ticker] = {}
        
        for imf_level in dataset[ticker]:
                    
            # splitting data sets according to rates
            train_dataset[ticker][imf_level] = dataset[ticker][imf_level][:round(train*dataset[ticker][imf_level].shape[0]),:]
            validation_dataset[ticker][imf_level] = dataset[ticker][imf_level][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0]),:]
            test_dataset[ticker][imf_level] = dataset[ticker][imf_level][round((train+validation)*dataset[ticker][imf_level].shape[0]):,:]


    train_generators = {}
    validation_generators = {}
    test_generators = {}

    for ticker in dataset:

        train_generators[ticker] = {}
        validation_generators[ticker] = {}
        test_generators[ticker] = {}
        

        for imf_level in dataset[ticker]:

            if imf_level in windows_sizes_for_imf_level:
                window_size = windows_sizes_for_imf_level[imf_level]
            else: 
                window_size = windows_sizes_for_imf_level['DEFAULT']
            # windowing
            train_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(train_dataset[ticker][imf_level], train_dataset[ticker][imf_level], length=window_size, batch_size=1)
            validation_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(validation_dataset[ticker][imf_level], validation_dataset[ticker][imf_level], length=window_size, batch_size=1)
            test_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(test_dataset[ticker][imf_level], test_dataset[ticker][imf_level], length=window_size, batch_size=1)
    
    return train_generators,validation_generators,test_generators

In [10]:
train_generators,validation_generators,test_generators = generator(dataset)

In [11]:
# predicting

results = {}

for ticker in models:
    results[ticker] = {}

    # initializing results dictionary
    for feature in features_in_order:
        if feature != target_feature:
            continue
        results[ticker][feature] = {}
        for imf_level in models[ticker]:
            results[ticker][feature][imf_level] = {
                'real_train': [],
                'predicted_train': [],
                'real_validation': [],
                'predicted_validation': [],
                'real_test': [],
                'predicted_test': [],
                'x_axis_train': [],
                'x_axis_validation': [],
                'x_axis_test': []
            }

    for imf_level in tqdm(models[ticker]):
        model = models[ticker][imf_level]
        
        print(f'Predicting: [{ticker}][{imf_level}]')

        cur_train_gen = train_generators[ticker][imf_level]
        cur_validation_gen = validation_generators[ticker][imf_level]
        cur_test_gen = test_generators[ticker][imf_level]

        # predicting train
        day_counter = 0
        for i in range(len(cur_train_gen)):
            x, y = cur_train_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_train'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_train'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_train'] += [day_counter]
            day_counter += 1

        # predicting validation
        for i in range(len(cur_validation_gen)):
            x, y = cur_validation_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_validation'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_validation'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_validation'] += [day_counter]
            day_counter += 1

        # predicting test
        for i in range(len(cur_test_gen)):
            x, y = cur_test_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_test'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_test'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_test'] += [day_counter]
            day_counter += 1


  0%|          | 0/11 [00:00<?, ?it/s]

Predicting: [HD][IMF7]


  9%|▉         | 1/11 [00:00<00:04,  2.31it/s]

Predicting: [HD][IMF1]


 18%|█▊        | 2/11 [01:30<07:57, 53.02s/it]

Predicting: [HD][IMF8]


 27%|██▋       | 3/11 [01:30<03:52, 29.00s/it]

Predicting: [HD][IMF5]


 36%|███▋      | 4/11 [01:31<02:04, 17.72s/it]

Predicting: [HD][IMF10]


 45%|████▌     | 5/11 [01:31<01:08, 11.48s/it]

Predicting: [HD][IMF3]


 55%|█████▍    | 6/11 [01:31<00:38,  7.70s/it]

Predicting: [HD][IMF6]


 64%|██████▎   | 7/11 [01:32<00:21,  5.32s/it]

Predicting: [HD][Rsd]


 73%|███████▎  | 8/11 [01:32<00:11,  3.76s/it]

Predicting: [HD][IMF2]


 82%|████████▏ | 9/11 [03:02<01:01, 30.55s/it]

Predicting: [HD][IMF9]


 91%|█████████ | 10/11 [03:02<00:21, 21.25s/it]

Predicting: [HD][IMF4]


100%|██████████| 11/11 [03:02<00:00, 16.63s/it]
  0%|          | 0/11 [00:00<?, ?it/s]

Predicting: [MSFT][IMF3]


  9%|▉         | 1/11 [00:00<00:03,  2.62it/s]

Predicting: [MSFT][IMF2]


 18%|█▊        | 2/11 [01:26<07:35, 50.63s/it]

Predicting: [MSFT][IMF10]


 27%|██▋       | 3/11 [01:26<03:41, 27.70s/it]

Predicting: [MSFT][IMF6]


 36%|███▋      | 4/11 [01:27<01:58, 16.92s/it]

Predicting: [MSFT][IMF4]


 45%|████▌     | 5/11 [01:27<01:05, 10.96s/it]

Predicting: [MSFT][IMF9]


 55%|█████▍    | 6/11 [01:27<00:36,  7.37s/it]

Predicting: [MSFT][IMF7]


 64%|██████▎   | 7/11 [01:28<00:20,  5.10s/it]

Predicting: [MSFT][Rsd]


 73%|███████▎  | 8/11 [01:28<00:10,  3.61s/it]

Predicting: [MSFT][IMF5]


 82%|████████▏ | 9/11 [01:29<00:05,  2.60s/it]

Predicting: [MSFT][IMF1]


 91%|█████████ | 10/11 [02:56<00:28, 28.67s/it]

Predicting: [MSFT][IMF8]


100%|██████████| 11/11 [02:56<00:00, 16.04s/it]
  0%|          | 0/11 [00:00<?, ?it/s]

Predicting: [XOM][IMF1]


  9%|▉         | 1/11 [01:26<14:23, 86.38s/it]

Predicting: [XOM][IMF10]


 18%|█▊        | 2/11 [01:26<05:22, 35.81s/it]

Predicting: [XOM][IMF5]


 27%|██▋       | 3/11 [01:27<02:37, 19.65s/it]

Predicting: [XOM][IMF8]


 36%|███▋      | 4/11 [01:27<01:24, 12.05s/it]

Predicting: [XOM][Rsd]


 45%|████▌     | 5/11 [01:28<00:47,  7.85s/it]

Predicting: [XOM][IMF7]


 55%|█████▍    | 6/11 [01:28<00:26,  5.32s/it]

Predicting: [XOM][IMF4]


 64%|██████▎   | 7/11 [01:28<00:14,  3.70s/it]

Predicting: [XOM][IMF9]


 73%|███████▎  | 8/11 [01:29<00:07,  2.66s/it]

Predicting: [XOM][IMF2]


 82%|████████▏ | 9/11 [02:56<00:58, 29.18s/it]

Predicting: [XOM][IMF6]


 91%|█████████ | 10/11 [02:57<00:20, 20.30s/it]

Predicting: [XOM][IMF3]


100%|██████████| 11/11 [02:57<00:00, 16.14s/it]
  0%|          | 0/11 [00:00<?, ?it/s]

Predicting: [UNH][Rsd]


  9%|▉         | 1/11 [00:00<00:04,  2.40it/s]

Predicting: [UNH][IMF7]


 18%|█▊        | 2/11 [00:00<00:03,  2.44it/s]

Predicting: [UNH][IMF8]


 27%|██▋       | 3/11 [00:01<00:03,  2.41it/s]

Predicting: [UNH][IMF5]


 36%|███▋      | 4/11 [00:01<00:02,  2.41it/s]

Predicting: [UNH][IMF1]


 55%|█████▍    | 6/11 [01:29<01:45, 21.20s/it]

Predicting: [UNH][IMF10]
Predicting: [UNH][IMF6]


 64%|██████▎   | 7/11 [01:30<00:57, 14.41s/it]

Predicting: [UNH][IMF3]


 73%|███████▎  | 8/11 [01:30<00:29,  9.94s/it]

Predicting: [UNH][IMF2]


 82%|████████▏ | 9/11 [02:57<01:07, 33.99s/it]

Predicting: [UNH][IMF9]


 91%|█████████ | 10/11 [02:57<00:23, 23.62s/it]

Predicting: [UNH][IMF4]


100%|██████████| 11/11 [02:58<00:00, 16.21s/it]
  0%|          | 0/11 [00:00<?, ?it/s]

Predicting: [WAT][IMF6]


  9%|▉         | 1/11 [00:00<00:04,  2.44it/s]

Predicting: [WAT][IMF10]


 18%|█▊        | 2/11 [00:00<00:03,  2.45it/s]

Predicting: [WAT][Rsd]


 27%|██▋       | 3/11 [00:01<00:03,  2.45it/s]

Predicting: [WAT][IMF3]


 36%|███▋      | 4/11 [00:01<00:02,  2.53it/s]

Predicting: [WAT][IMF4]


 45%|████▌     | 5/11 [00:01<00:02,  2.57it/s]

Predicting: [WAT][IMF9]


 55%|█████▍    | 6/11 [00:02<00:01,  2.51it/s]

Predicting: [WAT][IMF1]


 64%|██████▎   | 7/11 [01:29<01:54, 28.71s/it]

Predicting: [WAT][IMF2]


 73%|███████▎  | 8/11 [02:56<02:21, 47.33s/it]

Predicting: [WAT][IMF7]


 82%|████████▏ | 9/11 [02:56<01:05, 32.66s/it]

Predicting: [WAT][IMF5]


 91%|█████████ | 10/11 [02:57<00:22, 22.70s/it]

Predicting: [WAT][IMF8]


100%|██████████| 11/11 [02:57<00:00, 16.16s/it]
  0%|          | 0/11 [00:00<?, ?it/s]

Predicting: [ADSK][IMF8]


  9%|▉         | 1/11 [00:00<00:04,  2.35it/s]

Predicting: [ADSK][IMF10]


 18%|█▊        | 2/11 [00:00<00:03,  2.38it/s]

Predicting: [ADSK][IMF5]


 27%|██▋       | 3/11 [00:01<00:03,  2.40it/s]

Predicting: [ADSK][Rsd]


 36%|███▋      | 4/11 [00:01<00:02,  2.39it/s]

Predicting: [ADSK][IMF7]


 45%|████▌     | 5/11 [00:02<00:02,  2.45it/s]

Predicting: [ADSK][IMF1]


 55%|█████▍    | 6/11 [01:31<02:33, 30.67s/it]

Predicting: [ADSK][IMF9]


 64%|██████▎   | 7/11 [01:31<01:23, 20.78s/it]

Predicting: [ADSK][IMF2]


 73%|███████▎  | 8/11 [02:59<02:05, 41.97s/it]

Predicting: [ADSK][IMF4]


 82%|████████▏ | 9/11 [02:59<00:57, 28.96s/it]

Predicting: [ADSK][IMF3]


 91%|█████████ | 10/11 [02:59<00:20, 20.14s/it]

Predicting: [ADSK][IMF6]


100%|██████████| 11/11 [03:00<00:00, 16.40s/it]


In [12]:
# organizing imf prediction results, concatenating train, validation and test
concatenated_results = {}

for ticker in results:
    concatenated_results[ticker] = {}
    for feature in results[ticker]:
        concatenated_results[ticker][feature] = {}
        for imf_level in results[ticker][feature]:
            
            df_result = pd.DataFrame.from_dict(results[ticker][feature][imf_level], orient='index').T
            df_train = df_result[['real_train','predicted_train','x_axis_train']].set_index('x_axis_train').dropna(axis=0)
            df_train.index.name = 'x'
            df_validation = df_result[['real_validation','predicted_validation','x_axis_validation']].set_index('x_axis_validation').dropna(axis=0)
            df_validation.index.name = 'x'
            df_test = df_result[['real_test','predicted_test','x_axis_test']].set_index('x_axis_test').dropna(axis=0)
            df_test.index.name = 'x'

            df_concatenated = pd.concat([df_train,df_validation,df_test], axis=1)

            concatenated_results[ticker][feature][imf_level] = df_concatenated

In [13]:
# recomposing prediction by arithmetically adding the IMF curves

final_prediction_results = {}
max_window_size = 10

for ticker in concatenated_results:
    final_prediction_results[ticker] = {}
    for feature in concatenated_results[ticker]:
        addition_train = None
        addition_validation = None
        addition_test = None

        addition_real_train = None
        addition_real_validation = None
        addition_real_test = None

        # recomposing predictions
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_test is None:
                addition_test = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
                cur_length = addition_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_test = addition_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_test = np.add(addition_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_train is None:
                addition_train = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
                cur_length = addition_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_train = addition_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_train = np.add(addition_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_validation is None:
                addition_validation = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
                cur_length = addition_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_validation = addition_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_validation = np.add(addition_validation,np_array_to_be_added)
        ###################
        # recomposing real#
        ###################
        
        # Note this is a method adopted from the research
        # However, it is peculiar that this is a reconstruction instead of a merging.
        # Verified: When converted back to High the real values don't match; use actual values
        
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_real_test is None:
                addition_real_test = concatenated_results[ticker][feature][imf_level]['real_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_test'].values
                cur_length = addition_real_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_test = addition_real_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_test = np.add(addition_real_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_real_train is None:
                addition_real_train = concatenated_results[ticker][feature][imf_level]['real_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_train'].values
                cur_length = addition_real_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_train = addition_real_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_train = np.add(addition_real_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_real_validation is None:
                addition_real_validation = concatenated_results[ticker][feature][imf_level]['real_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_validation'].values
                cur_length = addition_real_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_validation = addition_real_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_validation = np.add(addition_real_validation,np_array_to_be_added)
        
        scaler = master_dict['scalers'][ticker][feature]

        final_prediction_results[ticker][feature] = {
            'train_predicted': scaler.inverse_transform(addition_train.reshape(-1,1)).reshape(-1),
            'validation_predicted': scaler.inverse_transform(addition_validation.reshape(-1,1)).reshape(-1),
            'test_predicted': scaler.inverse_transform(addition_test.reshape(-1,1)).reshape(-1),
            'train_real': scaler.inverse_transform(addition_real_train.reshape(-1,1)).reshape(-1),
            'validation_real': scaler.inverse_transform(addition_real_validation.reshape(-1,1)).reshape(-1),
            'test_real': scaler.inverse_transform(addition_real_test.reshape(-1,1)).reshape(-1)
        }


In [15]:
# instantiate dict for compiling original space results
reporting = dict()

In [25]:
def scoring(ticker=str(), results=final_prediction_results, export_dict = reporting):
    
    
    experiment_time = datetime.now().strftime("%H_%M_%S_%m_%d_%Y")
    
    
    ticker_df = pd.read_pickle(parentDirectory+ '/data/raw_data_2021-11-12_12.31.45.pkl')
    coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
    ticker_df['reportperiod'] = pd.to_datetime(ticker_df['reportperiod'])
    ticker_df = ticker_df[ticker_df.ticker == ticker][coi]

        
    # Need to shift high back to run apply function
    # this doesn't work when all stocks are represented
    ticker_df['Back_Shifted_Real_High'] = ticker_df['High'].shift(-1)
    
    # read in results
    df = pd.DataFrame.from_dict(results[ticker]['Target_Close_2_NXT_High'])
    
    num_dates = ticker_df.shape[0]

    train_dates = ticker_df['reportperiod'][ticker_df.ticker == ticker][6:round(train*num_dates)]
    valid_dates = ticker_df['reportperiod'][ticker_df.ticker == ticker][6+round(train*num_dates):round((train+validation)*num_dates)]
    test_dates =  ticker_df['reportperiod'][ticker_df.ticker == ticker][6+round((train+validation)*num_dates):]


    # concat 
    # data might be shifted in three separate stages of 6 
    # so I might need to slice the dataset accordingly for proper analysis
    # data is shifted forward 18 days to accommodate residual window
    #flt_dates = ticker_dataframe.iloc[18:,:]
    
    flt_dates = pd.DataFrame()
    for ti in [train_dates,valid_dates,test_dates]:
        tmp_slice = ticker_df[(ticker_df['reportperiod'].isin(ti))]
        flt_dates= pd.concat([flt_dates,tmp_slice])
        
    

    # set indexes for joining
    df = df.set_index(flt_dates['reportperiod'])
    flt_dates.set_index('reportperiod', inplace=True)
    
    # join predicted and real
    recompiled = df.join(flt_dates)
    
    # change prediction back into original feature space
    for col in ['train_predicted','validation_predicted','test_predicted']:
        recompiled[col] = recompiled[col] * recompiled['Close']
     
    # align df for processing   
    for col in ['train_real','validation_real','test_real']: 
        recompiled[col] = recompiled.apply(lambda x: np.nan if np.isnan(x[col]) else x['Back_Shifted_Real_High'], axis=1)
    
    export_dict[ticker] = recompiled
        
    return 

In [27]:
for tick in final_prediction_results.keys():
    scoring(ticker=tick, results=final_prediction_results, export_dict = reporting)

In [32]:
# here is the result of the previous cell
reporting['MSFT']

Unnamed: 0_level_0,train_predicted,validation_predicted,test_predicted,train_real,validation_real,test_real,ticker,Open,High,Low,Close,Volume,Back_Shifted_Real_High
reportperiod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2000-04-10,43.693715,,,43.03125,,,MSFT,44.312500,44.312500,43.000000,43.031250,60685400.0,43.031250
2000-04-11,42.582780,,,41.12500,,,MSFT,42.562500,43.031250,41.750000,41.937500,71961800.0,41.125000
2000-04-12,40.300563,,,41.12500,,,MSFT,41.062500,41.125000,39.375000,39.687500,153003800.0,41.125000
2000-04-13,40.244089,,,39.75000,,,MSFT,40.437500,41.125000,39.500000,39.625000,94316200.0,39.750000
2000-04-14,37.650405,,,38.00000,,,MSFT,39.562500,39.750000,36.625000,37.062500,151217800.0,38.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-05,,,339.568634,,,337.649994,MSFT,338.510010,338.790009,334.420013,336.059998,22564000.0,337.649994
2021-11-08,,,340.248672,,,338.720001,MSFT,337.299988,337.649994,334.440002,336.989990,20897000.0,338.720001
2021-11-09,,,338.785030,,,334.630005,MSFT,337.109985,338.720001,334.529999,335.950012,21307400.0,334.630005
2021-11-10,,,333.059615,,,333.769989,MSFT,334.570007,334.630005,329.920013,330.799988,25500900.0,333.769989


In [33]:
def calc(final_prediction_results = reporting):
    
    comp_accuracy = {}
    accuracies_detailed = {}

    for ticker in final_prediction_results:
        comp_accuracy[ticker] = {}
        accuracies_detailed[ticker] = {}

        y_train = final_prediction_results[ticker]['train_predicted'][~np.isnan(final_prediction_results[ticker]['train_predicted'])]
        yhat_train = final_prediction_results[ticker]['train_real'][~np.isnan(final_prediction_results[ticker]['train_real'])]

        y_validation = final_prediction_results[ticker]['validation_predicted'][~np.isnan(final_prediction_results[ticker]['validation_predicted'])]
        yhat_validation = final_prediction_results[ticker]['validation_real'][~np.isnan(final_prediction_results[ticker]['validation_real'])]

        # need to shave the end because we don't have next day data
        y_test = final_prediction_results[ticker]['test_predicted'][~np.isnan(final_prediction_results[ticker]['test_predicted'])][:-1]
        yhat_test = final_prediction_results[ticker]['test_real'][~np.isnan(final_prediction_results[ticker]['test_real'])]
        accuracies_detailed[ticker] = {
            'mse':{
                'train':mean_squared_error(y_train,yhat_train),
                'validation':mean_squared_error(y_validation,yhat_validation),
                'test':mean_squared_error(y_test,yhat_test),
            },
            'mape':{
                'train':np.mean(np.abs((y_train - yhat_train) / y_train)) * 100,
                'validation':np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100,
                'test':np.mean(np.abs((y_test - yhat_test) / y_test)) * 100,
            }
        }

        comp_accuracy[ticker] = {
            'mse': mean_squared_error(y_test,yhat_test),
            'mape': np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100
        }

    # pd.DataFrame.from_dict(accuracies_detailed[plot_ticker][plot_feature])
    df_close_2_high_accuracies = pd.DataFrame.from_dict(comp_accuracy).T
    return df_close_2_high_accuracies, comp_accuracy, accuracies_detailed
   

In [34]:
df_close_2_high_accuracies, comp_accuracy, accuracies_detailed  = calc(final_prediction_results = reporting)

In [36]:
comp_accuracy

{'HD': {'mse': 14.286167615986654, 'mape': 0.657456483157971},
 'MSFT': {'mse': 9.149559433246413, 'mape': 0.806271872705335},
 'XOM': {'mse': 1.16876991223868, 'mape': 0.6590544138986737},
 'UNH': {'mse': 30.010606998627065, 'mape': 0.7313483510312824},
 'WAT': {'mse': 16.676858208503006, 'mape': 0.7619265055430389},
 'ADSK': {'mse': 17.980777139730808, 'mape': 1.0627991058601152}}

In [37]:
accuracies_detailed

{'HD': {'mse': {'train': 0.5154094844434481,
   'validation': 2.740172849759562,
   'test': 14.286167615986654},
  'mape': {'train': 1.0175993678867397,
   'validation': 0.657456483157971,
   'test': 1.0061553359536255}},
 'MSFT': {'mse': {'train': 0.23278050339507775,
   'validation': 1.4497950213764625,
   'test': 9.149559433246413},
  'mape': {'train': 1.0108272247362566,
   'validation': 0.806271872705335,
   'test': 1.0113732687746868}},
 'XOM': {'mse': {'train': 0.5874099587840659,
   'validation': 0.4861677929927149,
   'test': 1.16876991223868},
  'mape': {'train': 0.7780836760312163,
   'validation': 0.6590544138986737,
   'test': 1.5911883128262514}},
 'UNH': {'mse': {'train': 0.5176155408426173,
   'validation': 5.8614896788147774,
   'test': 30.010606998627065},
  'mape': {'train': 1.0084971101577636,
   'validation': 0.7313483510312824,
   'test': 1.1060194905852436}},
 'WAT': {'mse': {'train': 1.360070730949603,
   'validation': 6.723428880197069,
   'test': 16.6768582085

In [35]:
df_close_2_high_accuracies

Unnamed: 0,mse,mape
HD,14.286168,0.657456
MSFT,9.149559,0.806272
XOM,1.16877,0.659054
UNH,30.010607,0.731348
WAT,16.676858,0.761927
ADSK,17.980777,1.062799


In [43]:
import plotly.express as px
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [45]:
# iplot layout
space =  {
            'legend' : {'bgcolor':'#1A1A1C','font':{'color':'#D9D9D9',"size":12}},
            'paper_bgcolor' : '#1A1A1C',
            'plot_bgcolor' : '#1A1A1C',
            "title" : {"font":{"color":"#D9D9D9"},"x":0.5},
            'yaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'xaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'titlefont' : {'color':'#D9D9D9'}
        }

In [47]:
# plotting final result
plot_ticker = 'XOM'
plot_feature = target_feature

reporting['XOM'][['train_predicted','validation_predicted','test_predicted','train_real','validation_real','test_real']].iplot(title=f'{plot_ticker} {plot_feature}', layout=space)

In [39]:
with open(parentDirectory + '/data/CEEMDAN_result_exports/' + 'results_dictionary.pkl', 'wb') as f:
    pickle.dump(reporting, f)
    print(f"Exported reporting dictionary to {parentDirectory + '/data/CEEMDAN_result_exports/' + 'results_dictionary.pkl'}")

Exported reporting dictionary to /Users/chris/Documents/GitHub/financial_forecasting_analysis/data/CEEMDAN_result_exports/results_dictionary.pkl


In [57]:
for t in reporting.keys():
    
    tmp =  pd.melt(reporting[t].reset_index()[['reportperiod','train_predicted','validation_predicted','test_predicted']], 
                   id_vars=['reportperiod']).dropna().rename(columns={'value':'prediction'})[['reportperiod','prediction']]

    tmp.to_csv(parentDirectory + f"/data/CEEMDAN_result_exports/{t}_backtesting_LSTM_SPLINE_predictions.csv", sep=',', encoding='utf-8')

In [52]:
reporting['MSFT']

Unnamed: 0,reportperiod,train_predicted,validation_predicted,test_predicted,train_real,validation_real,test_real,ticker,Open,High,Low,Close,Volume,Back_Shifted_Real_High
0,2000-04-10,43.693715,,,43.03125,,,MSFT,44.312500,44.312500,43.000000,43.031250,60685400.0,43.031250
1,2000-04-11,42.582780,,,41.12500,,,MSFT,42.562500,43.031250,41.750000,41.937500,71961800.0,41.125000
2,2000-04-12,40.300563,,,41.12500,,,MSFT,41.062500,41.125000,39.375000,39.687500,153003800.0,41.125000
3,2000-04-13,40.244089,,,39.75000,,,MSFT,40.437500,41.125000,39.500000,39.625000,94316200.0,39.750000
4,2000-04-14,37.650405,,,38.00000,,,MSFT,39.562500,39.750000,36.625000,37.062500,151217800.0,38.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5417,2021-11-05,,,339.568634,,,337.649994,MSFT,338.510010,338.790009,334.420013,336.059998,22564000.0,337.649994
5418,2021-11-08,,,340.248672,,,338.720001,MSFT,337.299988,337.649994,334.440002,336.989990,20897000.0,338.720001
5419,2021-11-09,,,338.785030,,,334.630005,MSFT,337.109985,338.720001,334.529999,335.950012,21307400.0,334.630005
5420,2021-11-10,,,333.059615,,,333.769989,MSFT,334.570007,334.630005,329.920013,330.799988,25500900.0,333.769989


In [56]:
pd.melt(reporting['MSFT'].reset_index()[['reportperiod','train_predicted','validation_predicted','test_predicted']], id_vars=['reportperiod']).dropna().rename(columns={'value':'prediction'})[['reportperiod','prediction']]

Unnamed: 0,reportperiod,prediction
0,2000-04-10,43.693715
1,2000-04-11,42.582780
2,2000-04-12,40.300563
3,2000-04-13,40.244089
4,2000-04-14,37.650405
...,...,...
16261,2021-11-05,339.568634
16262,2021-11-08,340.248672
16263,2021-11-09,338.785030
16264,2021-11-10,333.059615


In [None]:
tmp = reporting['MSFT'][['train_predicted','train_real','validation_predicted','validation_real','test_predicted','test_real']]

In [54]:
tmp

'IMF6'

In [41]:
# export results
df_close_2_high_accuracies.to_csv(parentDirectory + f"/data/CEEMDAN_result_exports/'{'_'.join('imfs_to_predict_with_neural')}_full_spline_.csv", sep=',', encoding='utf-8')