# Testing transformers & Decomposition on 1 Stock

In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import timedelta, datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from PyEMD import CEEMDAN
from sklearn.decomposition import PCA
import plotly.express as px
from tqdm import tqdm
from datetime import datetime
import sys, os

In [2]:
absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

#Path of parent directory (moves outside of repository)
parentDirectory = os.path.dirname(fileDirectory)

In [3]:
class ManyToOneTimeSeriesGenerator(TimeseriesGenerator):
  def __getitem__(self, idx):
    x, y = super().__getitem__(idx)
    last_element_index = y.shape[1]-1
    return x, y[:,last_element_index].reshape(1,-1)

class SplineModel():
    def __init__(self,time_series_generator):
        self.name = "SplineModel"
        self.gen = time_series_generator
    
    def predict(self, x_window, verbose=0):
        result = []
        x_window = np.squeeze(x_window, axis=0)
        last_element_index = x_window.shape[1]-1
        series = x_window[:,last_element_index].reshape(-1)
        cs = CubicSpline(np.arange(len(series)), series)
        next_value = cs(len(series)+1)
        result += [next_value]

        return np.array(result).reshape(1,-1) # 1,-1

NameError: name 'TimeseriesGenerator' is not defined

In [4]:
coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
coi[2:]

['Open', 'High', 'Low', 'Close', 'Volume']

# Part 1

In [5]:

def stock_decomp(dfile, tick, annum, minmax= True, standard=False, logged=False, export=False):
    
    # grab only the data needed
    ticker_dataframe = pd.read_pickle(fileDirectory+ dfile)
    coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
    ticker_dataframe['reportperiod'] = pd.to_datetime(ticker_dataframe['reportperiod'])
    ticker_dataframe = ticker_dataframe[(ticker_dataframe.ticker == tick) & (ticker_dataframe['reportperiod'].dt.year == annum)][coi]
    
    # decomposing
    decomposed_ticker_features_series = {}
    scalers = {}
    ceemdan = CEEMDAN(parallel=True, processes=10)
    
    for ticker in ticker_dataframe.ticker.unique():
        print(f'[{ticker}] Decomposing...')
        decomposed_ticker_features_series[ticker] = {}
        scalers[ticker] = {}
        
        if logged:
            for c in coi[2:]:
                ticker_dataframe[c] = np.log(ticker_dataframe[c])
         
        for column in coi[2:]:  
            decomposed_ticker_features_series[ticker][column] ={}      
            try:
                # train / valid / test  
                TRAIN_VALID_RATIO = 0.75
                TRAIN_TEST_CUTOFF = ticker_dataframe.index[round(ticker_dataframe.shape[0] * TRAIN_VALID_RATIO)]
                train_valids_series = ticker_dataframe.loc[:TRAIN_TEST_CUTOFF][column].values.reshape(-1,1)
                test = ticker_dataframe.loc[TRAIN_TEST_CUTOFF:][column].values.reshape(-1,1)
                #series = ticker_dataframe[column].values.reshape(-1,1)
                #print(f'[{ticker}][{column}] train_valid_test split done')

                if standard:
                    scaler = StandardScaler()
                    scaler.fit(train_valids_series)
                    scalers[ticker][column] = scaler
                
                if minmax:
                    scaler = MinMaxScaler()
                    scaler.fit(train_valids_series)
                    scalers[ticker][column] = scaler
                #print(f'[{ticker}][{column}] Scaling finished; next decomposition')
                
                # decompose
                ticker_feature_time_series = np.frombuffer(scaler.transform(train_valids_series))
                ticker_feature_time_series_imfs = ceemdan(ticker_feature_time_series, max_imf=10)
                print(ticker_feature_time_series_imfs)
                #print(f'[{ticker}][{column}] Decomposition finished; restructuring')
                
                # iterating every IMF 
                for i, imf_series in enumerate(ticker_feature_time_series_imfs):
                    
                    if i < len(ticker_feature_time_series_imfs)-1: # last one is residual
                        decomposed_ticker_features_series[ticker][column][f'IMF{i+1}'] = imf_series
                    else:
                        decomposed_ticker_features_series[ticker][column][f'Rsd'] = imf_series
                print(f'Finished Decomposing [{ticker}][{column}]')
            except:
                print(f'ERROR ticker[{ticker}][{column}]')
                decomposed_ticker_features_series[ticker][column] = 'ERROR'
                
            finally:
                continue
            
    if export:
        
        # get date stamp
        dateTimeObj = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
        
        if standard:
            s = 'stdscl'
        if minmax:
            s = 'minmaxscl'
        
        if logged:
            l = 'logged'
        else:
            l = 'orig'
            
            
        # save file

        with open(fileDirectory +  f'/data/DNN_testing_outputs/{tick}__{annum}_decomposed_ticker_features_series_{l}_{s}_{dateTimeObj}.pkl', 'wb') as f:
            pickle.dump(decomposed_ticker_features_series, f)
        with open(fileDirectory + f'/data/DNN_testing_outputs/{tick}__{annum}_scalers_{l}_{s}_{dateTimeObj}.pkl', 'wb') as f:
            pickle.dump(scalers, f)
        print('Export Locations:')
        print(f'/data/DNN_testing_outputs/{tick}__{annum}_decomposed_ticker_features_series_{l}_{s}_{dateTimeObj}.pkl')
        print(f'/data/DNN_testing_outputs/{tick}__{annum}_scalers_{l}_{s}_{dateTimeObj}.pkl')
    
    tmp_dict = {}
    tmp_dict['decomposed_ticker_features_series'] = decomposed_ticker_features_series
    tmp_dict['scalers'] = scalers
    tmp_dict['TRAIN_TEST_CUTOFF'] = TRAIN_TEST_CUTOFF
    tmp_dict['train_valids_series'] = train_valids_series
    tmp_dict['test'] = test
            
    return tmp_dict
    

In [44]:
test_dict = stock_decomp(dfile= '/data/raw_data_2021-11-12_12.31.45.pkl', tick='MSFT', annum=2020, minmax= True, export=True)

[MSFT] Decomposing...
Finished Decomposing [MSFT][Open]
Finished Decomposing [MSFT][High]
Finished Decomposing [MSFT][Low]
Finished Decomposing [MSFT][Close]
Finished Decomposing [MSFT][Volume]
Export Locations:
/data/DNN_testing_outputs/MSFT__2020_decomposed_ticker_features_series_orig_minmaxscl_2021-11-26_16.29.14.pkl
/data/DNN_testing_outputs/MSFT__2020_scalers_orig_minmaxscl_2021-11-26_16.29.14.pkl


In [6]:
test_dict2 = stock_decomp(dfile= '/data/raw_data_2021-11-12_12.31.45.pkl', tick='MSFT', annum=2020, minmax= True, export=False)

[MSFT] Decomposing...
[[-3.30617678e-03  5.40537899e-03 -6.46589514e-03  8.50210116e-03
  -9.60252665e-03  7.54359705e-03  7.59730462e-03 -7.60478021e-03
   8.53022084e-03 -9.59387351e-03 -9.48723212e-03  8.10302067e-03
  -4.41881365e-03  3.02074392e-03 -8.31576711e-03  2.00797673e-02
  -3.38061005e-02 -6.86522554e-03  1.92006335e-02  5.82224393e-02
   4.00277617e-03 -5.18594452e-02 -1.46266833e-02  3.19298966e-02
  -1.75898967e-02 -1.08994101e-02 -2.39859901e-02  3.41030678e-02
  -2.68921819e-02 -5.25534067e-02 -4.64902854e-02 -1.20841556e-02
   3.29561965e-02  5.20912853e-02  5.54280659e-02 -5.81795786e-02
   6.52720737e-02  5.66856932e-02  1.01965776e-02 -1.01247913e-01
   3.11260547e-02  1.13392849e-01  5.44302487e-02  3.65878274e-02
   1.71460815e-02 -8.21914153e-02  2.85918479e-02  5.30937473e-02
  -3.87615390e-02  2.49438327e-02 -1.34566288e-02  9.88222236e-03
  -1.82206493e-02  2.14282431e-02  4.89299994e-02 -5.22311175e-02
  -1.47814664e-04  2.44682408e-02 -8.74100653e-03  5.2

In [45]:
test_dict.keys()

dict_keys(['decomposed_ticker_features_series', 'scalers', 'TRAIN_TEST_CUTOFF', 'train_valids_series', 'test'])

In [48]:
dateTimeObj = datetime.now().strftime("%Y-%m-%d_%H.%M")
with open(fileDirectory +  f'/data/DNN_testing_outputs/compiled_dict_{dateTimeObj}.pkl', 'wb') as f:
    pickle.dump(test_dict, f)
    print("Exported to:")
    print(f'/data/DNN_testing_outputs/compiled_dict_{dateTimeObj}.pkl')

Exported to:
/data/DNN_testing_outputs/compiled_dict_2021-11-26_16.31.pkl


In [46]:
# Printing structure
tick = list(test_dict['decomposed_ticker_features_series'].keys())[0]
print(f"TICKER: {tick} {test_dict['decomposed_ticker_features_series'][tick].keys()}")
for k in test_dict['decomposed_ticker_features_series'][tick].keys():
    print(f"FEATURE: {k} \n{test_dict['decomposed_ticker_features_series'][tick][k].keys()}")
    

TICKER: MSFT dict_keys(['Open', 'High', 'Low', 'Close', 'Volume'])
FEATURE: Open 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5'])
FEATURE: High 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5'])
FEATURE: Low 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4'])
FEATURE: Close 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5'])
FEATURE: Volume 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5'])


# Part 2
### Restart notebook for Tensor

In [13]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np
import pickle

from scipy.interpolate import CubicSpline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error 
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, LSTM, LeakyReLU
from tensorflow.keras.callbacks import ModelCheckpoint
from tqdm import tqdm
import sys, os

experiment_time = datetime.now().strftime("%H_%M_%S_%m_%d_%Y")

features_in_order = ['Open', 'High', 'Low', 'Volume', 'Close'] # target feature must be the last one here
target_feature = 'Close'

absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

#Path of parent directory (moves outside of repository)
parentDirectory = os.path.dirname(fileDirectory)

class ManyToOneTimeSeriesGenerator(TimeseriesGenerator):
  def __getitem__(self, idx):
    x, y = super().__getitem__(idx)
    last_element_index = y.shape[1]-1
    return x, y[:,last_element_index].reshape(1,-1)

class SplineModel():
    def __init__(self,time_series_generator):
        self.name = "SplineModel"
        self.gen = time_series_generator
    
    def predict(self, x_window, verbose=0):
        result = []
        x_window = np.squeeze(x_window, axis=0)
        last_element_index = x_window.shape[1]-1
        series = x_window[:,last_element_index].reshape(-1)
        cs = CubicSpline(np.arange(len(series)), series)
        next_value = cs(len(series)+1)
        result += [next_value]

        return np.array(result).reshape(1,-1) # 1,-1

In [14]:
with open(fileDirectory +  f'/data/DNN_testing_outputs/compiled_dict_2021-11-26_16.31.pkl', 'rb') as f:
    test_dict = pickle.load(f)

In [15]:
# 
test_dict.keys()

dict_keys(['decomposed_ticker_features_series', 'scalers', 'TRAIN_TEST_CUTOFF', 'train_valids_series', 'test'])

In [57]:
# Data organization
max_window_size = 10
windows_sizes_for_imf_level = {
    'IMF1': 2,
    'IMF2': 2,
    'IMF3': 3,
    'IMF4': 3,
    'IMF5': 4,
    'IMF6': 4,
    'IMF7': 5,
    'IMF8': 5,
    'Rsd': 6,
    'DEFAULT': 4
}
target_feature_max_imf_level = {}

# Coupling together the IMFs of the same level for different features to create exogenous input
# The number of imfs for each feature decomposition may differ, thus some of the last imfs may not match in number of features
series = {}
for ticker in test_dict['decomposed_ticker_features_series']:
    
    series[ticker] = {}
    target_feature_max_imf_level[ticker] ={}


    for feature in test_dict['decomposed_ticker_features_series'][ticker]:
        
        imfs = pd.DataFrame.from_dict(test_dict['decomposed_ticker_features_series'][ticker][feature])
        
        for imf in imfs:
            if imf not in series[ticker]:
                series[ticker][imf] = []
            _series = imfs[imf].values
            _series = _series.reshape((len(_series),1)) # reshaping to get into column format
            series[ticker][imf] += [_series]
            if feature == target_feature:
                target_feature_max_imf_level[ticker] = imf

# cut_spare_imfs: when any of the exogenous features have more imfs than the target feature. This solves a bug, if not excluded, these spare imfs from exogenous features would be wrongly added in the recomposition of the target feature.
series_cut = {}
for ticker in series:
    if ticker not in series_cut:
        series_cut[ticker] = {}

    for imf_level_string in series[ticker]:
        imf_level_int = int(imf_level_string[3:])
        if imf_level_int > int(target_feature_max_imf_level[ticker][3:]):
            continue
        else:
            #print(f'ticker = {str(ticker)}, y = {str(y)} imf_level_string ={imf_level_string}')
            series_cut[ticker][imf_level_string] = series[ticker][imf_level_string].copy()
# if doing full and not spline then uncomment below
series = series_cut

dataset = {}
# # horizontal stack
for ticker in series:
    dataset[ticker] = {}
    for imf_level in series[ticker]:
        dataset[ticker][imf_level] = np.hstack(tuple(series[ticker][imf_level]))




In [58]:
series_cut

{'MSFT': {'IMF1': [array([[-3.25240704e-03],
          [ 5.53326355e-03],
          [-6.54192117e-03],
          [ 8.49155177e-03],
          [-9.54906911e-03],
          [ 7.94820652e-03],
          [ 7.40409071e-03],
          [-7.61267285e-03],
          [ 8.51412596e-03],
          [-9.49700540e-03],
          [-9.42902451e-03],
          [ 8.01675625e-03],
          [-4.38767058e-03],
          [ 3.07065616e-03],
          [-8.39373160e-03],
          [ 2.01115898e-02],
          [-3.37194947e-02],
          [-7.23021819e-03],
          [ 1.86019613e-02],
          [ 5.75057507e-02],
          [ 3.87069801e-03],
          [-5.12437615e-02],
          [-1.33967308e-02],
          [ 3.28238118e-02],
          [-1.86232066e-02],
          [-1.35302456e-02],
          [-2.56742129e-02],
          [ 3.45076549e-02],
          [-2.54348241e-02],
          [-5.07370612e-02],
          [-4.48779348e-02],
          [-1.08478985e-02],
          [ 3.33386921e-02],
          [ 5.20035156e-02]

In [59]:
dataset.keys()

dict_keys(['MSFT'])

In [60]:
train = 0.55
validation = 0.2
test = 0.25

train_dataset = {}
validation_dataset = {}
test_dataset = {}
for ticker in dataset:
    
    train_dataset[ticker] = {}
    validation_dataset[ticker] = {}
    test_dataset[ticker] = {}
    
    for imf_level in dataset[ticker]:
                
        # splitting data sets according to rates
        train_dataset[ticker][imf_level] = dataset[ticker][imf_level][:round(train*dataset[ticker][imf_level].shape[0]),:]
        validation_dataset[ticker][imf_level] = dataset[ticker][imf_level][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0]),:]
        test_dataset[ticker][imf_level] = dataset[ticker][imf_level][round((train+validation)*dataset[ticker][imf_level].shape[0]):,:]

In [61]:
def generator(dataset):
    # data set split rates
    # create generators
    # NOTE STANDARD SCALER was FIT on .75 split so leakage if TRAIN VALIDATE GOES PAST


    train_generators = {}
    validation_generators = {}
    test_generators = {}

    for ticker in dataset:

        train_generators[ticker] = {}
        validation_generators[ticker] = {}
        test_generators[ticker] = {}
        

        for imf_level in dataset[ticker]:

            if imf_level in windows_sizes_for_imf_level:
                window_size = windows_sizes_for_imf_level[imf_level]
            else: 
                window_size = windows_sizes_for_imf_level['DEFAULT']
            # windowing
            train_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(train_dataset[ticker][imf_level], train_dataset[ticker][imf_level], length=window_size, batch_size=1)
            validation_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(validation_dataset[ticker][imf_level], validation_dataset[ticker][imf_level], length=window_size, batch_size=1)
            test_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(test_dataset[ticker][imf_level], test_dataset[ticker][imf_level], length=window_size, batch_size=1)
    
    return train_generators,validation_generators,test_generators

In [65]:
train_generators,validation_generators,test_generators = generator(dataset)

In [66]:
# Model Training

models = {}

model_epochs = {
    'IMF1': 2500,
    'IMF2': 2000,
    'IMF3': 1500,
    'IMF4': 1500,
    'IMF5': 1500,
    'IMF6': 1200,
    'IMF7': 1200,
    'IMF8': 1000,
    'Rsd': 1000,
    'DEFAULT': 1000
}

imfs_to_predict_with_neural = ['IMF1', 'IMF2','IMF3'] # set to ['IMF1'] , ['IMF1', 'IMF2'], ['IMF1','IMF2','IMF3'] and so on



for ticker in train_generators:
    models[ticker] = {}

    reached_max_imf_of_target_feature = False
    for imf_level in train_generators[ticker]:
        if imf_level in imfs_to_predict_with_neural:
            print(f'Training model [{ticker}][{imf_level}]')
            if reached_max_imf_of_target_feature is True:
                break # no need to predict further if target feature doesn't contain greater IMF levels

            if target_feature_max_imf_level[ticker] == imf_level:
                reached_max_imf_of_target_feature = True
            # Prediction model
            model = Sequential()
            current_dataset = train_dataset[ticker][imf_level]
            n_features = current_dataset.shape[1]
            cur_tmp_gen = train_generators[ticker][imf_level]
            cur_tmp_val_gen = validation_generators[ticker][imf_level]

            if imf_level in windows_sizes_for_imf_level:
                window_size = windows_sizes_for_imf_level[imf_level]
            else: 
                window_size = windows_sizes_for_imf_level['DEFAULT']

            model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(window_size, n_features)))
            model.add(LSTM(64, activation='tanh', input_shape=(window_size, 128)))
            model.add(Dense(16))
            model.add(LeakyReLU())
            model.add(Dense(4))
            model.add(LeakyReLU())
            model.add(Dense(1)) # 1 target feature only
            model.compile(optimizer='adam', loss='mse')

            number_of_epochs = model_epochs[imf_level]
            checkpoint_path = fileDirectory + "/data/DNN_tmp/" +f"{ticker}_" +f"ltsm_spline.h5"
            
            callbacks = [ ModelCheckpoint(checkpoint_path, monitor='loss', mode="max", verbose=0,save_best_only=True, save_weights_only=False, save_freq=250)]

            # fit model
            #model.fit_generator(cur_tmp_gen, steps_per_epoch=1, epochs=number_of_epochs, verbose=0)
            model.fit(cur_tmp_gen, validation_data=cur_tmp_val_gen, steps_per_epoch=10, epochs=number_of_epochs, verbose=0, callbacks=callbacks)

            models[ticker][imf_level] = model
        else:
            # Spline prediction model
            cur_tmp_gen = train_generators[ticker][imf_level]
            model = SplineModel(cur_tmp_gen)
            models[ticker][imf_level] = model
            


Training model [MSFT][IMF1]
Training model [MSFT][IMF2]
Training model [MSFT][IMF3]


In [67]:
# reset generators
train_generators,validation_generators,test_generators = generator(dataset)

In [68]:
# predicting

results = {}

for ticker in models:
    results[ticker] = {}

    # initializing results dicitionary
    for feature in features_in_order:
        if feature != target_feature:
            continue
        results[ticker][feature] = {}
        for imf_level in models[ticker]:
            results[ticker][feature][imf_level] = {
                'real_train': [],
                'predicted_train': [],
                'real_validation': [],
                'predicted_validation': [],
                'real_test': [],
                'predicted_test': [],
                'x_axis_train': [],
                'x_axis_validation': [],
                'x_axis_test': []
            }

    for imf_level in models[ticker]:
        model = models[ticker][imf_level]
        
        print(f'Predicting: [{ticker}][{imf_level}]')

        cur_train_gen = train_generators[ticker][imf_level]
        cur_validation_gen = validation_generators[ticker][imf_level]
        cur_test_gen = test_generators[ticker][imf_level]

        # predicting train
        day_counter = 0
        for i in range(len(cur_train_gen)):
            x, y = cur_train_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_train'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_train'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_train'] += [day_counter]
            day_counter += 1

        # predicting validation
        for i in range(len(cur_validation_gen)):
            x, y = cur_validation_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_validation'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_validation'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_validation'] += [day_counter]
            day_counter += 1

        # predicting test
        for i in range(len(cur_test_gen)):
            x, y = cur_test_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_test'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_test'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_test'] += [day_counter]
            day_counter += 1


Predicting: [MSFT][IMF1]
Predicting: [MSFT][IMF2]
Predicting: [MSFT][IMF3]
Predicting: [MSFT][IMF4]
Predicting: [MSFT][IMF5]


In [69]:
# organizing imf prediction results, concatenating train, validation and test
concatenated_results = {}

for ticker in results:
    concatenated_results[ticker] = {}
    for feature in results[ticker]:
        concatenated_results[ticker][feature] = {}
        for imf_level in results[ticker][feature]:
            
            df_result = pd.DataFrame.from_dict(results[ticker][feature][imf_level], orient='index').T
            df_train = df_result[['real_train','predicted_train','x_axis_train']].set_index('x_axis_train').dropna(axis=0)
            df_train.index.name = 'x'
            df_validation = df_result[['real_validation','predicted_validation','x_axis_validation']].set_index('x_axis_validation').dropna(axis=0)
            df_validation.index.name = 'x'
            df_test = df_result[['real_test','predicted_test','x_axis_test']].set_index('x_axis_test').dropna(axis=0)
            df_test.index.name = 'x'

            df_concatenated = pd.concat([df_train,df_validation,df_test], axis=1)

            concatenated_results[ticker][feature][imf_level] = df_concatenated

In [70]:
# iplot layout
space =  {
            'legend' : {'bgcolor':'#1A1A1C','font':{'color':'#D9D9D9',"size":12}},
            'paper_bgcolor' : '#1A1A1C',
            'plot_bgcolor' : '#1A1A1C',
            "title" : {"font":{"color":"#D9D9D9"},"x":0.5},
            'yaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'xaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'titlefont' : {'color':'#D9D9D9'}
        }

In [71]:
# plotting partial result
plot_ticker = 'MSFT'
plot_feature = target_feature
plot_imf = 'IMF1'

concatenated_results[plot_ticker][plot_feature][plot_imf].iplot(title=f'{plot_ticker} {plot_feature} {plot_imf}', asFigure=True, layout=space)

In [72]:
# recomposing prediction by arithmetically adding the IMF curves

final_prediction_results = {}
max_window_size = 10

for ticker in concatenated_results:
    final_prediction_results[ticker] = {}
    for feature in concatenated_results[ticker]:
        addition_train = None
        addition_validation = None
        addition_test = None

        addition_real_train = None
        addition_real_validation = None
        addition_real_test = None

        # recomposing predictions
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_test is None:
                addition_test = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
                cur_length = addition_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_test = addition_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_test = np.add(addition_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_train is None:
                addition_train = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
                cur_length = addition_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_train = addition_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_train = np.add(addition_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_validation is None:
                addition_validation = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
                cur_length = addition_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_validation = addition_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_validation = np.add(addition_validation,np_array_to_be_added)

        # recomposing real
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_real_test is None:
                addition_real_test = concatenated_results[ticker][feature][imf_level]['real_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_test'].values
                cur_length = addition_real_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_test = addition_real_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_test = np.add(addition_real_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_real_train is None:
                addition_real_train = concatenated_results[ticker][feature][imf_level]['real_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_train'].values
                cur_length = addition_real_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_train = addition_real_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_train = np.add(addition_real_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_real_validation is None:
                addition_real_validation = concatenated_results[ticker][feature][imf_level]['real_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_validation'].values
                cur_length = addition_real_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_validation = addition_real_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_validation = np.add(addition_real_validation,np_array_to_be_added)
        
        scaler = test_dict['scalers'][ticker][feature]

        final_prediction_results[ticker][feature] = {
            'train_predicted': scaler.inverse_transform(addition_train.reshape(-1,1)).reshape(-1),
            'validation_predicted': scaler.inverse_transform(addition_validation.reshape(-1,1)).reshape(-1),
            'test_predicted': scaler.inverse_transform(addition_test.reshape(-1,1)).reshape(-1),
            'train_real': scaler.inverse_transform(addition_real_train.reshape(-1,1)).reshape(-1),
            'validation_real': scaler.inverse_transform(addition_real_validation.reshape(-1,1)).reshape(-1),
            'test_real': scaler.inverse_transform(addition_real_test.reshape(-1,1)).reshape(-1)
        }

In [73]:
# plotting final result
plot_ticker = 'MSFT'
plot_feature = 'Close'

pd.DataFrame.from_dict(final_prediction_results[plot_ticker][plot_feature]).iplot(title=f'{plot_ticker} {plot_feature} {plot_imf}', layout=space)

In [55]:
# Verifying that 'Real' are as expected
ticker_dataframe = pd.read_pickle(fileDirectory+ '/data/raw_data_2021-11-12_12.31.45.pkl')
coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
ticker_dataframe['reportperiod'] = pd.to_datetime(ticker_dataframe['reportperiod'])
ticker_dataframe = ticker_dataframe[(ticker_dataframe.ticker == 'MSFT') & (ticker_dataframe['reportperiod'].dt.year == 2020)][coi]

ticker_dataframe

Unnamed: 0,ticker,reportperiod,Open,High,Low,Close,Volume
1495992,MSFT,2020-01-02,158.779999,160.729996,158.330002,160.619995,22622100.0
1495991,MSFT,2020-01-03,158.320007,159.949997,158.059998,158.619995,21116200.0
1495990,MSFT,2020-01-06,157.080002,159.100006,156.509995,159.029999,20813700.0
1495989,MSFT,2020-01-07,159.320007,159.669998,157.320007,157.580002,21634100.0
1495988,MSFT,2020-01-08,158.929993,160.800003,157.949997,160.089996,27746500.0
...,...,...,...,...,...,...,...
1495744,MSFT,2020-12-24,221.419998,223.610001,221.199997,222.750000,10550600.0
1495743,MSFT,2020-12-28,224.449997,226.029999,223.020004,224.960007,17933500.0
1495742,MSFT,2020-12-29,226.309998,227.179993,223.580002,224.149994,17403200.0
1495741,MSFT,2020-12-30,225.229996,225.630005,221.470001,221.679993,20272300.0


-0.7180880432082801

In [50]:
#final_prediction_results['MSFT']['Close']['validation_predicted']
final_prediction_results['MSFT']['Close']['train_real']

array([147.33417189, 139.59049457, 138.78819044, 139.88446581,
       142.13751499, 139.63054283, 142.60981662, 155.39884389,
       149.48959583, 142.94261885, 137.51601923, 143.8912391 ,
       152.60690829, 143.86908474, 155.8648157 , 176.36733194,
       157.55463302, 150.20754299, 157.90837233, 161.25952279,
       147.34013652, 154.37317031, 157.19139898, 178.26919808,
       170.84748927, 156.52372516, 141.73837143, 147.38980123,
       150.07425168, 158.43070384, 172.68520516, 196.71233315,
       196.42286581, 181.97725275, 226.97834652, 231.6499939 ,
       200.02294844, 200.80954976, 174.19681382, 171.76579944,
       202.20223092, 199.27858647, 193.11322433, 182.17871131,
       227.04103604, 226.43361719, 220.56453912, 212.23097168,
       212.88026469, 218.15044482, 216.86439671, 209.69344598,
       214.00441559, 205.63140853, 192.15608318, 182.9951358 ,
       190.75889811, 208.41774468, 184.12427752, 173.97320091,
       163.76345177, 195.25233683, 189.9660887 , 172.37

In [44]:
adj_close_accuracies = {}
accuracies_detailed = {}

for ticker in final_prediction_results:
    adj_close_accuracies[ticker] = {}
    accuracies_detailed[ticker] = {}
    for feature in final_prediction_results[ticker]:

        y_train = final_prediction_results[ticker][feature]['train_predicted'][~np.isnan(final_prediction_results[ticker][feature]['train_predicted'])]
        yhat_train = final_prediction_results[ticker][feature]['train_real'][~np.isnan(final_prediction_results[ticker][feature]['train_real'])]

        y_validation = final_prediction_results[ticker][feature]['validation_predicted'][~np.isnan(final_prediction_results[ticker][feature]['validation_predicted'])]
        yhat_validation = final_prediction_results[ticker][feature]['validation_real'][~np.isnan(final_prediction_results[ticker][feature]['validation_real'])]

        y_test = final_prediction_results[ticker][feature]['test_predicted'][~np.isnan(final_prediction_results[ticker][feature]['test_predicted'])]
        yhat_test = final_prediction_results[ticker][feature]['test_real'][~np.isnan(final_prediction_results[ticker][feature]['test_real'])]
        accuracies_detailed[ticker][feature] = {
            'mse':{
                'train':mean_squared_error(y_train,yhat_train),
                'validation':mean_squared_error(y_validation,yhat_validation),
                'test':mean_squared_error(y_test,yhat_test),
            },
            'mape':{
                'train':np.mean(np.abs((y_train - yhat_train) / y_train)) * 100,
                'validation':np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100,
                'test':np.mean(np.abs((y_test - yhat_test) / y_test)) * 100,
            }
        }

        if feature == 'Close':
            adj_close_accuracies[ticker] = {
                'mse': mean_squared_error(y_test,yhat_test),
                'mape': np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100
            }

# pd.DataFrame.from_dict(accuracies_detailed[plot_ticker][plot_feature])
df_close_accuracies = pd.DataFrame.from_dict(adj_close_accuracies).T
df_close_accuracies.to_csv(fileDirectory + f"/data/DNN_metrics/{'_'.join(imfs_to_predict_with_neural)}_full_spline_{experiment_time}.csv", sep=',', encoding='utf-8')
df_close_accuracies

Unnamed: 0,mape,mse
MSFT,5.621219,203.82635


In [74]:
adj_close_accuracies = {}
accuracies_detailed = {}

for ticker in final_prediction_results:
    adj_close_accuracies[ticker] = {}
    accuracies_detailed[ticker] = {}
    for feature in final_prediction_results[ticker]:

        y_train = final_prediction_results[ticker][feature]['train_predicted'][~np.isnan(final_prediction_results[ticker][feature]['train_predicted'])]
        yhat_train = final_prediction_results[ticker][feature]['train_real'][~np.isnan(final_prediction_results[ticker][feature]['train_real'])]

        y_validation = final_prediction_results[ticker][feature]['validation_predicted'][~np.isnan(final_prediction_results[ticker][feature]['validation_predicted'])]
        yhat_validation = final_prediction_results[ticker][feature]['validation_real'][~np.isnan(final_prediction_results[ticker][feature]['validation_real'])]

        y_test = final_prediction_results[ticker][feature]['test_predicted'][~np.isnan(final_prediction_results[ticker][feature]['test_predicted'])]
        yhat_test = final_prediction_results[ticker][feature]['test_real'][~np.isnan(final_prediction_results[ticker][feature]['test_real'])]
        accuracies_detailed[ticker][feature] = {
            'mse':{
                'train':mean_squared_error(y_train,yhat_train),
                'validation':mean_squared_error(y_validation,yhat_validation),
                'test':mean_squared_error(y_test,yhat_test),
            },
            'mape':{
                'train':np.mean(np.abs((y_train - yhat_train) / y_train)) * 100,
                'validation':np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100,
                'test':np.mean(np.abs((y_test - yhat_test) / y_test)) * 100,
            }
        }

        if feature == 'Close':
            adj_close_accuracies[ticker] = {
                'mse': mean_squared_error(y_test,yhat_test),
                'mape': np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100
            }

# pd.DataFrame.from_dict(accuracies_detailed[plot_ticker][plot_feature])
df_close_accuracies = pd.DataFrame.from_dict(adj_close_accuracies).T
df_close_accuracies.to_csv(fileDirectory + f"/data/DNN_metrics/{'_'.join(imfs_to_predict_with_neural)}_full_spline_{experiment_time}.csv", sep=',', encoding='utf-8')
df_close_accuracies

Unnamed: 0,mape,mse
MSFT,6.276005,271.254092
