# Testing transformers & Decomposition on 1 Stock

In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import timedelta, datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from PyEMD import CEEMDAN
from sklearn.decomposition import PCA
import plotly.express as px
from tqdm import tqdm
from datetime import datetime
import sys, os

In [2]:
absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

#Path of parent directory (moves outside of repository)
parentDirectory = os.path.dirname(fileDirectory)

In [3]:
train = 0.8
validation = 0.1
test = 0.1

# Part 1

In [5]:
def scale_data(df, scale_type=None):
    # SCALING
    if scale_type=='minmax':
        scaler = MinMaxScaler()
        df = scaler.fit_transform(df.copy().values.reshape(-1, 1))
        scaler_dict = {scale_type: scaler}
    elif scale_type=='standard':
        scaler = StandardScaler()
        df = scaler.fit_transform(df.copy().values.reshape(-1, 1))
        scaler_dict = {scale_type: scaler}
    else:
        scaler_dict = None

    return scaler_dict, df
    
    
def normalize_target(df=pd.DataFrame(), target_name=None, timeframe=-1, normalization_type='simple_return'):
    """
    Normalizes target based on normalzation normilization_type param
    ** DataFrame is shifted using pd.shift(timeframe) to accurately represent target. Note that a shift of -1 will
        shift the data forward by one cell, allowing for prediction of next timeframe price.
    """
    print(
        f'normalization_type: {normalization_type} \ntarget: {target_name} {timeframe *-1} day shifted')



    if normalization_type == 'simple_return':
        return df[target_name].shift(timeframe) / df['Close']

    elif normalization_type == 'return':
        return (df[target_name].shift(timeframe) / df['Close']) - 1

    elif normalization_type == 'log_return':
        return np.log(df[target_name].shift(timeframe) / df['Close'])


    print('WARNING: NO NORMALIZATION USED, target must be predifined')
    return df[target_name]

def unnormalze_target(df=pd.DataFrame(), scaler = dict(), target_name=None, timeframe=-1, normalization_type='simple_return'):
    """
    Un-normalizes target based on normalzation normalization_type param
    """
    if len(scaler.keys()) != 0:
        scaler = scaler[target_name]
        df[target_name] = scaler.inverse_transform(df[target_name].values.reshape(-1,1))

    # Close over close
    if self.original_target_name == 'Close':
        if normalization_type == 'simple_return':
            return df[self.original_target_name] * (df[target_name])

        elif normalization_type == 'return':
            return df[self.original_target_name] * (df[target_name]+1)

        elif normalization_type == 'log_return':
            return np.exp(df[target_name]) * df[self.original_target_name]

    # indicator return from last close value
    if normalization_type == 'simple_return':
        return df['close'] * (df[target_name])

    elif normalization_type == 'return':
        return df['close'] * (df[target_name]+1)

    elif normalization_type == 'log_return':
        return np.exp(df[target_name]) * df['close']

    else:
        print('WARNING: NO NORMILIZATION USED')



def define_target(self, df):
    # Define target as pct change between current close and next day high
    # *** Update 'load_reduced_data()' added columns if changes made ***
#        df['target'] = (df['High'].shift(-1) / df['Close']) - 1
    df['target'] = self.normalize_target(df, target_name='High', normalization_type='log_return')

    return df

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 41)

In [None]:
#ticker_dataframe = pd.read_pickle(parentDirectory+ '/data/raw_data_2021-11-12_12.31.45.pkl')
#coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
#ticker_dataframe['reportperiod'] = pd.to_datetime(ticker_dataframe['reportperiod'])
#ticker_dataframe = ticker_dataframe[(ticker_dataframe.ticker == 'MSFT') & (ticker_dataframe['reportperiod'].dt.year == 2020)][coi]
#ticker_dataframe['Return_High_2_Nxt_Close'] = ticker_dataframe['High'].shift(periods=-1, axis=0) / ticker_dataframe['Close']
#ticker_dataframe['Return_High_2_Nxt_Close'].fillna(0,inplace=True)

In [4]:
def stock_decomp(dfile, tick, annum, minmax= False, standard=False, logged=False, export=False):
    """ 
    PARAMS
    dfile = string of data file path
    tick = string stock ticker name
    annum = int year of interest
    minmax = Boolean to scale or not 
    standard = Boolean to scale or not 
    logged = Boolean to log transform or not
    export = Boolean to export or not 
    
    RETURN:
    dictionary keys ['decomposed_ticker_features_series', 'scalers']
    """
    
    # grab only the data needed
    ticker_dataframe = pd.read_pickle(parentDirectory+ dfile)
    coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
    ticker_dataframe['reportperiod'] = pd.to_datetime(ticker_dataframe['reportperiod'])
    ticker_dataframe = ticker_dataframe[(ticker_dataframe.ticker == tick) & (ticker_dataframe['reportperiod'].dt.year == annum)][coi]
    ticker_dataframe['Return_High_2_Nxt_Close'] = ticker_dataframe['High'].shift(-1) / ticker_dataframe['Close']
    ticker_dataframe['Return_High_2_Nxt_Close'].fillna(0,inplace=True) # cannot have NaN in CEEMDAN
    coi.append('Return_High_2_Nxt_Close')
    
    # decomposing
    decomposed_ticker_features_series = {}
    scalers = {}
    ceemdan = CEEMDAN(parallel=True, processes=10)
    
    for ticker in ticker_dataframe.ticker.unique():
        print(f'[{ticker}] Decomposing...')
        decomposed_ticker_features_series[ticker] = {}
        scalers[ticker] = {}
        
        if logged:
            for c in coi[2:]:
                ticker_dataframe[c] = np.log(ticker_dataframe[c])
         
        for column in coi[2:]:  
            decomposed_ticker_features_series[ticker][column] ={}      
            try:
                
                series = ticker_dataframe[column].values.reshape(-1,1)

                
                # decompose
                ticker_feature_time_series = np.frombuffer(series)
                ticker_feature_time_series_imfs = ceemdan(ticker_feature_time_series, max_imf=10)
                #print(ticker_feature_time_series_imfs)
                #print(f'[{ticker}][{column}] Decomposition finished; restructuring')
                
                # iterating every IMF 
                for i, imf_series in enumerate(ticker_feature_time_series_imfs):
                    
                    if i < len(ticker_feature_time_series_imfs)-1: # last one is residual
                        decomposed_ticker_features_series[ticker][column][f'IMF{i+1}'] = imf_series
                    else:
                        decomposed_ticker_features_series[ticker][column][f'Rsd'] = imf_series
                print(f'Finished Decomposing [{ticker}][{column}]')
            except:
                print(f'ERROR ticker[{ticker}][{column}]')
                decomposed_ticker_features_series[ticker][column] = 'ERROR'
                
            finally:
                continue
    


    for ticker in decomposed_ticker_features_series.keys():

            scalers[ticker] = {}
            
            for column in decomposed_ticker_features_series[ticker].keys():
                for imf in decomposed_ticker_features_series[ticker][column]:
                    #print(f'{column} {imf} {t[ticker][column][imf].shape[0]}')  
                    cut = round(train*decomposed_ticker_features_series[ticker][column][imf].shape[0])
                    train_series = decomposed_ticker_features_series[ticker][column][imf][:cut].reshape(-1,1)
                
                if standard:
                    scaler = StandardScaler()
                    scaler.fit(train_series)
                    scalers[ticker][column] = scaler
                    decomposed_ticker_features_series[ticker][column][imf] = scaler.transform(decomposed_ticker_features_series[ticker][column][imf].reshape(-1,1)).flatten()
                
                if minmax:
                    scaler = MinMaxScaler()
                    scaler.fit(train_series)
                    scalers[ticker][column] = scaler
                    decomposed_ticker_features_series[ticker][column][imf] = scaler.transform(decomposed_ticker_features_series[ticker][column][imf].reshape(-1,1)).flatten()
    
    tmp_dict = {}
    tmp_dict['decomposed_ticker_features_series'] = decomposed_ticker_features_series
    tmp_dict['scalers'] = scalers
    tmp_dict['dates'] = ticker_dataframe.reportperiod.dt.strftime("%m/%d/%Y").values
            
    if export:
        
        # get date stamp
        dateTimeObj = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
        
        if standard:
            s = 'stdscl'
        if minmax:
            s = 'minmaxscl'
        else:
            s = 'no_scale'
        if logged:
            l = 'logged'
        else:
            l = 'orig'
            
            
        # save file
        with open(parentDirectory +  f'/data/DNN_testing_outputs/{tick}__{annum}_decomposed_ticker_features_series_{l}_{s}_{dateTimeObj}.pkl', 'wb') as f:
            pickle.dump(tmp_dict, f)
        with open(parentDirectory + f'/data/DNN_testing_outputs/{tick}__{annum}_scalers_{l}_{s}_{dateTimeObj}.pkl', 'wb') as f:
            pickle.dump(scalers, f)
        print('Export Locations:')
        print(f'/data/DNN_testing_outputs/{tick}__{annum}_decomposed_ticker_features_series_{l}_{s}_{dateTimeObj}.pkl')
        print(f'/data/DNN_testing_outputs/{tick}__{annum}_scalers_{l}_{s}_{dateTimeObj}.pkl')
    
            
    return tmp_dict
    

In [None]:
# let's test normalizing before and after
# Let's test removing close and also changing high and low to relative deltas
# relative deltas being low to next day high or high to next day high spread
# arguably given a big enough data we should be able to catch the relative ranges 
# and therefore mitigate leakage and afford training on un seen

In [9]:
test_dict = stock_decomp(dfile= '/data/raw_data_2021-11-12_12.31.45.pkl', tick='MSFT', minmax= True, annum=2020, export=True)

[MSFT] Decomposing...
Finished Decomposing [MSFT][Open]
Finished Decomposing [MSFT][High]
Finished Decomposing [MSFT][Low]
Finished Decomposing [MSFT][Close]
Finished Decomposing [MSFT][Volume]
Finished Decomposing [MSFT][Return_High_2_Nxt_Close]
Export Locations:
/data/DNN_testing_outputs/MSFT__2020_decomposed_ticker_features_series_orig_minmaxscl_2021-11-30_20.46.45.pkl
/data/DNN_testing_outputs/MSFT__2020_scalers_orig_minmaxscl_2021-11-30_20.46.45.pkl


In [17]:
# Printing structure
tick = list(test_dict['decomposed_ticker_features_series'].keys())[0]
print(f"TICKER: {tick} {test_dict['decomposed_ticker_features_series'][tick].keys()}")
for k in test_dict['decomposed_ticker_features_series'][tick].keys():
    print(f"FEATURE: {k} \n{test_dict['decomposed_ticker_features_series'][tick][k].keys()}")
    

TICKER: MSFT dict_keys(['Open', 'High', 'Low', 'Close', 'Volume', 'Return_High_2_Nxt_Close'])
FEATURE: Open 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: High 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'Rsd'])
FEATURE: Low 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: Close 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: Volume 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])
FEATURE: Return_High_2_Nxt_Close 
dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'Rsd'])


# Part 2
### Restart notebook for Tensor

In [1]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np
import pickle

from scipy.interpolate import CubicSpline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error 
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, LSTM, LeakyReLU
from tensorflow.keras.callbacks import ModelCheckpoint
from tqdm import tqdm
import sys, os

import plotly.express as px
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

experiment_time = datetime.now().strftime("%H_%M_%S_%m_%d_%Y")

features_in_order = ['Open', 'High', 'Low', 'Volume', 'Close', 'Return_High_2_Nxt_Close'] # target feature must be the last one here
target_feature = 'Return_High_2_Nxt_Close'

train = 0.8
validation = 0.1
test = 0.1

absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

#Path of parent directory (moves outside of repository)
parentDirectory = os.path.dirname(fileDirectory)

class ManyToOneTimeSeriesGenerator(TimeseriesGenerator):
  def __getitem__(self, idx):
    x, y = super().__getitem__(idx)
    last_element_index = y.shape[1]-1
    return x, y[:,last_element_index].reshape(1,-1)

class SplineModel():
    def __init__(self,time_series_generator):
        self.name = "SplineModel"
        self.gen = time_series_generator
    
    def predict(self, x_window, verbose=0):
        result = []
        x_window = np.squeeze(x_window, axis=0)
        last_element_index = x_window.shape[1]-1
        series = x_window[:,last_element_index].reshape(-1)
        cs = CubicSpline(np.arange(len(series)), series)
        next_value = cs(len(series)+1)
        result += [next_value]

        return np.array(result).reshape(1,-1) # 1,-1

In [2]:
with open(parentDirectory +  f'/data/DNN_testing_outputs/MSFT__2020_decomposed_ticker_features_series_orig_minmaxscl_2021-11-30_20.46.45.pkl', 'rb') as f:
    test_dict = pickle.load(f)

In [27]:
for s in test_dict['decomposed_ticker_features_series']['MSFT']['Open']:
    print(test_dict['decomposed_ticker_features_series']['MSFT']['Open'][s].shape)

(253,)
(253,)
(253,)
(253,)
(253,)
(1, 253)


In [3]:
# Data organization
max_window_size = 10
windows_sizes_for_imf_level = {
    'IMF1': 2,
    'IMF2': 2,
    'IMF3': 3,
    'IMF4': 3,
    'IMF5': 4,
    'IMF6': 4,
    'IMF7': 5,
    'IMF8': 5,
    'Rsd': 6,
    'DEFAULT': 4
}
target_feature_max_imf_level = {}

# Coupling together the IMFs of the same level for different features to create exogenous input
# The number of imfs for each feature decomposition may differ, thus some of the last imfs may not match in number of features
series = {}
for ticker in test_dict['decomposed_ticker_features_series']:
    
    series[ticker] = {}
    target_feature_max_imf_level[ticker] ={}


    for feature in test_dict['decomposed_ticker_features_series'][ticker]:
        
        imfs = pd.DataFrame.from_dict(test_dict['decomposed_ticker_features_series'][ticker][feature])
        
        for imf in imfs:
            if imf not in series[ticker]:
                series[ticker][imf] = []
            _series = imfs[imf].values
            _series = _series.reshape((len(_series),1)) # reshaping to get into column format
            series[ticker][imf] += [_series]
            if feature == target_feature:
                target_feature_max_imf_level[ticker] = imf


dataset = {}
# # horizontal stack
for ticker in series:
    dataset[ticker] = {}
    for imf_level in series[ticker]:
        dataset[ticker][imf_level] = np.hstack(tuple(series[ticker][imf_level]))




In [4]:
train = 0.8
validation = 0.1
test = 0.1

train_dataset = {}
train_dataset['dates'] = test_dict['dates'][:round(train*dataset[ticker][imf_level].shape[0])]
validation_dataset = {}
validation_dataset['dates'] = test_dict['dates'][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0])]
test_dataset = {}
test_dataset['dates'] =  test_dict['dates'][round((train+validation)*dataset[ticker][imf_level].shape[0]):]

for ticker in dataset:
    
    train_dataset[ticker] = {}
    validation_dataset[ticker] = {}
    test_dataset[ticker] = {}
    
    for imf_level in dataset[ticker]:
                
        # splitting data sets according to rates
        train_dataset[ticker][imf_level] = dataset[ticker][imf_level][:round(train*dataset[ticker][imf_level].shape[0]),:]
        validation_dataset[ticker][imf_level] = dataset[ticker][imf_level][round(train*dataset[ticker][imf_level].shape[0]):round((train+validation)*dataset[ticker][imf_level].shape[0]),:]
        test_dataset[ticker][imf_level] = dataset[ticker][imf_level][round((train+validation)*dataset[ticker][imf_level].shape[0]):,:]

In [5]:
def generator(dataset):
    # data set split rates
    # create generators
    # NOTE STANDARD SCALER was FIT on .75 split so leakage if TRAIN VALIDATE GOES PAST


    train_generators = {}
    validation_generators = {}
    test_generators = {}

    for ticker in dataset:

        train_generators[ticker] = {}
        validation_generators[ticker] = {}
        test_generators[ticker] = {}
        

        for imf_level in dataset[ticker]:

            if imf_level in windows_sizes_for_imf_level:
                window_size = windows_sizes_for_imf_level[imf_level]
            else: 
                window_size = windows_sizes_for_imf_level['DEFAULT']
            # windowing
            train_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(train_dataset[ticker][imf_level], train_dataset[ticker][imf_level], length=window_size, batch_size=1)
            validation_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(validation_dataset[ticker][imf_level], validation_dataset[ticker][imf_level], length=window_size, batch_size=1)
            test_generators[ticker][imf_level] = ManyToOneTimeSeriesGenerator(test_dataset[ticker][imf_level], test_dataset[ticker][imf_level], length=window_size, batch_size=1)
    
    return train_generators,validation_generators,test_generators

In [6]:
train_generators,validation_generators,test_generators = generator(dataset)

In [7]:
# Model Training

models = {}

model_epochs = {
    'IMF1': 2500,
    'IMF2': 2000,
    'IMF3': 1500,
    'IMF4': 1500,
    'IMF5': 1500,
    'IMF6': 1200,
    'IMF7': 1200,
    'IMF8': 1000,
    'Rsd': 1000,
    'DEFAULT': 1000
}

imfs_to_predict_with_neural = ['IMF1','IMF2'] # set to ['IMF1'] , ['IMF1', 'IMF2'], ['IMF1','IMF2','IMF3'] and so on



for ticker in train_generators:
    models[ticker] = {}

    reached_max_imf_of_target_feature = False
    for imf_level in train_generators[ticker]:
        if imf_level in imfs_to_predict_with_neural:
            print(f'Training model [{ticker}][{imf_level}]')
            if reached_max_imf_of_target_feature is True:
                break # no need to predict further if target feature doesn't contain greater IMF levels

            if target_feature_max_imf_level[ticker] == imf_level:
                reached_max_imf_of_target_feature = True
            # Prediction model
            model = Sequential()
            current_dataset = train_dataset[ticker][imf_level]
            n_features = current_dataset.shape[1]
            cur_tmp_gen = train_generators[ticker][imf_level]
            cur_tmp_val_gen = validation_generators[ticker][imf_level]

            if imf_level in windows_sizes_for_imf_level:
                window_size = windows_sizes_for_imf_level[imf_level]
            else: 
                window_size = windows_sizes_for_imf_level['DEFAULT']

            model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(window_size, n_features)))
            model.add(LSTM(64, activation='tanh', input_shape=(window_size, 128)))
            model.add(Dense(16))
            model.add(LeakyReLU())
            model.add(Dense(4))
            model.add(LeakyReLU())
            model.add(Dense(1)) # 1 target feature only
            model.compile(optimizer='adam', loss='mse')

            number_of_epochs = model_epochs[imf_level]
            checkpoint_path = fileDirectory + "/data/DNN_tmp/" +f"{ticker}_" +f"ltsm_spline.h5"
            
            callbacks = [ ModelCheckpoint(checkpoint_path, monitor='loss', mode="max", verbose=0,save_best_only=True, save_weights_only=False, save_freq=250)]

            # fit model
            #model.fit_generator(cur_tmp_gen, steps_per_epoch=1, epochs=number_of_epochs, verbose=0)
            model.fit(cur_tmp_gen, validation_data=cur_tmp_val_gen, steps_per_epoch=10, epochs=number_of_epochs, verbose=0, callbacks=callbacks)

            models[ticker][imf_level] = model
        else:
            # Spline prediction model
            cur_tmp_gen = train_generators[ticker][imf_level]
            model = SplineModel(cur_tmp_gen)
            models[ticker][imf_level] = model
            


Training model [MSFT][IMF1]
Training model [MSFT][IMF2]


In [8]:
# reset generators
train_generators,validation_generators,test_generators = generator(dataset)

In [9]:
# predicting

results = {}

for ticker in models:
    results[ticker] = {}

    # initializing results dictionary
    for feature in features_in_order:
        if feature != target_feature:
            continue
        results[ticker][feature] = {}
        for imf_level in models[ticker]:
            results[ticker][feature][imf_level] = {
                'real_train': [],
                'predicted_train': [],
                'real_validation': [],
                'predicted_validation': [],
                'real_test': [],
                'predicted_test': [],
                'x_axis_train': [],
                'x_axis_validation': [],
                'x_axis_test': []
            }

    for imf_level in models[ticker]:
        model = models[ticker][imf_level]
        
        print(f'Predicting: [{ticker}][{imf_level}]')

        cur_train_gen = train_generators[ticker][imf_level]
        cur_validation_gen = validation_generators[ticker][imf_level]
        cur_test_gen = test_generators[ticker][imf_level]

        # predicting train
        day_counter = 0
        for i in range(len(cur_train_gen)):
            x, y = cur_train_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_train'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_train'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_train'] += [day_counter]
            day_counter += 1

        # predicting validation
        for i in range(len(cur_validation_gen)):
            x, y = cur_validation_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_validation'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_validation'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_validation'] += [day_counter]
            day_counter += 1

        # predicting test
        for i in range(len(cur_test_gen)):
            x, y = cur_test_gen[i]
            yhat = model.predict(x, verbose=0)

            for j in range(yhat.shape[1]):
                results[ticker][target_feature][imf_level]['real_test'] += [y[:,j][0]]
                results[ticker][target_feature][imf_level]['predicted_test'] += [yhat[:,j][0]]
                results[ticker][target_feature][imf_level]['x_axis_test'] += [day_counter]
            day_counter += 1


Predicting: [MSFT][IMF1]
Predicting: [MSFT][IMF2]
Predicting: [MSFT][IMF3]
Predicting: [MSFT][IMF4]
Predicting: [MSFT][IMF5]
Predicting: [MSFT][Rsd]


In [53]:
len(results['MSFT']['Return_High_2_Nxt_Close']['IMF1']['real_test'])

23

In [45]:
results['MSFT']['Return_High_2_Nxt_Close']['IMF1']['predicted_test']

[0.0032440978,
 0.0023520226,
 0.0026988569,
 0.002554127,
 0.0023518926,
 0.002698889,
 0.0026988569,
 0.002554127,
 0.0023520226,
 0.002554127,
 0.0023520226,
 0.002554127,
 0.0032440978,
 0.0023520226,
 0.002554127,
 0.0023520226,
 0.002554127,
 0.0032440978,
 0.0023520226,
 0.002554127,
 0.0023520226,
 0.002554127,
 0.0023520226]

In [10]:
# organizing imf prediction results, concatenating train, validation and test
concatenated_results = {}

for ticker in results:
    concatenated_results[ticker] = {}
    for feature in results[ticker]:
        concatenated_results[ticker][feature] = {}
        for imf_level in results[ticker][feature]:
            
            df_result = pd.DataFrame.from_dict(results[ticker][feature][imf_level], orient='index').T
            df_train = df_result[['real_train','predicted_train','x_axis_train']].set_index('x_axis_train').dropna(axis=0)
            df_train.index.name = 'x'
            df_validation = df_result[['real_validation','predicted_validation','x_axis_validation']].set_index('x_axis_validation').dropna(axis=0)
            df_validation.index.name = 'x'
            df_test = df_result[['real_test','predicted_test','x_axis_test']].set_index('x_axis_test').dropna(axis=0)
            df_test.index.name = 'x'

            df_concatenated = pd.concat([df_train,df_validation,df_test], axis=1)

            concatenated_results[ticker][feature][imf_level] = df_concatenated

In [51]:
# why are 7 days missing should be 253
concatenated_results['MSFT']['Return_High_2_Nxt_Close']['IMF1'].shape

(247, 6)

In [11]:
# iplot layout
space =  {
            'legend' : {'bgcolor':'#1A1A1C','font':{'color':'#D9D9D9',"size":12}},
            'paper_bgcolor' : '#1A1A1C',
            'plot_bgcolor' : '#1A1A1C',
            "title" : {"font":{"color":"#D9D9D9"},"x":0.5},
            'yaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'xaxis' : {
                'tickfont' : {'color':'#C2C2C2', "size":12},
                'gridcolor' : '#434343',
                'titlefont' : {'color':'#D9D9D9'},
                'zerolinecolor' : '#666570',
                'showgrid' : True
            },
            'titlefont' : {'color':'#D9D9D9'}
        }

In [31]:
concatenated_results['MSFT'].keys()

dict_keys(['Return_High_2_Nxt_Close'])

In [37]:
# plotting partial result
plot_ticker = 'MSFT'
plot_feature = target_feature
plot_imf = 'IMF1'

concatenated_results[plot_ticker][plot_feature][plot_imf].iplot(title=f'{plot_ticker} {plot_feature} {plot_imf}', asFigure=True, layout=space)

In [13]:
# recomposing prediction by arithmetically adding the IMF curves

final_prediction_results = {}
max_window_size = 10

for ticker in concatenated_results:
    final_prediction_results[ticker] = {}
    for feature in concatenated_results[ticker]:
        addition_train = None
        addition_validation = None
        addition_test = None

        addition_real_train = None
        addition_real_validation = None
        addition_real_test = None

        # recomposing predictions
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_test is None:
                addition_test = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_test'].values
                cur_length = addition_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_test = addition_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_test = np.add(addition_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_train is None:
                addition_train = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_train'].values
                cur_length = addition_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_train = addition_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_train = np.add(addition_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_validation is None:
                addition_validation = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['predicted_validation'].values
                cur_length = addition_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_validation = addition_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_validation = np.add(addition_validation,np_array_to_be_added)

        # recomposing real
        for imf_level in concatenated_results[ticker][feature]:
            # adding test
            can_sum = True
            if addition_real_test is None:
                addition_real_test = concatenated_results[ticker][feature][imf_level]['real_test'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_test'].values
                cur_length = addition_real_test.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_test = addition_real_test[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_test = np.add(addition_real_test,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding train
            can_sum = True
            if addition_real_train is None:
                addition_real_train = concatenated_results[ticker][feature][imf_level]['real_train'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_train'].values
                cur_length = addition_real_train.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_train = addition_real_train[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_train = np.add(addition_real_train,np_array_to_be_added)

        for imf_level in concatenated_results[ticker][feature]:
            # adding validation
            can_sum = True
            if addition_real_validation is None:
                addition_real_validation = concatenated_results[ticker][feature][imf_level]['real_validation'].values
            else:
                np_array_to_be_added = concatenated_results[ticker][feature][imf_level]['real_validation'].values
                cur_length = addition_real_validation.shape[0]
                next_np_array_length = np_array_to_be_added.shape[0]
                if cur_length < next_np_array_length:
                    if next_np_array_length-cur_length < max_window_size:
                        np_array_to_be_added = np_array_to_be_added[next_np_array_length-cur_length:]
                    else:
                        can_sum = False
                else: 
                    if cur_length-next_np_array_length < max_window_size:
                        addition_real_validation = addition_real_validation[cur_length-next_np_array_length:]
                    else:
                        can_sum = False
                
                if can_sum:
                    addition_real_validation = np.add(addition_real_validation,np_array_to_be_added)
        
        scaler = test_dict['scalers'][ticker][feature]

        final_prediction_results[ticker][feature] = {
            'train_predicted': scaler.inverse_transform(addition_train.reshape(-1,1)).reshape(-1),
            'validation_predicted': scaler.inverse_transform(addition_validation.reshape(-1,1)).reshape(-1),
            'test_predicted': scaler.inverse_transform(addition_test.reshape(-1,1)).reshape(-1),
            'train_real': scaler.inverse_transform(addition_real_train.reshape(-1,1)).reshape(-1),
            'validation_real': scaler.inverse_transform(addition_real_validation.reshape(-1,1)).reshape(-1),
            'test_real': scaler.inverse_transform(addition_real_test.reshape(-1,1)).reshape(-1)
        }


In [20]:
test_dict['decomposed_ticker_features_series']['MSFT'].keys()

dict_keys(['Open', 'High', 'Low', 'Close', 'Volume', 'Return_High_2_Nxt_Close'])

In [32]:
# plotting final result
plot_ticker = 'MSFT'
plot_feature = target_feature

pd.DataFrame.from_dict(final_prediction_results[plot_ticker][plot_feature]).iplot(title=f'{plot_ticker} {plot_feature} {plot_imf}', layout=space)

In [22]:
# Verifying that 'Real' are as expected
ticker_dataframe = pd.read_pickle(fileDirectory+ '/data/raw_data_2021-11-12_12.31.45.pkl')
coi = ['ticker','reportperiod','Open','High','Low','Close','Volume']
ticker_dataframe['reportperiod'] = pd.to_datetime(ticker_dataframe['reportperiod'])
ticker_dataframe = ticker_dataframe[(ticker_dataframe.ticker == 'MSFT') & (ticker_dataframe['reportperiod'].dt.year == 2020)][coi]

ticker_dataframe

FileNotFoundError: [Errno 2] No such file or directory: '/Users/chris/Documents/GitHub/financial_forecasting_analysis/notebooks/data/raw_data_2021-11-12_12.31.45.pkl'

-0.7180880432082801

In [50]:
#final_prediction_results['MSFT']['Close']['validation_predicted']
final_prediction_results['MSFT']['Close']['train_real']

array([147.33417189, 139.59049457, 138.78819044, 139.88446581,
       142.13751499, 139.63054283, 142.60981662, 155.39884389,
       149.48959583, 142.94261885, 137.51601923, 143.8912391 ,
       152.60690829, 143.86908474, 155.8648157 , 176.36733194,
       157.55463302, 150.20754299, 157.90837233, 161.25952279,
       147.34013652, 154.37317031, 157.19139898, 178.26919808,
       170.84748927, 156.52372516, 141.73837143, 147.38980123,
       150.07425168, 158.43070384, 172.68520516, 196.71233315,
       196.42286581, 181.97725275, 226.97834652, 231.6499939 ,
       200.02294844, 200.80954976, 174.19681382, 171.76579944,
       202.20223092, 199.27858647, 193.11322433, 182.17871131,
       227.04103604, 226.43361719, 220.56453912, 212.23097168,
       212.88026469, 218.15044482, 216.86439671, 209.69344598,
       214.00441559, 205.63140853, 192.15608318, 182.9951358 ,
       190.75889811, 208.41774468, 184.12427752, 173.97320091,
       163.76345177, 195.25233683, 189.9660887 , 172.37

In [24]:
adj_close_accuracies = {}
accuracies_detailed = {}

for ticker in final_prediction_results:
    adj_close_accuracies[ticker] = {}
    accuracies_detailed[ticker] = {}
    for feature in final_prediction_results[ticker]:

        y_train = final_prediction_results[ticker][feature]['train_predicted'][~np.isnan(final_prediction_results[ticker][feature]['train_predicted'])]
        yhat_train = final_prediction_results[ticker][feature]['train_real'][~np.isnan(final_prediction_results[ticker][feature]['train_real'])]

        y_validation = final_prediction_results[ticker][feature]['validation_predicted'][~np.isnan(final_prediction_results[ticker][feature]['validation_predicted'])]
        yhat_validation = final_prediction_results[ticker][feature]['validation_real'][~np.isnan(final_prediction_results[ticker][feature]['validation_real'])]

        y_test = final_prediction_results[ticker][feature]['test_predicted'][~np.isnan(final_prediction_results[ticker][feature]['test_predicted'])]
        yhat_test = final_prediction_results[ticker][feature]['test_real'][~np.isnan(final_prediction_results[ticker][feature]['test_real'])]
        accuracies_detailed[ticker][feature] = {
            'mse':{
                'train':mean_squared_error(y_train,yhat_train),
                'validation':mean_squared_error(y_validation,yhat_validation),
                'test':mean_squared_error(y_test,yhat_test),
            },
            'mape':{
                'train':np.mean(np.abs((y_train - yhat_train) / y_train)) * 100,
                'validation':np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100,
                'test':np.mean(np.abs((y_test - yhat_test) / y_test)) * 100,
            }
        }

        if feature == 'Close':
            adj_close_accuracies[ticker] = {
                'mse': mean_squared_error(y_test,yhat_test),
                'mape': np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100
            }

# pd.DataFrame.from_dict(accuracies_detailed[plot_ticker][plot_feature])
df_close_accuracies = pd.DataFrame.from_dict(adj_close_accuracies).T
df_close_accuracies.to_csv(parentDirectory + f"/data/DNN_metrics/{'_'.join(imfs_to_predict_with_neural)}_full_spline_{experiment_time}.csv", sep=',', encoding='utf-8')
df_close_accuracies

MSFT


In [74]:
adj_close_accuracies = {}
accuracies_detailed = {}

for ticker in final_prediction_results:
    adj_close_accuracies[ticker] = {}
    accuracies_detailed[ticker] = {}
    for feature in final_prediction_results[ticker]:

        y_train = final_prediction_results[ticker][feature]['train_predicted'][~np.isnan(final_prediction_results[ticker][feature]['train_predicted'])]
        yhat_train = final_prediction_results[ticker][feature]['train_real'][~np.isnan(final_prediction_results[ticker][feature]['train_real'])]

        y_validation = final_prediction_results[ticker][feature]['validation_predicted'][~np.isnan(final_prediction_results[ticker][feature]['validation_predicted'])]
        yhat_validation = final_prediction_results[ticker][feature]['validation_real'][~np.isnan(final_prediction_results[ticker][feature]['validation_real'])]

        y_test = final_prediction_results[ticker][feature]['test_predicted'][~np.isnan(final_prediction_results[ticker][feature]['test_predicted'])]
        yhat_test = final_prediction_results[ticker][feature]['test_real'][~np.isnan(final_prediction_results[ticker][feature]['test_real'])]
        accuracies_detailed[ticker][feature] = {
            'mse':{
                'train':mean_squared_error(y_train,yhat_train),
                'validation':mean_squared_error(y_validation,yhat_validation),
                'test':mean_squared_error(y_test,yhat_test),
            },
            'mape':{
                'train':np.mean(np.abs((y_train - yhat_train) / y_train)) * 100,
                'validation':np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100,
                'test':np.mean(np.abs((y_test - yhat_test) / y_test)) * 100,
            }
        }

        if feature == 'Close':
            adj_close_accuracies[ticker] = {
                'mse': mean_squared_error(y_test,yhat_test),
                'mape': np.mean(np.abs((y_validation - yhat_validation) / y_validation)) * 100
            }

# pd.DataFrame.from_dict(accuracies_detailed[plot_ticker][plot_feature])
df_close_accuracies = pd.DataFrame.from_dict(adj_close_accuracies).T
df_close_accuracies.to_csv(fileDirectory + f"/data/DNN_metrics/{'_'.join(imfs_to_predict_with_neural)}_full_spline_{experiment_time}.csv", sep=',', encoding='utf-8')
df_close_accuracies

Unnamed: 0,mape,mse
MSFT,6.276005,271.254092
