In [29]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np
import pickle

from scipy.interpolate import CubicSpline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error 
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, LSTM, LeakyReLU
from tensorflow.keras.callbacks import ModelCheckpoint
from tqdm import tqdm
import sys, os

# Timestamp (HH_MM_SS_mm_dd_YYYY) captured when this cell is executed.
experiment_time = datetime.now().strftime("%H_%M_%S_%m_%d_%Y")


# Column order of the feature matrix; the generators and SplineModel read the
# target from the LAST column, so the target feature must stay last.
features_in_order = ['Open', 'High', 'Low', 'Volume', 'Close'] # target feature must be the last one here
target_feature = 'Close'

# abspath('') resolves to the current working directory; data files are read
# from and written to the "data" folder under its parent directory.
absolutepath = os.path.abspath('')
fileDirectory = os.path.dirname(absolutepath)

class ManyToOneTimeSeriesGenerator(TimeseriesGenerator):
  """TimeseriesGenerator whose targets contain only the last column.

  The parent generator yields every column of the target row; this subclass
  keeps just the last one (the target feature) so the batches match a
  single-output model.
  """

  def __getitem__(self, idx):
    """Return ``(x, y)`` for batch ``idx`` with ``y`` shaped ``(batch, 1)``."""
    x, y = super().__getitem__(idx)
    # One target value per sample. reshape(-1, 1) equals the previous
    # reshape(1, -1) when batch_size == 1, and stays (batch, 1) instead of
    # becoming (1, batch) for larger batches.
    return x, y[:, -1].reshape(-1, 1)

In [30]:
experiment_time

'22_00_50_11_19_2021'

In [20]:
np.__version__

'1.19.5'

In [31]:
# helper function to model later IMFs as splines
class SplineModel():
    """Cubic-spline extrapolator exposing a Keras-like ``predict`` method.

    It is stored next to the neural models in the same dictionaries, so it
    mirrors the ``predict(x, verbose=...)`` call signature.
    """

    def __init__(self,time_series_generator):
        self.name = "SplineModel"
        # Kept for reference only; predict() does not read the generator.
        self.gen = time_series_generator

    def predict(self, x_window, verbose=0):
        """Extrapolate the last column of one window by a single time step.

        Parameters
        ----------
        x_window : array-like of shape (1, window_size, n_features)
            A single window; the leading batch axis of size 1 is squeezed away.
        verbose : int
            Unused; accepted for call compatibility with Keras models.

        Returns
        -------
        numpy.ndarray of shape (1, 1)
            The spline value one step after the end of the window.
        """
        window = np.squeeze(x_window, axis=0)
        # The target feature is the last column of the window.
        series = window[:, -1].reshape(-1)
        cs = CubicSpline(np.arange(len(series)), series)
        # Samples sit at x = 0 .. len(series)-1, so the next step is
        # x = len(series). Evaluating at len(series)+1 predicted two steps ahead.
        next_value = cs(len(series))

        return np.array([next_value]).reshape(1,-1)

# Read in Log Transformed and Standard Scaler Transform
## 00_DNN_pre_processing.ipynb

In [32]:
# Pre-processed (log + standard-scaled) decomposition, nested as ticker -> year -> feature -> IMF.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
scaled_series_path = fileDirectory + '/data/std_scale_trans_75_25_split_on_year_decomposed_ticker_features_series_log_2021-11-17_15.44.40.pkl'
with open(scaled_series_path, 'rb') as pickle_file:
    decomposed_ticker_features_series = pickle.load(pickle_file)

In [33]:
# Iterating a mapping yields its keys, so list(mapping) equals list(mapping.keys()).
tickers = list(decomposed_ticker_features_series)
# The year list is taken from the 'MSFT' entry.
years = list(decomposed_ticker_features_series['MSFT'])

In [34]:
# Example of nested structure
decomposed_ticker_features_series['MSFT'][2019]['Open']['IMF1']

array([ 1.13038013e-04,  1.10429028e-03, -1.03732709e-03,  3.46936006e-04,
        5.83459829e-04,  6.62470565e-04, -4.13004284e-04,  7.40215529e-04,
       -1.27108983e-03, -1.29599952e-03,  1.52096065e-03, -1.59589046e-03,
        1.64420873e-03, -3.02100855e-04, -1.61368830e-03, -2.04555935e-04,
        1.15085746e-03,  6.34560561e-04, -4.16424978e-04,  2.41264601e-04,
       -3.42745297e-04,  8.82930959e-04, -1.66253006e-03,  2.12253514e-03,
        2.09916049e-03, -9.82567989e-04, -1.64631056e-03,  1.23345787e-03,
       -9.84824794e-04,  9.59186698e-04, -1.03347013e-03,  1.09025327e-03,
        1.23361530e-04,  8.07760606e-04, -1.68340386e-03,  1.52141277e-03,
        1.68938839e-03, -1.33843530e-03, -1.54047158e-03, -1.15936202e-03,
        3.29083489e-04,  7.73011582e-04,  5.83768969e-06,  8.35053884e-04,
        1.60440994e-03, -1.78162307e-03,  6.02642137e-04,  1.70160447e-03,
        1.37920319e-03, -4.52730803e-04, -1.07939121e-03, -1.14213071e-03,
        1.35218903e-03, -

# Data organization
## Read note about series_cut

In [67]:
# NOTE(review): max_window_size is not referenced anywhere else in the visible notebook.
max_window_size = 10
# Look-back window length per IMF level. It is used as the `length` of the
# time-series generators and as the first dimension of the LSTM input shape.
# 'DEFAULT' is the fallback for levels that are not listed.
windows_sizes_for_imf_level = {
    'IMF1': 4,
    'IMF2': 4,
    'IMF3': 4,
    'IMF4': 4,
    'IMF5': 4,
    'IMF6': 4,
    'IMF7': 4,
    'IMF8': 4,
    'Rsd': 6,
    'DEFAULT': 4
}

# Last IMF level seen for the target feature, per ticker and year.
target_feature_max_imf_level = {}

# Group the IMFs of the same level across features so the non-target features act as exogenous input.
# Features may decompose into different numbers of IMFs, so the last levels can hold fewer columns.
series = {}
for ticker, per_year in decomposed_ticker_features_series.items():

    series[ticker] = {}
    target_feature_max_imf_level[ticker] = {}

    for y, per_feature in per_year.items():
        levels_for_year = series[ticker][y] = {}

        for feature, decomposition in per_feature.items():

            imfs = pd.DataFrame.from_dict(decomposition)

            for imf in imfs:
                # one (n_samples, 1) column per feature, appended in feature order
                _series = imfs[imf].values.reshape(-1, 1)
                levels_for_year.setdefault(imf, []).append(_series)
                if feature == target_feature:
                    # overwritten on each IMF, so the last one iterated wins
                    target_feature_max_imf_level[ticker][y] = imf

# cut_spare_imfs: exogenous features can have more IMFs than the target feature.
# Those spare levels are dropped here; otherwise they would wrongly be added when
# the target feature is recomposed.
# NOTE(review): the level number is parsed with int(name[3:]), which only works for
# 'IMF<n>' keys (a key such as 'Rsd' would raise ValueError) -- confirm against the data.
series_cut = {}
for ticker in series:
    if ticker not in series_cut:
        series_cut[ticker] = {}
    for y in series[ticker]:
        series_cut[ticker][y] = {
            imf_level_string: columns.copy()
            for imf_level_string, columns in series[ticker][y].items()
            if int(imf_level_string[3:]) <= int(target_feature_max_imf_level[ticker][y][3:])
        }
# from here on `series` refers to the trimmed structure
series = series_cut

# Horizontally stack the per-feature columns of each IMF level into one
# (n_samples, n_features) matrix per ticker / year / level.
dataset = {}
for ticker, per_year in series.items():
    dataset[ticker] = {}
    for y, per_level in per_year.items():
        dataset[ticker][y] = {}
        for imf_level, columns in per_level.items():
            dataset[ticker][y][imf_level] = np.hstack(tuple(columns))


In [None]:
series['HD'][2020].keys()

dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'IMF6', 'IMF7'])

In [9]:
# example of structure
dataset['HD'][2008].keys()

dict_keys(['IMF1', 'IMF2', 'IMF3', 'IMF4', 'IMF5', 'IMF6'])

In [68]:
# Data set split rates and windowed generators.
# NOTE: the standard scaler was fit on a 0.75 split, so train + validation must not
# extend past 0.75 of the data or the scaling leaks test information.

train = 0.55
validation = 0.2
test = 0.25

train_dataset = {}
validation_dataset = {}
test_dataset = {}

train_generators = {}
validation_generators = {}
test_generators = {}

_all_stores = (
    train_dataset, validation_dataset, test_dataset,
    train_generators, validation_generators, test_generators,
)

for ticker in dataset:

    for _store in _all_stores:
        _store[ticker] = {}

    for y in [2020,2021]:
        for _store in _all_stores:
            _store[ticker][y] = {}

        for imf_level in dataset[ticker][y]:
            imf_matrix = dataset[ticker][y][imf_level]
            n_rows = imf_matrix.shape[0]

            # row boundaries of the chronological train / validation / test split
            train_end = round(train*n_rows)
            validation_end = round((train+validation)*n_rows)

            train_dataset[ticker][y][imf_level] = imf_matrix[:train_end,:]
            validation_dataset[ticker][y][imf_level] = imf_matrix[train_end:validation_end,:]
            test_dataset[ticker][y][imf_level] = imf_matrix[validation_end:,:]

            # look-back length for this level, falling back to the default
            window_size = windows_sizes_for_imf_level.get(imf_level, windows_sizes_for_imf_level['DEFAULT'])

            # windowing: each split serves as both input and target source
            train_generators[ticker][y][imf_level] = ManyToOneTimeSeriesGenerator(
                train_dataset[ticker][y][imf_level], train_dataset[ticker][y][imf_level],
                length=window_size, batch_size=1)
            validation_generators[ticker][y][imf_level] = ManyToOneTimeSeriesGenerator(
                validation_dataset[ticker][y][imf_level], validation_dataset[ticker][y][imf_level],
                length=window_size, batch_size=1)
            test_generators[ticker][y][imf_level] = ManyToOneTimeSeriesGenerator(
                test_dataset[ticker][y][imf_level], test_dataset[ticker][y][imf_level],
                length=window_size, batch_size=1)


In [46]:
target_feature_max_imf_level['HD'].keys()

dict_keys([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [19]:
# Model Training

models = {}

# Training epochs per IMF level; 'DEFAULT' is the fallback for levels not listed here.
model_epochs = {
    'IMF1': 2500,
    'IMF2': 2000,
    'IMF3': 1500,
    'IMF4': 1500,
    'IMF5': 1500,
    'IMF6': 1200,
    'IMF7': 1200,
    'IMF8': 1000,
    'Rsd': 1000,
    'DEFAULT': 1000
}

imfs_to_predict_with_neural = ['IMF1', 'IMF2'] # set to ['IMF1'] , ['IMF1', 'IMF2'], ['IMF1','IMF2','IMF3'] and so on

# One model per ticker / year / IMF level: an LSTM for the levels listed in
# imfs_to_predict_with_neural, a SplineModel for every other level.
for ticker in train_generators:
    models[ticker] = {}
    for y in train_generators[ticker]:
        models[ticker][y] = {}
        reached_max_imf_of_target_feature = False
        for imf_level in train_generators[ticker][y]:
            if imf_level not in imfs_to_predict_with_neural:
                # Spline prediction model
                cur_tmp_gen = train_generators[ticker][y][imf_level]
                model = SplineModel(cur_tmp_gen)
                models[ticker][y][imf_level] = model
                continue

            if reached_max_imf_of_target_feature:
                break # no need to predict further if target feature doesn't contain greater IMF levels

            # Logged only after the break check, so every printed line is a model that is trained.
            print(f'Training model [{ticker}][{y}][{imf_level}]')

            if target_feature_max_imf_level[ticker][y] == imf_level:
                reached_max_imf_of_target_feature = True

            current_dataset = train_dataset[ticker][y][imf_level]
            n_features = current_dataset.shape[1]
            cur_tmp_gen = train_generators[ticker][y][imf_level]
            cur_tmp_val_gen = validation_generators[ticker][y][imf_level]

            window_size = windows_sizes_for_imf_level.get(imf_level, windows_sizes_for_imf_level['DEFAULT'])

            # Prediction model
            model = Sequential()
            model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(window_size, n_features)))
            model.add(LSTM(64, activation='tanh', input_shape=(window_size, 128)))
            model.add(Dense(16))
            model.add(LeakyReLU())
            model.add(Dense(4))
            model.add(LeakyReLU())
            model.add(Dense(1)) # 1 target feature only
            model.compile(optimizer='adam', loss='mse')

            # Fall back to 'DEFAULT' instead of raising KeyError for unlisted levels.
            number_of_epochs = model_epochs.get(imf_level, model_epochs['DEFAULT'])
            # NOTE(review): the path has no IMF level, so every IMF model of the same
            # ticker/year writes its checkpoint to the same file.
            checkpoint_path = fileDirectory + "/data/DNN_tmp/" +f"{ticker}_{y}_" +f"ltsm_spline.h5"

            # The monitored quantity is a loss, so "best" means lowest: mode must be "min".
            # With mode="max", save_best_only kept the checkpoint with the highest loss.
            callbacks = [ ModelCheckpoint(checkpoint_path, monitor='loss', mode="min", verbose=0,save_best_only=True, save_weights_only=False, save_freq=250)]

            # fit model
            model.fit(cur_tmp_gen, validation_data=cur_tmp_val_gen, steps_per_epoch=10, epochs=number_of_epochs, verbose=0, callbacks=callbacks)

            models[ticker][y][imf_level] = model
                


Training model [HD][2021][IMF1]
Training model [HD][2021][IMF2]
Training model [ADSK][2021][IMF1]
Training model [ADSK][2021][IMF2]
Training model [MTD][2021][IMF1]
Training model [MTD][2021][IMF2]
Training model [WAT][2021][IMF1]
Training model [WAT][2021][IMF2]
Training model [V][2021][IMF1]
Training model [V][2021][IMF2]
Training model [MSFT][2021][IMF1]
Training model [MSFT][2021][IMF2]
Training model [CARR][2021][IMF1]
Training model [CARR][2021][IMF2]
Training model [AMAT][2021][IMF1]
Training model [AMAT][2021][IMF2]
Training model [JNJ][2021][IMF1]
Training model [JNJ][2021][IMF2]
Training model [UNH][2021][IMF1]
Training model [UNH][2021][IMF2]
Training model [XOM][2021][IMF1]
Training model [XOM][2021][IMF2]


In [32]:
# example 
models['HD'][2021]

{'IMF1': <keras.engine.sequential.Sequential at 0x294626460>,
 'IMF2': <keras.engine.sequential.Sequential at 0x282db36d0>,
 'IMF3': <__main__.SplineModel at 0x2903196d0>,
 'IMF4': <__main__.SplineModel at 0x2cd715220>,
 'IMF5': <__main__.SplineModel at 0x2cd715d30>,
 'IMF6': <__main__.SplineModel at 0x2cd715d00>}

In [37]:
# Persist every trained model: Keras models via .save(), spline models via pickle.
for t in models:
    for y in [2021]:
        for IM, trained_model in models[t][y].items():
            # Decide by capability instead of a hard-coded IMF list, so this cell stays
            # correct when imfs_to_predict_with_neural changes (SplineModel has no .save()).
            if hasattr(trained_model, 'save'):
                trained_model.save(os.path.join(fileDirectory+f"/data/tf_LSTM_{t}_{y}_{IM}network.h5"))
            else:
                with open(fileDirectory+f'/data/LSTM_{t}_{y}_{IM}_spline.pkl', 'wb') as fi:
                    pickle.dump(trained_model, fi)

In [42]:
# Export the split data sets and generators, one pickle file per entry.
exports = dict(
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    test_dataset=test_dataset,
    train_generators=train_generators,
    validation_generators=validation_generators,
    test_generators=test_generators,
)

for k in exports:
    v = exports[k]
    with open(fileDirectory+f'/data/LTSM_1st_export_all_tickers_2021_{k}.pkl', 'wb') as fi:
        pickle.dump(v, fi)

# Repeating the Process

In [53]:
def model_trainer(imfs_to_predict=['IMF1','IMF2','IMF3','IMF4','IMF5','IMF6','IMF7','IMF8','RSD'],note='full', spe=1, year=2020):
    """Train one model per ticker and IMF level for a single year.

    IMF levels listed in ``imfs_to_predict`` get an LSTM network; every other
    level gets a ``SplineModel``. Reads the notebook-level ``train_generators``,
    ``validation_generators``, ``train_dataset``, ``target_feature_max_imf_level``,
    ``windows_sizes_for_imf_level`` and ``fileDirectory``.

    Parameters
    ----------
    imfs_to_predict : list of str
        IMF level names to model with the neural network. The default list is
        never mutated here.
        NOTE(review): the default contains 'RSD' while the other tables in this
        notebook use the key 'Rsd' -- confirm which spelling the data uses.
    note : str
        Suffix used in the checkpoint file name.
    spe : int
        ``steps_per_epoch`` passed to ``model.fit``.
    year : int
        Only this year's generators are trained on.

    Returns
    -------
    dict
        ``models_full[ticker][year][imf_level]`` -> trained model. Tickers with
        no generators for ``year`` map to an empty dict.
    """
    models_full = {}

    # Training epochs per IMF level; 'DEFAULT' is the fallback for unlisted levels.
    model_epochs = {
        'IMF1': 2500,
        'IMF2': 2000,
        'IMF3': 1500,
        'IMF4': 1500,
        'IMF5': 1500,
        'IMF6': 1200,
        'IMF7': 1200,
        'IMF8': 1000,
        'Rsd': 1000,
        'DEFAULT': 1000
    }

    imfs_to_predict_with_neural = imfs_to_predict # set to ['IMF1'] , ['IMF1', 'IMF2'], ['IMF1','IMF2','IMF3'] and so on

    for ticker in train_generators:
        models_full[ticker] = {}
        for y in train_generators[ticker]:
            if y != year:
                continue

            models_full[ticker][y] = {}
            reached_max_imf_of_target_feature = False
            for imf_level in train_generators[ticker][y]:
                if imf_level not in imfs_to_predict_with_neural:
                    # Spline prediction model
                    cur_tmp_gen = train_generators[ticker][y][imf_level]
                    models_full[ticker][y][imf_level] = SplineModel(cur_tmp_gen)
                    continue

                if reached_max_imf_of_target_feature:
                    break # no need to predict further if target feature doesn't contain greater IMF levels

                # Logged only after the break check, so every printed line is a model that is trained.
                print(f'Training model [{ticker}][{y}][{imf_level}]')

                if target_feature_max_imf_level[ticker][y] == imf_level:
                    reached_max_imf_of_target_feature = True

                current_dataset = train_dataset[ticker][y][imf_level]
                n_features = current_dataset.shape[1]
                cur_tmp_gen = train_generators[ticker][y][imf_level]
                cur_tmp_val_gen = validation_generators[ticker][y][imf_level]

                window_size = windows_sizes_for_imf_level.get(imf_level, windows_sizes_for_imf_level['DEFAULT'])

                # Prediction model
                model = Sequential()
                model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(window_size, n_features)))
                model.add(LSTM(64, activation='tanh', input_shape=(window_size, 128)))
                model.add(Dense(16))
                model.add(LeakyReLU())
                model.add(Dense(4))
                model.add(LeakyReLU())
                model.add(Dense(1)) # 1 target feature only
                model.compile(optimizer='adam', loss='mse')

                # Fall back to 'DEFAULT' instead of raising KeyError for unlisted levels.
                number_of_epochs = model_epochs.get(imf_level, model_epochs['DEFAULT'])
                # NOTE(review): the path has no IMF level, so every IMF model of the same
                # ticker/year writes its checkpoint to the same file.
                checkpoint_path = fileDirectory + "/data/DNN_tmp/" +f"{ticker}_{y}_" +f"ltsm_{note}.h5"

                # The monitored quantity is a loss, so "best" means lowest: mode must be "min".
                # With mode="max", save_best_only kept the checkpoint with the highest loss.
                callbacks = [ ModelCheckpoint(checkpoint_path, monitor='loss', mode="min", verbose=0,save_best_only=True, save_weights_only=False, save_freq=250)]

                # fit model
                model.fit(cur_tmp_gen, validation_data=cur_tmp_val_gen, steps_per_epoch=spe, epochs=number_of_epochs, verbose=0, callbacks=callbacks)

                models_full[ticker][y][imf_level] = model

    return models_full
        

In [59]:
models_full = model_trainer()

Training model [HD][2020][IMF1]
Training model [HD][2020][IMF2]
Training model [HD][2020][IMF3]
Training model [HD][2020][IMF4]
Training model [HD][2020][IMF5]
Training model [HD][2020][IMF6]
Training model [ADSK][2020][IMF1]
Training model [ADSK][2020][IMF2]
Training model [ADSK][2020][IMF3]
Training model [ADSK][2020][IMF4]
Training model [ADSK][2020][IMF5]
Training model [ADSK][2020][IMF6]
Training model [MTD][2020][IMF1]
Training model [MTD][2020][IMF2]
Training model [MTD][2020][IMF3]
Training model [MTD][2020][IMF4]
Training model [MTD][2020][IMF5]
Training model [WAT][2020][IMF1]
Training model [WAT][2020][IMF2]
Training model [WAT][2020][IMF3]
Training model [WAT][2020][IMF4]
Training model [WAT][2020][IMF5]
Training model [WAT][2020][IMF6]
Training model [V][2020][IMF1]
Training model [V][2020][IMF2]
Training model [V][2020][IMF3]
Training model [V][2020][IMF4]
Training model [V][2020][IMF5]
Training model [V][2020][IMF6]
Training model [MSFT][2020][IMF1]
Training model [MSFT]

In [37]:
# Inline version of the full training run: an LSTM for every IMF level listed in
# imfs_to_predict_with_neural, a SplineModel for the rest, restricted to `year`.
models_full = {}
note='full' 
spe=1 
year=2020

# Training epochs per IMF level; 'DEFAULT' is the fallback for unlisted levels.
model_epochs = {
    'IMF1': 2500,
    'IMF2': 2000,
    'IMF3': 1500,
    'IMF4': 1500,
    'IMF5': 1500,
    'IMF6': 1200,
    'IMF7': 1200,
    'IMF8': 1000,
    'Rsd': 1000,
    'DEFAULT': 1000
}

# NOTE(review): this list uses 'RSD' while the tables above use the key 'Rsd' --
# confirm which spelling the data uses.
imfs_to_predict_with_neural = ['IMF1','IMF2','IMF3','IMF4','IMF5','IMF6','IMF7','IMF8','RSD'] # set to ['IMF1'] , ['IMF1', 'IMF2'], ['IMF1','IMF2','IMF3'] and so on

for ticker in train_generators:
    models_full[ticker] = {}
    for y in train_generators[ticker]:
        if y != year:
            continue

        models_full[ticker][y] = {}
        reached_max_imf_of_target_feature = False
        for imf_level in train_generators[ticker][y]:
            if imf_level not in imfs_to_predict_with_neural:
                # Spline prediction model
                cur_tmp_gen = train_generators[ticker][y][imf_level]
                model = SplineModel(cur_tmp_gen)
                models_full[ticker][y][imf_level] = model
                continue

            if reached_max_imf_of_target_feature:
                break # no need to predict further if target feature doesn't contain greater IMF levels

            # Logged only after the break check, so every printed line is a model that is trained.
            print(f'Training model [{ticker}][{y}][{imf_level}]')

            if target_feature_max_imf_level[ticker][y] == imf_level:
                reached_max_imf_of_target_feature = True

            current_dataset = train_dataset[ticker][y][imf_level]
            n_features = current_dataset.shape[1]
            cur_tmp_gen = train_generators[ticker][y][imf_level]
            cur_tmp_val_gen = validation_generators[ticker][y][imf_level]

            window_size = windows_sizes_for_imf_level.get(imf_level, windows_sizes_for_imf_level['DEFAULT'])

            # Prediction model
            model = Sequential()
            model.add(LSTM(128, activation='tanh', return_sequences=True, input_shape=(window_size, n_features)))
            model.add(LSTM(64, activation='tanh', input_shape=(window_size, 128)))
            model.add(Dense(16))
            model.add(LeakyReLU())
            model.add(Dense(4))
            model.add(LeakyReLU())
            model.add(Dense(1)) # 1 target feature only
            model.compile(optimizer='adam', loss='mse')

            # Fall back to 'DEFAULT' instead of raising KeyError for unlisted levels.
            number_of_epochs = model_epochs.get(imf_level, model_epochs['DEFAULT'])
            # NOTE(review): the path has no IMF level, so every IMF model of the same
            # ticker/year writes its checkpoint to the same file.
            checkpoint_path = fileDirectory + "/data/DNN_tmp/" +f"{ticker}_{y}_" +f"ltsm_{note}.h5"

            # The monitored quantity is a loss, so "best" means lowest: mode must be "min".
            # With mode="max", save_best_only kept the checkpoint with the highest loss.
            callbacks = [ ModelCheckpoint(checkpoint_path, monitor='loss', mode="min", verbose=0,save_best_only=True, save_weights_only=False, save_freq=250)]

            # fit model
            model.fit(cur_tmp_gen, validation_data=cur_tmp_val_gen, steps_per_epoch=spe, epochs=number_of_epochs, verbose=0, callbacks=callbacks)

            models_full[ticker][y][imf_level] = model

Training model [HD][2020][IMF1]
Training model [HD][2020][IMF2]
Training model [HD][2020][IMF3]
Training model [HD][2020][IMF4]
Training model [HD][2020][IMF5]
Training model [HD][2020][IMF6]
Training model [ADSK][2020][IMF1]
Training model [ADSK][2020][IMF2]
Training model [ADSK][2020][IMF3]
Training model [ADSK][2020][IMF4]
Training model [ADSK][2020][IMF5]
Training model [ADSK][2020][IMF6]
Training model [MTD][2020][IMF1]
Training model [MTD][2020][IMF2]
Training model [MTD][2020][IMF3]
Training model [MTD][2020][IMF4]
Training model [MTD][2020][IMF5]
Training model [WAT][2020][IMF1]
Training model [WAT][2020][IMF2]
Training model [WAT][2020][IMF3]
Training model [WAT][2020][IMF4]
Training model [WAT][2020][IMF5]
Training model [WAT][2020][IMF6]
Training model [V][2020][IMF1]
Training model [V][2020][IMF2]
Training model [V][2020][IMF3]
Training model [V][2020][IMF4]
Training model [V][2020][IMF5]
Training model [V][2020][IMF6]
Training model [MSFT][2020][IMF1]
Training model [MSFT]

In [38]:
models_full.keys()

dict_keys(['HD', 'ADSK', 'MTD', 'WAT', 'V', 'WFC', 'MSFT', 'CARR', 'AMAT', 'JNJ', 'UNH', 'XOM'])

In [40]:
# Save every model of the full run to its own .h5 file.
# NOTE(review): .save() is called on every entry, so this assumes none of them is a SplineModel.
for t in models_full:
    for y in [2020]:
        for IM, trained_model in models_full[t][2020].items():
            target_path = fileDirectory+f"/data/tf_LSTM_full_{t}_{y}_{IM}.h5"
            trained_model.save(target_path)


### Notably, training a DNN for IMFs 1 and 2 took only ~280 M for 11 stocks, whereas training 5-6 IMFs for 11 stocks took ~765 M.

In [69]:
# The generators must be reset (re-created) before running this cell.
# Export the split data sets and generators, one pickle file per entry.
exports = dict(
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    test_dataset=test_dataset,
    train_generators=train_generators,
    validation_generators=validation_generators,
    test_generators=test_generators,
)

for k in exports:
    v = exports[k]
    with open(fileDirectory+f'/data/LSTM_full_2nd_export_2020_models_{k}.pkl', 'wb') as fi:
        pickle.dump(v, fi)

# Neither of the first two models did particularly well; the spline variant was the better of the two. Next, we will try a version without the log transform and see how it performs with the spline approach.