In [58]:
# 4. Project prototype (implementation)
## Install Dependencies and import libraries

# pip install pandas numpy yfinance pandas-ta scikit-learn tensorflow

# https://pypi.org/project/yfinance/ (""" it's an open-source tool that uses Yahoo's publicly available APIs, and is intended for research and educational purposes. """)
# import yfinance, our data source
import yfinance as yf

# https://pypi.org/project/pandas-ta/ ("""An easy to use Python 3 Pandas Extension with 130+ Technical Analysis Indicators. Can be called from a Pandas DataFrame or standalone""")
# import pandas-ta
import pandas_ta as ta

# import pandas and numpy
import pandas as pd 
import numpy as np

# import matplotlib for data visualisation
import matplotlib.pyplot as plt

# import from scikit-learn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

# import from tensorflow
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import SimpleRNN, Dense, LSTM, Input, GRU
from tensorflow.keras.utils import to_categorical

# Load Data

In [4]:
# insert the stock symbols into a list
symbols_list = ['PFE', 'ROP', 'XYL', 'CPAY', 'INCY']

# define a function to load the data from source (yfinance API), and save it as a csv to local storage
def loadData(symbols=symbols_list, period='10y', interval='1wk'):
    """Load price history for `symbols` and return one DataFrame per symbol.

    The data is cached on disk as ``{interval}_stocks_data.csv`` in the working
    directory. If that file exists it is read directly; otherwise the data is
    downloaded with ``yf.download``, reshaped to a (Date, Ticker) multi-index,
    its column names lower-cased, and written to the CSV.

    Note: the cache file name depends only on `interval`. An existing CSV is
    reused even if it was created with a different `symbols` or `period`; a
    symbol missing from the cached file makes ``df.xs`` raise ``KeyError``.

    Args:
        symbols: iterable of ticker symbols to extract.
        period: look-back period passed to ``yf.download``.
        interval: bar interval passed to ``yf.download``; also part of the
            cache file name.

    Returns:
        dict mapping each symbol to its DataFrame (the 'Ticker' index level is
        dropped, leaving 'Date' as the index).
    """
    csv_path = f'{interval}_stocks_data.csv'

    try:
        # load the dataframe from the csv file if it already exists
        df = pd.read_csv(csv_path).set_index(['Date', 'Ticker'])

        print("Data loaded from directory")

    except FileNotFoundError:
        # the data is not cached yet and needs to be downloaded from yfinance
        print(f"There is no {interval}_stocks_data.csv. Data will be downloaded from yfinance.")

        # download the data from source as a pandas dataframe
        stocks_data = yf.download(symbols, period=period, interval=interval)

        # reshape the dataframe as a multi-level index dataframe
        stocks_data = stocks_data.stack()

        # source: https://www.statology.org/pandas-change-column-names-to-lowercase/
        # convert column names to lowercase
        stocks_data.columns = stocks_data.columns.str.lower()

        # save to csv so reloading the notebook does not hit the API again
        stocks_data.to_csv(csv_path, index=True)

        # re-read the file so the fresh and the cached path go through the same
        # CSV parsing and therefore produce the same frame
        df = pd.read_csv(csv_path).set_index(['Date', 'Ticker'])

    # The split per symbol used to live in a `finally:` block that ended with
    # `return`. That swallowed any in-flight exception (failed download, CSV
    # without the expected columns, ...) and replaced it with a NameError on
    # `df`. Running it after the try/except lets the real error propagate.
    # source of inspiration https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.xs.html [11]
    return {
        symbol: df.xs(symbol, axis=0, level='Ticker', drop_level=True)
        for symbol in symbols
    }


dfs = loadData()

There is no 1wk_stocks_data.csv. Data will be downloaded from yfinance.


[*********************100%%**********************]  5 of 5 completed


In [13]:
dfs[symbols_list[0]]

Unnamed: 0_level_0,adj close,close,high,low,open,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-08-11,18.445234,27.172676,27.419355,26.726755,26.802656,62383098
2014-08-18,18.625566,27.438330,27.542694,27.220114,27.277040,102843207
2014-08-25,18.928263,27.884251,28.149904,27.428843,27.447819,99802628
2014-09-01,19.095709,28.130930,28.140417,27.666035,27.713472,86197384
2014-09-08,18.954029,27.922201,28.130930,27.523720,28.064516,97524618
...,...,...,...,...,...,...
2024-07-15,29.552921,29.969999,30.690001,28.830000,29.030001,180142400
2024-07-22,30.341789,30.770000,30.930000,29.309999,30.110001,179544200
2024-07-29,30.430000,30.430000,31.540001,29.780001,30.690001,254667700
2024-08-05,28.549999,28.549999,30.049999,28.450001,29.090000,157625900


# Add Targets

In [6]:
# create a function that takes a dataframe and create 'next_close' column based on its 'close' column
def get_next_close(_df):
    """Return the 'close' column moved up by one row.

    Row i of the result holds the close of row i + 1, so the last row is NaN
    because it has no successor.
    """
    closing_prices = _df.loc[:, 'close']
    next_closing_prices = closing_prices.shift(periods=-1)
    return next_closing_prices

# create a function that returns 1 if the the next closing price is higher than current closing price and 0 otherwise.
def assign_trend(row):
    """Label a row 1 if the next close is higher than the current close, else 0.

    Args:
        row: mapping / Series with 'close' and 'next_close' entries.

    Returns:
        1 when ``next_close > close``, 0 when ``next_close <= close``, and NaN
        only when either price is missing (e.g. the last row, which has no
        next close).
    """
    current_close, next_close = row['close'], row['next_close']

    # Only a genuinely missing price yields NaN. Previously an unchanged price
    # (next_close == close) also fell through to NaN, so flat periods were
    # dropped later on and left gaps inside the time sequences.
    if pd.isna(current_close) or pd.isna(next_close):
        return np.nan

    return 1 if next_close > current_close else 0

# create a function that add the target columns to the dataframe
def add_targets(_df):
    """Return a copy of `_df` with the 'next_close' and 'trend' target columns.

    'next_close' comes from ``get_next_close`` and 'trend' from applying
    ``assign_trend`` to every row. Rows containing NaN in any column are
    dropped afterwards (this removes at least the last row, which has no next
    close), and 'trend' is cast to int.

    The input frame is left untouched. The previous implementation added the
    columns and dropped rows on the caller's frame in place, yet returned a
    different object (the result of ``astype``), leaving the caller's frame in
    a half-processed state.

    Args:
        _df: DataFrame with at least a 'close' column.

    Returns:
        A new DataFrame with 'next_close' and 'trend' appended as the last two
        columns.
    """
    return (
        _df
        # next closing price, computed from the untouched input
        .assign(next_close=get_next_close)
        # 1/0 label derived row by row from close vs. next_close
        .assign(trend=lambda frame: frame.apply(assign_trend, axis=1))
        # drop rows with missing values (including the unlabeled last row)
        .dropna()
        # the label is NaN-free now, so it can be stored as int
        .astype({'trend': int})
    )

# add target columns to all the data frames
# df = add_targets(dfs[symbols_list[0]])
# df

# Features Selection

In [338]:
# We can easily check the available indicators in the pandas-ta library.
# An empty DataFrame is enough to reach the `.ta` accessor, so this cell does
# not depend on a `df` variable created by another (currently commented-out)
# cell. `indicators()` prints the list itself, so wrapping it in `help(...)`
# is unnecessary.
pd.DataFrame().ta.indicators()

Pandas TA - Technical Analysis Indicators - v0.3.14b0
Total Indicators & Utilities: 205
Abbreviations:
    aberration, above, above_value, accbands, ad, adosc, adx, alma, amat, ao, aobv, apo, aroon, atr, bbands, below, below_value, bias, bop, brar, cci, cdl_pattern, cdl_z, cfo, cg, chop, cksp, cmf, cmo, coppock, cross, cross_value, cti, decay, decreasing, dema, dm, donchian, dpo, ebsw, efi, ema, entropy, eom, er, eri, fisher, fwma, ha, hilo, hl2, hlc3, hma, hwc, hwma, ichimoku, increasing, inertia, jma, kama, kc, kdj, kst, kurtosis, kvo, linreg, log_return, long_run, macd, mad, massi, mcgd, median, mfi, midpoint, midprice, mom, natr, nvi, obv, ohlc4, pdist, percent_return, pgo, ppo, psar, psl, pvi, pvo, pvol, pvr, pvt, pwma, qqe, qstick, quantile, rma, roc, rsi, rsx, rvgi, rvi, short_run, sinwma, skew, slope, sma, smi, squeeze, squeeze_pro, ssf, stc, stdev, stoch, stochrsi, supertrend, swma, t3, td_seq, tema, thermo, tos_stdevall, trima, trix, true_range, tsi, tsignals, ttm_trend, ui, 

In [330]:
# Show the documentation of the Donchian Channels indicator, then compute it
# through the DataFrame `.ta` accessor to inspect the columns it produces.
# NOTE(review): `df` is not assigned by any uncommented line visible above
# (the only assignment is commented out in the "Add Targets" cell), so this
# cell presumably relies on leftover kernel state - confirm it survives
# Restart & Run All.
help(ta.donchian)
df.ta.donchian()

Help on function donchian in module pandas_ta.volatility.donchian:

donchian(high, low, lower_length=None, upper_length=None, offset=None, **kwargs)
    Donchian Channels (DC)
    
    Donchian Channels are used to measure volatility, similar to
    Bollinger Bands and Keltner Channels.
    
    Sources:
        https://www.tradingview.com/wiki/Donchian_Channels_(DC)
    
    Calculation:
        Default Inputs:
            lower_length=upper_length=20
        LOWER = low.rolling(lower_length).min()
        UPPER = high.rolling(upper_length).max()
        MID = 0.5 * (LOWER + UPPER)
    
    Args:
        high (pd.Series): Series of 'high's
        low (pd.Series): Series of 'low's
        lower_length (int): The short period. Default: 20
        upper_length (int): The short period. Default: 20
        offset (int): How many periods to offset the result. Default: 0
    
    Kwargs:
        fillna (value, optional): pd.DataFrame.fillna(value)
        fill_method (value, optional): Type

Unnamed: 0_level_0,DCL_20_20,DCM_20_20,DCU_20_20
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-08-04,,,
2014-08-11,,,
2014-08-18,,,
2014-08-25,,,
2014-09-01,,,
...,...,...,...
2024-07-01,25.200001,27.465000,29.730000
2024-07-08,25.200001,27.465000,29.730000
2024-07-15,25.200001,27.945001,30.690001
2024-07-22,25.200001,28.065001,30.930000


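The function below adds 66 technical-indicator columns (inserted at positions 6 to 71), which together with the 6 price/volume columns gives the 72 features seen in the dataset shapes further down.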

In [7]:
# for the time being let's create a function that add all the technical indicators we want to a df
def add_technical_indicators(_df):
    """Return a copy of `_df` extended with 66 pandas-ta indicator columns.

    The indicators are computed with the library defaults from the 'open',
    'high', 'low', 'close' and 'volume' columns. In the result the columns are
    ordered as: original non-target columns, indicator columns (in the order
    listed below), then the target columns 'next_close' / 'trend' if the input
    has them. Downstream code slices features and targets by position, so the
    targets must stay last.

    Changes compared with the previous implementation:
      * Indicators used to be inserted at hard-coded positions 6..71. That is
        only correct when the frame has exactly six columns ahead of the
        targets; with a different column count the indicators landed after
        'next_close', which silently put the target into the feature slice.
        The position is now derived from the column names.
      * The aroon columns are picked by name (AROONU / AROOND / AROONOSC)
        rather than by position. pandas-ta is believed to return them in
        down/up/osc order (not re-checked against the installed version), in
        which case the positional labelling swapped 'aroon_up' and
        'aroon_down'. Selecting by name is correct under either order.
      * The input frame is no longer modified in place.

    Args:
        _df: DataFrame with 'open', 'high', 'low', 'close' and 'volume'
            columns, optionally followed by 'next_close' and 'trend'.

    Returns:
        A new DataFrame with the indicator columns added and every row that
        contains a NaN removed (the warm-up rows of the indicators).

    Raises:
        ValueError: if an indicator column name already exists in `_df`, or if
            pandas-ta returns no result for an indicator.
    """
    open_, high, low = _df['open'], _df['high'], _df['low']
    close, volume = _df['close'], _df['volume']

    # new column name -> Series, kept in insertion order
    new_cols = {}

    def _store(label, result, *names):
        # Register a pandas-ta result under `names`: a Series fills the single
        # name, a DataFrame fills the names from its leading columns by position.
        if result is None:
            raise ValueError(f"pandas-ta returned no result for '{label}'")
        if isinstance(result, pd.Series):
            new_cols[names[0]] = result
            return
        for position, name in enumerate(names):
            new_cols[name] = result.iloc[:, position]

    def _by_prefix(frame, prefix):
        # Return the first column of `frame` whose name starts with `prefix`.
        matches = [col for col in frame.columns if str(col).startswith(prefix)]
        if not matches:
            raise ValueError(f"no column starting with '{prefix}' in {list(frame.columns)}")
        return frame[matches[0]]

    # macd: Moving Average Convergence/Divergence line, histogram (MACD - signal)
    # and signal (an EMA of MACD); a popular trend indicator
    _store('macd', ta.macd(close), 'macd', 'macd_histogram', 'macd_signal')
    # rsi: Relative Strength Index, a momentum oscillator
    _store('rsi', ta.rsi(close), 'rsi')
    # sma: Simple Moving Average, equally weighted over n periods
    _store('sma', ta.sma(close), 'sma')
    # ema: Exponential Moving Average
    _store('ema', ta.ema(close), 'ema')
    # aberration: a volatility indicator
    _store('aberration', ta.aberration(high, low, close),
           'aberration_zg', 'aberration_sg', 'aberration_xg', 'aberration_atr')
    # bbands: Bollinger Bands, a volatility indicator
    _store('bbands', ta.bbands(close),
           'bbands_lower', 'bbands_mid', 'bbands_upper', 'bbands_bandwidth', 'bbands_percent')
    # adx: Average Directional Movement, quantifies trend strength
    _store('adx', ta.adx(high, low, close), 'adx_adx', 'adx_dmp', 'adx_dmn')
    # atr: Average True Range, measures volatility
    _store('atr', ta.atr(high, low, close), 'atr')
    # stoch: Stochastic Oscillator, a momentum indicator
    _store('stoch', ta.stoch(high, low, close), 'stoch_k', 'stoch_d')
    # obv: On Balance Volume, cumulative buying/selling pressure
    _store('obv', ta.obv(close, volume), 'obv')
    # supertrend: overlap indicator used to identify trend direction
    _store('supertrend', ta.supertrend(high, low, close),
           'supertrend_trend', 'supertrend_direction')
    # dema: Double Exponential Moving Average, smoother with less lag than EMA
    _store('dema', ta.dema(close), 'dema')
    # tema: a less laggy Exponential Moving Average
    _store('tema', ta.tema(close), 'tema')
    # roc: Rate of Change, percent change versus n periods ago
    _store('roc', ta.roc(close), 'roc')
    # mom: Momentum, the change in price
    _store('mom', ta.mom(close), 'mom')
    # cci: Commodity Channel Index, overbought/oversold relative to a mean
    _store('cci', ta.cci(high, low, close), 'cci')

    # aroon: identifies whether a security is trending and how strongly.
    # Columns are selected by name so the labels are right regardless of the
    # order in which the library returns them.
    aroon = ta.aroon(high, low)
    if aroon is None:
        raise ValueError("pandas-ta returned no result for 'aroon'")
    new_cols['aroon_up'] = _by_prefix(aroon, 'AROONU')
    new_cols['aroon_down'] = _by_prefix(aroon, 'AROOND')
    new_cols['aroon_osc'] = _by_prefix(aroon, 'AROONOSC')

    # natr: Normalized Average True Range
    _store('natr', ta.natr(high, low, close), 'natr')
    # willr: William's Percent R, momentum oscillator similar to RSI
    _store('willr', ta.willr(high, low, close), 'willr')
    # vortex: two oscillators capturing positive and negative trend movement
    _store('vortex', ta.vortex(high, low, close), 'vortex_vip', 'vortex_vim')
    # kama: Kaufman's Adaptive Moving Average, adapts to market noise
    _store('kama', ta.kama(close), 'kama')
    # trix: momentum oscillator used to identify divergences
    _store('trix', ta.trix(close), 'trix', 'trixs')
    # hlc3: average of high, low and close
    _store('hlc3', ta.hlc3(high, low, close), 'hlc3')
    # ohlc4: average of open, high, low and close
    _store('ohlc4', ta.ohlc4(open_, high, low, close), 'ohlc4')
    # hma: Hull Moving Average, attempts to reduce lag
    _store('hma', ta.hma(close), 'hma')
    # vwma: Volume Weighted Moving Average
    _store('vwma', ta.vwma(close, volume), 'vwma')
    # accbands: Acceleration Bands, envelope around a simple moving average
    _store('accbands', ta.accbands(high, low, close),
           'accbands_lower', 'accbands_mid', 'accbands_upper')
    # adosc: Accumulation/Distribution Oscillator
    _store('adosc', ta.adosc(high, low, close, volume), 'adosc')
    # alma: Arnaud Legoux Moving Average, Gaussian-weighted to reduce lag and noise
    _store('alma', ta.alma(close), 'alma')
    # apo: Absolute Price Oscillator, difference of two EMAs
    _store('apo', ta.apo(close), 'apo')
    # cfo: Forecast Oscillator, percent difference between price and a
    # linear-regression forecast
    _store('cfo', ta.cfo(close), 'cfo')
    # cg: Center of Gravity, attempts to identify turning points
    _store('cg', ta.cg(close), 'cg')
    # chop: Choppiness Index, sideways (near 100) versus trending (near 0)
    _store('chop', ta.chop(high, low, close), 'chop')
    # cmf: Chaikin Money Flow
    _store('cmf', ta.cmf(high, low, close, volume), 'cmf')
    # cmo: Chande Momentum Oscillator
    _store('cmo', ta.cmo(close), 'cmo')
    # coppock: Coppock Curve, a momentum indicator designed for monthly data
    _store('coppock', ta.coppock(close), 'coppock')
    # cti: Correlation Trend Indicator, ranges from -1 to 1
    _store('cti', ta.cti(close), 'cti')
    # decay: decay moving forward from prior signals (linear by default)
    _store('decay', ta.decay(close), 'decay')
    # decreasing: flags whether the series is decreasing over a period
    _store('decreasing', ta.decreasing(close), 'decreasing')
    # dm: Directional Movement, yields the +DM and -DM series
    _store('dm', ta.dm(high, low), 'dm_positive', 'dm_negative')
    # donchian: Donchian Channels, a volatility measure
    _store('donchian', ta.donchian(high, low),
           'donchian_lower', 'donchian_mid', 'donchian_upper')
    # ebsw: Even Better SineWave, a bounded cycle indicator between -1 and 1
    _store('ebsw', ta.ebsw(close), 'ebsw')
    # efi: Elder's Force Index, power behind a price movement
    _store('efi', ta.efi(close, volume), 'efi')
    # entropy: measures the unpredictability of the data
    _store('entropy', ta.entropy(close), 'entropy')

    #### more technical indicators can be added with the same `_store` call ####

    # DataFrame.insert used to reject an existing column name; keep failing
    # loudly instead of silently producing duplicate columns
    clashes = [name for name in new_cols if name in _df.columns]
    if clashes:
        raise ValueError(f"cannot insert {clashes}, already exists")

    # keep the targets (when present) as the trailing columns
    target_cols = [name for name in ('next_close', 'trend') if name in _df.columns]
    base_cols = [name for name in _df.columns if name not in target_cols]

    combined = pd.concat(
        [_df[base_cols], pd.DataFrame(new_cols, index=_df.index), _df[target_cols]],
        axis=1,
    )

    # remove the NaN rows (indicator warm-up periods) and return the new dataframe
    return combined.dropna()

# call the function on the selected dataframe
# full_df = add_technical_indicators(df.copy(deep=True))
# full_df

# Prepare the data for training

In [51]:
# make a deep copy of the data
# full_df = add_technical_indicators(df.copy(deep=True))


# create a function to apply a given scaler to the features
def apply_scaler(scaler, features, train_fraction=0.7):
    """Scale `features` with statistics learned from the leading rows only.

    The first ``int(len(features) * train_fraction)`` rows are used to fit the
    scaler; the remaining rows are transformed with that fitted scaler. The
    later rows therefore never influence the scaling parameters, which avoids
    leaking information from the validation/test period into training.

    Args:
        scaler: object exposing ``fit_transform`` and ``transform``
            (e.g. a scikit-learn scaler). It is (re)fitted by this call.
        features: 2-D array-like / DataFrame of features in chronological
            order, sliceable by row.
        train_fraction: share of the leading rows used to fit the scaler.
            Defaults to 0.7, the value that was previously hard-coded.

    Returns:
        numpy array with the scaled rows in their original order.

    Raises:
        ValueError: if `train_fraction` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction}")

    # index of the first row that is NOT used for fitting
    split_at = int(len(features) * train_fraction)

    # divide the feature set into the fitting part and the remainder
    X_train, X_test = features[:split_at], features[split_at:]

    # fit on the training rows only, then reuse those parameters for the rest
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # stitch the two scaled parts back together in the original row order
    return np.concatenate((X_train_scaled, X_test_scaled), axis=0)

# source of isnpiration: https://stackoverflow.com/questions/47945512/how-to-reshape-input-for-keras-lstm?rq=4 [13]
# create a function to reshape X and y into sequences of x timesteps
def create_seqs(features, target, num_rows):
    """Reshape row-wise data into overlapping sequences of `num_rows` timesteps.

    Sequence i consists of rows ``i .. i + num_rows - 1`` of `features`, and its
    label is the target value at row ``i + num_rows``, i.e. the row right after
    the window.

    Both inputs are converted to numpy arrays first so rows are always
    addressed by position. Previously ``target[seq_e]`` was a label lookup
    when `target` was a pandas Series, which returned the wrong rows (or raised
    ``KeyError``) for any index other than 0..n-1.

    Args:
        features: array-like of shape (n_rows, n_features).
        target: array-like with at least n_rows entries, aligned with `features`.
        num_rows: number of timesteps per sequence.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: X has shape
        (n_rows - num_rows, num_rows, n_features) and y has
        n_rows - num_rows entries. When there are not enough rows for a single
        sequence, both arrays are empty but keep their trailing dimensions.

    Raises:
        ValueError: if `target` has fewer entries than `features` has rows.
    """
    features = np.asarray(features)
    target = np.asarray(target)

    if len(target) < len(features):
        raise ValueError(
            f"target has {len(target)} entries but features has {len(features)} rows"
        )

    # number of complete windows that still have a following row to label them
    n_seqs = max(len(features) - num_rows, 0)

    if n_seqs == 0:
        # keep the dimensionality so downstream shape checks still work
        empty_X = np.empty((0, num_rows) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0,) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    # the i-th sequence is the slice of features starting at row i
    X = np.stack([features[start:start + num_rows] for start in range(n_seqs)])

    # the label of the i-th sequence sits right after its window
    y = target[num_rows:num_rows + n_seqs]

    return X, y


# create a function to convert a dataframe into training and test sets
def create_train_test_sets(_df, scaler, target="classification", timesteps=6):
    """Turn a prepared DataFrame into training, validation and test sequences.

    Expected layout of `_df`: a 'Date' index, feature columns, and the two
    target columns 'next_close' and 'trend' as the LAST two columns (features
    and targets are selected by position).

    Steps: drop the date, scale the features with ``apply_scaler``, build
    sequences of `timesteps` rows with ``create_seqs``, one-hot encode the
    label for classification, then split chronologically.

    Split sizes are computed from the number of ROWS (n), not the number of
    sequences (n - timesteps): the first ``int(0.7 * n)`` sequences are the
    training set, the next ``int(0.2 * n)`` the validation set, and whatever
    is left (roughly 10% of n minus `timesteps`) is the test set. The 0.7 cut
    matches the rows ``apply_scaler`` fits on by default.

    Changes compared with the previous implementation:
      * `_df` is no longer modified in place (the in-place ``reset_index`` /
        ``drop`` made a second call on the same frame fail).
      * ``to_categorical`` is given ``num_classes=2`` so the label matrix
        always has two columns, even if a stock's data contains one class only.

    Args:
        _df: prepared DataFrame as described above.
        scaler: scaler object forwarded to ``apply_scaler``.
        target: "classification" uses the last column ('trend'); any other
            value uses the second-to-last column ('next_close') for regression.
        timesteps: number of rows per input sequence.

    Returns:
        Tuple ``(X_train, X_vald, X_test, y_train, y_vald, y_test)``.
    """
    # work on a copy with a positional index and without the date column
    frame = _df.reset_index().drop(columns=['Date'])

    # every column except the two trailing target columns is a feature
    features = frame.iloc[:, :-2]

    # pick the target column by position
    if target == "classification":
        # trend is the target for classification
        labels = frame.iloc[:, -1]
    else:
        # next_close is the target for regression
        labels = frame.iloc[:, -2]

    # scale the features (scaler is fitted on the leading rows only)
    scaled = apply_scaler(scaler, features)

    # create sequences
    X_seq, y_seq = create_seqs(scaled, labels, timesteps)

    # source of inspiration: https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical [14]
    # one-hot encode the trend so the model can output a confidence per class
    if target == "classification":
        y_seq = to_categorical(y_seq, num_classes=2)

    # chronological 70 / 20 / remainder split, sized from the row count
    train_end = int(len(scaled) * 0.7)
    vald_end = train_end + int(len(scaled) * 0.2)

    X_train, X_vald, X_test = X_seq[:train_end], X_seq[train_end:vald_end], X_seq[vald_end:]
    y_train, y_vald, y_test = y_seq[:train_end], y_seq[train_end:vald_end], y_seq[vald_end:]

    return X_train, X_vald, X_test, y_train, y_vald, y_test



# Candidate feature scalers. Each one is (re)fitted inside apply_scaler on the
# training rows of whichever dataset it is applied to.

# initialize a MinMaxScaler instance for a range between 0 and 1
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

# initialize a StandardScaler instance
standard_scaler = StandardScaler()

# initialize a RobustScaler instance
robust_scaler = RobustScaler()

# Example call (create_train_test_sets returns six arrays: train / validation / test for X and y):
# X_train, X_vald, X_test, y_train, y_vald, y_test = create_train_test_sets(full_df, robust_scaler, "classification", 6)

# X_train.shape, X_vald.shape, X_test.shape

# Apply the helper functions on all the dataframes

In [29]:
# create a function that takes a dict of dataframes, and return a dict of training, validation and testing datasets
def prepare_data_to_train(dfs_dict, scaler, target, timesteps):
    """Build the train / validation / test sets for every stock in `dfs_dict`.

    Each stock's frame is deep-copied, extended with the target columns and
    the technical indicators, and then converted into sequence datasets.

    Returns:
        dict keyed by symbol; each value is a dict with the keys 'X_train',
        'X_vald', 'X_test', 'y_train', 'y_vald' and 'y_test'.
    """
    # names of the arrays, in the order create_train_test_sets returns them
    set_names = ('X_train', 'X_vald', 'X_test', 'y_train', 'y_vald', 'y_test')

    def _build_sets(raw_df):
        # targets first, then indicators, on a private copy of the raw data
        enriched = add_technical_indicators(add_targets(raw_df.copy(deep=True)))
        splits = create_train_test_sets(enriched, scaler, target, timesteps)
        return dict(zip(set_names, splits))

    return {symbol: _build_sets(dfs_dict[symbol]) for symbol in dfs_dict.keys()}

# Build one complete set of per-stock datasets for every candidate sequence
# length, so models with different timesteps can be compared later without
# re-running the preprocessing.

# set a list of options for the timesteps (4 to 10 inclusive)
timesteps_options = list(range(4, 11))

# create a dict to store the different timesteps datasets (key: timesteps)
timesteps_options_dict = {}

# iterate over the different timesteps
for timesteps in timesteps_options:
    
    # call prepare_data_to_train on each option 
    # (classification target, features scaled with the StandardScaler instance)
    data_set = prepare_data_to_train(dfs, standard_scaler, "classification", timesteps)
    
    
    
    timesteps_options_dict[timesteps] = data_set

# timesteps_options_dict

In [50]:
print(timesteps_options_dict[10]['PFE']['X_train'].shape)
print(timesteps_options_dict[10]['PFE']['X_vald'].shape)
print(timesteps_options_dict[10]['PFE']['X_test'].shape)

(335, 10, 72)
(95, 10, 72)
(39, 10, 72)


In [57]:
## Create and train the baseline classification model

# source of inspiration: François Chollet (11, 2017), “Deep Learning with Python” chapter 6 [8]
# construct the model
def create_model(_timesteps, X_train_shape):
    """Build and compile the baseline two-layer SimpleRNN classifier.

    Args:
        _timesteps: number of timesteps in each input sequence.
        X_train_shape: shape tuple of the training inputs; only its third
            entry (the number of features) is used.

    Returns:
        A compiled Sequential model with a 2-unit softmax output.
    """
    n_features = X_train_shape[2]

    # input -> two stacked recurrent layers -> one probability per class
    layer_stack = [
        Input(shape=(_timesteps, n_features)),
        SimpleRNN(64, return_sequences=True),
        SimpleRNN(64, return_sequences=False),
        Dense(2, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    # compile the model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['precision', 'accuracy', 'recall'],
    )

    return model

# pick the PFE datasets built with 5 timesteps and unpack them for the model
pfe_sets = timesteps_options_dict[5]['PFE']
X_train, X_vald, X_test = (pfe_sets[key] for key in ('X_train', 'X_vald', 'X_test'))
y_train, y_vald, y_test = (pfe_sets[key] for key in ('y_train', 'y_vald', 'y_test'))

# initialize the model
model1 = create_model(5, X_train.shape)

# train the model, monitoring the validation set after every epoch
history = model1.fit(
    X_train,
    y_train,
    validation_data=(X_vald, y_vald),
    epochs=20,
    batch_size=32,
    verbose=0,
)


## Model evaluation and prototype conclusion

# measure the model on the held-out test set
model1.evaluate(X_test, y_test, verbose=2)

# list the model architecture
model1.summary()

2/2 - 0s - 15ms/step - loss: 1.4369 - precision: 0.4545


# create models archive

In [61]:
# create a function that takes a model creation function, dictionary of datasets as inputs, 
# and train the given model on these datasets one by one, then save the models to a dictionary where keys are stock symbols, 
# and values are dictionaries containing the models and meta data about the models
def models_archive(_create_model, _dataset_dict, _timesteps, model_name,
                   epochs=20, batch_size=32, models_dir='models'):
    """Train one model per stock, save it to disk and collect the results.

    Args:
        _create_model: callable ``(timesteps, X_train_shape) -> compiled model``.
        _dataset_dict: dict keyed by timesteps whose values are dicts keyed by
            symbol, each holding 'X_train', 'X_vald', 'X_test', 'y_train',
            'y_vald' and 'y_test'.
        _timesteps: which entry of `_dataset_dict` to train on.
        model_name: prefix of the saved files
            (``{models_dir}/{model_name}_{symbol}.keras``).
        epochs: number of training epochs (default 20, the previous fixed value).
        batch_size: training batch size (default 32, the previous fixed value).
        models_dir: directory the models are saved in (default 'models', the
            previous fixed value). It is created if it does not exist.

    Returns:
        dict keyed by symbol; each value holds
          'model'      - the model reloaded from the saved file,
          'evaluation' - the result of ``model.evaluate`` on the test set,
          'history'    - the per-epoch metrics recorded during training.
    """
    # local import so the function does not depend on a module-level `import os`
    import os

    # saving into a directory that does not exist fails, so create it up front
    os.makedirs(models_dir, exist_ok=True)

    # create the models archive dictionary
    archive = {}

    # get the datasets associated with the given timesteps
    dataset_timestep = _dataset_dict[_timesteps]

    # iterate over the symbols in the dictionary
    for symbol, sets in dataset_timestep.items():

        # setup the data to be passed to the model
        X_train, y_train = sets['X_train'], sets['y_train']
        X_vald, y_vald = sets['X_vald'], sets['y_vald']
        X_test, y_test = sets['X_test'], sets['y_test']

        # initialize the model
        model = _create_model(_timesteps, X_train.shape)

        # train the model
        history = model.fit(
            X_train, y_train,
            validation_data=(X_vald, y_vald),
            epochs=epochs, batch_size=batch_size, verbose=0,
        )

        # source of inspiration: https://www.tensorflow.org/tutorials/keras/save_and_load
        # save model to device
        model_path = f'{models_dir}/{model_name}_{symbol}.keras'
        model.save(model_path)

        archive[symbol] = {
            # store the model as reloaded from disk
            'model': load_model(model_path),
            # evaluate the trained model on the test set
            'evaluation': model.evaluate(X_test, y_test, verbose=2),
            # keep the training curves instead of discarding them
            'history': history.history,
        }

    return archive

    
stocks_model_archive = models_archive(create_model, timesteps_options_dict, 6, "baseline_SimpleRNN")

2/2 - 0s - 13ms/step - loss: 0.9822 - precision: 0.5349
2/2 - 1s - 261ms/step - loss: 1.0006 - precision: 0.5581
2/2 - 0s - 183ms/step - loss: 1.0127 - precision: 0.5349
2/2 - 1s - 265ms/step - loss: 1.3909 - precision: 0.3953
2/2 - 1s - 291ms/step - loss: 0.9874 - precision: 0.4884


# create model evaluation function 

In [64]:
# create a function that takes a model archive (a dict of models) and calculate the average precision for all the models in it
def evaluate_models_archive(_models_archive):
    """Return the mean precision over all the models in an archive.

    The precision of a model is read from position 1 of its 'evaluation'
    entry (position 0 is the loss).
    """
    # precision of every model, one value per symbol
    precisions = (entry['evaluation'][1] for entry in _models_archive.values())

    # average over the number of symbols in the archive
    return sum(precisions) / len(_models_archive)
        
evaluate_models_archive(stocks_model_archive)

0.5023255944252014

# Hyperparameters optimization