# LSTM

Fits a baseline (univariate) LSTM and a multivariate LSTM to daily stock prices. The notebook was run on Google Colab for GPU access.

### Set up

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from numpy import array
import numpy as np
import datetime as dt
from keras.models import Sequential
from keras.layers import LSTM, Dense, Reshape

# Colab-only: mount Google Drive at ./drive so result CSVs can be written under 'drive/My Drive/'
from google.colab import drive
drive.mount('drive')

Mounted at drive


### About LSTM

[Tutorial](https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/)

### Dataset

* 01/01/2018 - 07/01/2023
* Train: 2018-2021
* Test: 2022-July 2023 (tech recession!)
* Companies: Amazon, Apple, Google, Microsoft, Nvidia

In [None]:
# # to get this file, first run the notebook: Retrieve entire stock price data.ipynb
# stocks = pd.read_csv('quandl_data_table_downloads/QUOTEMEDIA/PRICES_20230712.zip')

# company_tickers = ['AMZN', 'AAPL', 'GOOG', 'MSFT', 'NVDA']
# start_date = pd.to_datetime('2017-12-01')
# end_date = pd.to_datetime('2023-07-01')

# stocks = stocks.loc[stocks['ticker'].isin(company_tickers)]
# stocks = stocks[['date', 'ticker', 'adj_close']]
# stocks['date'] = pd.to_datetime(stocks['date'])
# stocks = stocks.loc[(stocks['date'] >= pd.to_datetime(start_date))
#                       & (stocks['date'] <= pd.to_datetime(end_date))]
# stocks = stocks.sort_values('date')

# stocks.to_csv('stocks_filtered.csv', index=False)

# # # this will be needed later to merge with sentiment analysis dataset
# # stocks = stocks.set_index('date').tz_localize('utc')

In [None]:
# stocks.shape # there are missing dates

In [None]:
# 1 - (7015/(365*5))  / 5.5 # percent of missing dates

### Directly read data

In [3]:
# Load the pre-filtered price table and index it by parsed date
stocks = (
    pd.read_csv('stocks_filtered.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

In [None]:
# Preview the first rows of the price table
stocks.head()

Unnamed: 0,date,ticker,adj_close
0,2017-12-01,GOOG,50.5085
1,2017-12-01,AMZN,57.977862
2,2017-12-01,AAPL,40.542754
3,2017-12-01,NVDA,48.95794
4,2017-12-01,MSFT,79.231484


# Baseline LSTM

No sentiment analysis; only one company

In [136]:
def split_data(df, n_steps, count_imputations=False,
               start_date='2018-01-10', end_date='2023-07-01'):
    """
    build supervised-learning samples from one company's daily prices

    for every date in [start_date, end_date] that has a price in df, the
    target is that day's adj_close and the input is the adj_close of each of
    the n_steps calendar days immediately before it. calendar days without a
    price row take the price of the most recent earlier date that has one.

    df: DataFrame for ONE ticker, with a unique datetime index holding
        calendar dates and an 'adj_close' column
    n_steps: size of the look-back window, in calendar days
    count_imputations: if True, also return the number of imputed days in
        each sample's window
    start_date, end_date: bounds for the target dates, both inclusive

    returns (X, y), or (X, y, imputations) when count_imputations is True:
    1. X, shape (n_samples, n_steps), oldest day first
    2. y, shape (n_samples,)
    3. imputations, shape (n_samples,)

    raises ValueError if a look-back window starts before the first price in
    df, because there is no earlier price to impute from
    """

    start_dt = pd.to_datetime(start_date)
    end_dt = pd.to_datetime(end_date)
    delta = pd.Timedelta(days=n_steps)

    # sorted so that reindex(method='ffill') below is well defined
    prices = df['adj_close'].sort_index()
    y_dates = df.index[(df.index >= start_dt) &
                       (df.index <= end_dt)]

    X = []
    y = []
    imputations = []

    if len(y_dates) > 0:

        first_window_start = y_dates.min() - delta
        if first_window_start < prices.index.min():
            raise ValueError(
                "look-back window starts before the first available price; "
                "use a later start_date or a smaller n_steps")

        # one daily series for the whole period: each calendar day carries the
        # price of the latest row at or before it. filling is by index label,
        # so it only fills days that have no row at all.
        calendar = pd.date_range(start=first_window_start, end=y_dates.max())
        daily_prices = prices.reindex(calendar, method='ffill')
        was_imputed = pd.Series(~calendar.isin(prices.index), index=calendar)

        for y_date in y_dates:

            window = pd.date_range(start=y_date - delta,
                                   end=y_date,
                                   inclusive="left") # exclude y_date

            X.append(daily_prices.loc[window].to_numpy())
            y.append(prices.loc[y_date])
            imputations.append(int(was_imputed.loc[window].sum()))

    # the flag alone decides the return arity; the per-sample counts live in
    # their own list and never overwrite it
    if count_imputations:
        return array(X), array(y), array(imputations)

    return array(X), array(y)




In [138]:
def baseline(company, df, n_steps,
             train_start='2018-01-10', train_end='2021-12-31',
             test_start='2022-01-01', test_end='2023-07-01'):
  """
  fit a univariate stacked LSTM on the training period, then forecast the
  test period recursively (each fitted price is fed back in as an input)

  company: ticker to model
  df: price DataFrame with a datetime index and 'ticker' and 'adj_close' columns
  n_steps: look-back window passed to split_data
  train_start, train_end, test_start, test_end: inclusive period bounds

  returns a tuple (result, y_test_dates):
  result: DataFrame with one 'y_fitted' value for every calendar day of the
      test period; days without a price repeat the latest fitted value
  y_test_dates: the test-period dates that have a price in df
  """

  # select stock price of the company
  df = df.loc[df['ticker'] == company]

  # split_data may return two or three arrays; only X and y are needed
  X_train, y_train = split_data(df, n_steps,
                                start_date=train_start, end_date=train_end)[:2]
  X_test = split_data(df, n_steps,
                      start_date=test_start, end_date=test_end)[0]

  n_features = 1 # univariate time series

  # define model
  model = Sequential()
  model.add(LSTM(100, activation='relu', return_sequences=True,  input_shape=(n_steps, n_features)))
  model.add(LSTM(100, activation='relu'))
  model.add(Dense(1))
  model.compile(optimizer='adam', loss='mse')

  # reshape from [samples, timesteps] into [samples, timesteps, features]
  X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))

  model.fit(X_train, y_train, epochs=200, verbose=0)

  # fit on test period
  y_test_fitted = []

  test_start = pd.to_datetime(test_start)
  test_end = pd.to_datetime(test_end)
  all_test_dates = pd.date_range(start=test_start, end=test_end)

  # dates with existing stock prices
  y_test_dates = df.index[(df.index >= test_start) &
                          (df.index <= test_end)]

  # computed once; taking min() inside the loop would rescan the index every day
  first_trade_date = y_test_dates.min()

  # window of known prices that precede the first trading day of the test period
  X_test = X_test[0].reshape((1, n_steps, n_features))

  predict_counter = 0
  impute_counter = 0

  for date in all_test_dates:

    # before the first trading day: don't predict, just impute with most recent data
    if date < first_trade_date:

      y_test_fitted.append(X_test[0, -1, 0])

      impute_counter += 1

    # first trading day: predict from the untouched window of known prices,
    # whether or not imputed days came before it, and count it as a prediction
    elif date == first_trade_date:

      y_test_fitted.append(
          model.predict(X_test, verbose=0)[0][0] # index into [[predicted]]
      )

      predict_counter += 1

    # update X_test before predicting subsequent days
    # NOTE(review): the window advances only on trading days, while split_data
    # builds training windows over calendar days — confirm this is intended
    elif date in y_test_dates:

      X_test = np.append(X_test[0, 1:, 0], # drop the oldest price
                         [y_test_fitted[-1]], # append the latest fitted price
                         axis=0).reshape((1, n_steps, n_features))

      y_test_fitted.append(
          model.predict(X_test, verbose=0)[0][0] # index into [[predicted]]
      )

      predict_counter += 1

    # if stock price data is missing, impute with most recent price
    else:

      y_test_fitted.append(y_test_fitted[-1])

      impute_counter += 1

  # every calendar day is either predicted or imputed; every trading day is predicted
  assert len(all_test_dates) == predict_counter + impute_counter
  assert len(y_test_dates) == predict_counter

  result = pd.DataFrame({'y_fitted': array(y_test_fitted).flatten()}, index=all_test_dates)

  return result, y_test_dates




### Fit

In [139]:
# Fit the baseline for each ticker with a 5-day look-back, keep only the rows for
# dates that have a price, and write them to Google Drive.
# NOTE(review): the file name has no underscore before 'baseline' (e.g. 'AAPLbaseline.csv'),
# unlike the '<ticker>_baseline.csv' names written by the later cells — confirm which one
# downstream code expects.
for ticker in ['AAPL', 'AMZN', 'GOOG', 'NVDA', 'MSFT']:

  result, dates = baseline(ticker, stocks, 5)
  result = result[result.index.isin(dates)].reset_index()

  result.to_csv('drive/My Drive/' + ticker + 'baseline.csv')

In [None]:
# Per-ticker baseline fits with a 5-day look-back (same call as in the loop cell above).
# NOTE(review): the saved output of this cell shows an IndexError — confirm whether it still occurs.
AAPL_result, AAPL_dates = baseline("AAPL", stocks, 5)

AMZN_result, AMZN_dates = baseline("AMZN", stocks, 5)

GOOG_result, GOOG_dates = baseline("GOOG", stocks, 5)

IndexError: ignored

In [None]:
# Per-ticker baseline fits for the remaining two tickers, 5-day look-back
MSFT_result, MSFT_dates = baseline("MSFT", stocks, 5)

NVDA_result, NVDA_dates = baseline("NVDA", stocks, 5)

In [None]:
# Keep only the rows whose date is in each ticker's returned dates, then move the
# date index into a column.
# NOTE: each name is rebound to its filtered version, so this cell is not idempotent.
AAPL_result = AAPL_result[AAPL_result.index.isin(AAPL_dates)].reset_index()
AMZN_result = AMZN_result[AMZN_result.index.isin(AMZN_dates)].reset_index()
GOOG_result = GOOG_result[GOOG_result.index.isin(GOOG_dates)].reset_index()
MSFT_result = MSFT_result[MSFT_result.index.isin(MSFT_dates)].reset_index()
NVDA_result = NVDA_result[NVDA_result.index.isin(NVDA_dates)].reset_index()


In [None]:
# Write the AAPL result locally, then copy the file to the mounted Drive folder
AAPL_result.to_csv('AAPL_baseline.csv')
!cp AAPL_baseline.csv "drive/My Drive/"

In [None]:
# Same write-then-copy step for the other four tickers
AMZN_result.to_csv('AMZN_baseline.csv')
!cp AMZN_baseline.csv "drive/My Drive/"

GOOG_result.to_csv('GOOG_baseline.csv')
!cp GOOG_baseline.csv "drive/My Drive/"

MSFT_result.to_csv('MSFT_baseline.csv')
!cp MSFT_baseline.csv "drive/My Drive/"

NVDA_result.to_csv('NVDA_baseline.csv')
!cp NVDA_baseline.csv "drive/My Drive/"

# Multivariate LSTM

In [8]:
# Load the daily sentiment table and index it by parsed date
sentiment = (
    pd.read_csv('daily_sentiment.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

In [51]:
# Reshape the long sentiment table to wide form: one row per date, one column per
# ticker, holding the daily 'compound' score. Then report gaps and size.
compound = pd.pivot(sentiment, columns="ticker", values="compound")

print('missing values')
print(compound.isna().sum())

print('\nshape')
print(compound.shape)

compound.head()

missing values
ticker
AAPL    103
AMZN    113
GOOG    112
MSFT    119
NVDA    282
dtype: int64

shape
(1999, 5)


ticker,AAPL,AMZN,GOOG,MSFT,NVDA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01,0.2553,0.0703,0.24968,0.4847,
2018-01-02,0.064093,0.269283,-0.189467,-0.098667,0.0
2018-01-03,-0.04851,0.336656,0.071183,-0.000418,
2018-01-04,0.173443,0.1779,-0.0025,0.293314,0.212233
2018-01-05,-0.0085,0.104264,0.17533,-0.04292,0.0


In [52]:
# Same wide reshape for the 'volume' column: one row per date, one column per ticker.
# Then report gaps and size.
volume = pd.pivot(sentiment, columns="ticker", values="volume")

print('missing values')
print(volume.isna().sum())

print('\nshape')
print(volume.shape)

volume.head()

missing values
ticker
AAPL    103
AMZN    113
GOOG    112
MSFT    119
NVDA    282
dtype: int64

shape
(1999, 5)


ticker,AAPL,AMZN,GOOG,MSFT,NVDA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01,2.0,4.0,5.0,2.0,
2018-01-02,14.0,6.0,3.0,3.0,1.0
2018-01-03,10.0,9.0,6.0,11.0,
2018-01-04,7.0,6.0,5.0,7.0,3.0
2018-01-05,5.0,11.0,10.0,5.0,1.0


### Missing data

For missing data, there are a few options:

*   The compound score can be imputed with the most recently available compound score, on the assumption that, in the absence of headline data, the sentiment toward a company stays the same.
*   The volume, however, should be 0: a date with no headlines has, by definition, zero articles.




In [12]:
def impute_compound(df, start_date="2018-01-01", end_date="2023-07-01"):
  """
  fill every gap in the compound scores over the daily calendar
  [start_date, end_date] (both inclusive)

  df is already pivoted: the index is date and there is one column per ticker

  calendar days that have no row in df are added. each NaN is replaced with
  the most recently available compound score of the same ticker; NaNs that
  come before a ticker's first available score are replaced with 0.

  returns a new DataFrame sorted by date, with the same columns in the same
  order as df and no NaN left. df itself is not modified.
  """

  all_dates = pd.date_range(start=start_date, end=end_date)

  # the union keeps every row df already has and adds the absent calendar days
  full_index = df.index.union(all_dates)

  # forward fill carries the latest score per column; whatever is still NaN
  # precedes the first score of that ticker and becomes 0
  return df.reindex(full_index).ffill().fillna(0)

In [49]:
def add_rows_volume(df, start_date="2018-01-01", end_date="2023-07-01"):
  """
  add a NaN row for every calendar day in [start_date, end_date] (both
  inclusive) that is missing from the index of df

  df is already pivoted: the index is date and there is one column per ticker

  returns a new DataFrame sorted by date, with the same columns in the same
  order as df. existing values are left unchanged and nothing is filled in.
  """

  all_dates = pd.date_range(start=start_date, end=end_date)

  # the union keeps every row df already has; reindex inserts the absent days as NaN
  return df.reindex(df.index.union(all_dates))


In [53]:
# Fill the gaps in both wide tables.
# NOTE: compound and volume are rebound to their filled versions here.
compound = impute_compound(compound)
volume = add_rows_volume(volume).fillna(0) # by definition, 0 articles

# One row per calendar day of the default date range, five tickers, and no NaN left
assert compound.shape == (2008, 5)
assert all(count_na == 0 for count_na in compound.isna().sum())

In [196]:
# Display the imputed compound table
compound

Unnamed: 0,AAPL,AMZN,GOOG,NVDA,MSFT
2018-01-01,0.255300,0.070300,0.249680,0.000000,0.484700
2018-01-02,0.064093,0.269283,-0.189467,0.000000,-0.098667
2018-01-03,-0.048510,0.336656,0.071183,0.000000,-0.000418
2018-01-04,0.173443,0.177900,-0.002500,0.212233,0.293314
2018-01-05,-0.008500,0.104264,0.175330,0.000000,-0.042920
...,...,...,...,...,...
2023-06-27,0.077088,0.314122,-0.011920,0.170586,0.076175
2023-06-28,0.229400,0.140825,-0.056575,-0.189127,0.158977
2023-06-29,0.092988,0.286330,-0.216886,0.121033,0.018392
2023-06-30,0.169933,0.177900,-0.216886,0.121033,-0.005150


### Format data (split_datas)


In [125]:
def split_data_multivariate(ticker, stocks_df, compound_df, volume_df,
               n_steps,
               start_date='2018-01-15', end_date='2023-07-01'):
    """
    build supervised-learning samples with three features per day:
    price, compound score and headline volume

    ticker: company to build samples for
    stocks_df: DataFrame with a datetime index and an 'adj_close' column. if
        it also has a 'ticker' column, only the rows of `ticker` are used
    compound_df, volume_df:
        index is datetime
        columns are the different tickers
        already imputed (must contain every calendar day of every window)
    n_steps: size of the look-back window, in calendar days
    start_date, end_date: bounds for the target dates, both inclusive

    for every date in the range that has a price, the target is that day's
    adj_close and the input covers the n_steps calendar days immediately
    before it. days without a price row take the price of the most recent
    earlier date that has one.

    returns two arrays:
    1. X, shape (n_samples, n_steps, 3); each row is [price, compound, volume],
       oldest day first
    2. y, shape (n_samples,)

    raises ValueError if a look-back window starts before the first price,
    because there is no earlier price to impute from
    """

    start_dt = pd.to_datetime(start_date)
    end_dt = pd.to_datetime(end_date)
    delta = pd.Timedelta(days=n_steps)

    # accept an unfiltered multi-ticker frame as well as a pre-filtered one
    if 'ticker' in stocks_df.columns:
        stocks_df = stocks_df.loc[stocks_df['ticker'] == ticker]

    # sorted so that reindex(method='ffill') below is well defined
    prices = stocks_df['adj_close'].sort_index()
    y_dates = stocks_df.index[(stocks_df.index >= start_dt) &
                              (stocks_df.index <= end_dt)]

    y = []
    X = []

    if len(y_dates) > 0:

        first_window_start = y_dates.min() - delta
        if first_window_start < prices.index.min():
            raise ValueError(
                "look-back window starts before the first available price; "
                "use a later start_date or a smaller n_steps")

        # one daily series for the whole period: each calendar day carries the
        # price of the latest row at or before it
        calendar = pd.date_range(start=first_window_start, end=y_dates.max())
        daily_prices = prices.reindex(calendar, method='ffill')

        compound_scores = compound_df[ticker]
        volumes = volume_df[ticker]

        for y_date in y_dates:

            # dates in look back window
            window = pd.date_range(start=y_date - delta,
                                   end=y_date,
                                   inclusive="left") # exclude y_date

            # one row per day, columns in the order price, compound, volume
            X.append(np.column_stack((daily_prices.loc[window].to_numpy(),
                                      compound_scores.loc[window].to_numpy(),
                                      volumes.loc[window].to_numpy())))
            y.append(prices.loc[y_date])

    return array(X), array(y)

In [69]:
# First 1000 rows of the price table, used by the next cell as a small input
tmp = stocks.iloc[0:1000,:]
tmp

Unnamed: 0_level_0,ticker,adj_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12-01,GOOG,50.508500
2017-12-01,AMZN,57.977862
2017-12-01,AAPL,40.542754
2017-12-01,NVDA,48.957940
2017-12-01,MSFT,79.231484
...,...,...
2018-09-18,GOOG,58.061000
2018-09-18,NVDA,67.239988
2018-09-18,AAPL,52.320647
2018-09-18,MSFT,107.827513


In [92]:
# Build AAPL samples from the small slice and keep the first one, shaped (1, 5, 3).
# NOTE(review): tmp contains rows for every ticker, not only AAPL.
tmp_splt = split_data_multivariate("AAPL", tmp, compound, volume, 5,
                                   start_date='2018-01-10', end_date='2018-09-18')
X = tmp_splt[0][0].reshape((1, 5, 3))

In [126]:
def multivariate_LSTM(ticker, stocks_df, compound_df, volume_df, n_steps=5,
                      train_start='2018-01-10', train_end='2021-12-31',
                      test_start='2022-01-01', test_end='2023-07-01'):
  """
  fit a multivariate stacked LSTM (price, compound score, headline volume) on
  the training period, then forecast the test period recursively (each fitted
  price is fed back in as an input)

  ticker: company to model
  stocks_df: price DataFrame with 'ticker' and 'adj_close' columns
  compound_df, volume_df: one column per ticker, already imputed
  all df's have datetime index
  n_steps: look-back window passed to split_data_multivariate
  train_start, train_end, test_start, test_end: inclusive period bounds

  returns a tuple (result, y_test_dates):
  result: DataFrame with one 'y_fitted' value for every calendar day of the
      test period; days without a price repeat the latest fitted value
  y_test_dates: the test-period dates that have a price in stocks_df
  """

  # filter, train, test
  stocks_df = stocks_df.loc[stocks_df['ticker'] == ticker]
  X_train, y_train = split_data_multivariate(
      ticker, stocks_df, compound_df, volume_df,
      n_steps, start_date=train_start, end_date=train_end)
  X_test = split_data_multivariate(
      ticker, stocks_df, compound_df, volume_df,
      n_steps, start_date=test_start, end_date=test_end)[0]

  n_features = 3 # multivariate time series

  # define Stacked LSTM model
  model = Sequential()
  model.add(LSTM(100, activation='relu', return_sequences=True,  input_shape=(n_steps, n_features)))
  model.add(LSTM(100, activation='relu'))
  model.add(Dense(1))
  model.compile(optimizer='adam', loss='mse')

  # make the [samples, timesteps, features] layout explicit
  X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))

  model.fit(X_train, y_train, epochs=200, verbose=0)

  # fit on test period
  y_test_fitted = []

  test_start = pd.to_datetime(test_start)
  test_end = pd.to_datetime(test_end)
  all_test_dates = pd.date_range(start=test_start, end=test_end)

  # dates with existing stock prices
  y_test_dates = stocks_df.index[(stocks_df.index >= test_start) &
                                 (stocks_df.index <= test_end)]

  # computed once; taking min() inside the loop would rescan the index every day
  first_trade_date = y_test_dates.min()

  # window of known data that precedes the first trading day of the test period
  X_test = X_test[0].reshape((1, n_steps, n_features))

  oneday = pd.Timedelta("1 days")

  predict_counter = 0
  impute_counter = 0

  for date in all_test_dates:

    # before the first trading day: don't predict, just impute with most recent data
    if date < first_trade_date:

      y_test_fitted.append(X_test[0, -1, 0])
      # 0: into array, -1: most recent, 0: price

      impute_counter += 1

    # first trading day: predict from the untouched window of known data,
    # whether or not imputed days came before it, and count it as a prediction
    elif date == first_trade_date:

      y_test_fitted.append(
          model.predict(X_test, verbose=0)[0][0] # index into [[predicted]]
      )

      predict_counter += 1

    # update X_test before predicting subsequent days
    # NOTE(review): the window advances only on trading days, while
    # split_data_multivariate builds training windows over calendar days —
    # confirm this is intended
    elif date in y_test_dates:

      # drop the oldest day, then append the previous day's features
      X_test = np.append(X_test[0, 1:n_steps, :],
                         [[y_test_fitted[-1], # latest fitted price
                           compound_df[ticker].loc[date-oneday], # previous day's compound
                           volume_df[ticker].loc[date-oneday]]], # previous day's volume
                         axis=0).reshape((1, n_steps, n_features))

      y_test_fitted.append(
          model.predict(X_test, verbose=0)[0][0] # index into [[predicted]]
      )

      predict_counter += 1

    # if stock price data is missing, impute with most recent price
    else:

      y_test_fitted.append(y_test_fitted[-1])

      impute_counter += 1

  # every calendar day is either predicted or imputed; every trading day is predicted
  assert len(all_test_dates) == predict_counter + impute_counter
  assert len(y_test_dates) == predict_counter

  result = pd.DataFrame({'y_fitted': array(y_test_fitted).flatten()}, index=all_test_dates)

  return result, y_test_dates




### Fit

In [127]:
# Trial run for AAPL on a shorter period: train on 2018, test on the first half of 2019.
# NOTE: this rebinds tmp to the (result, dates) tuple returned by multivariate_LSTM.
tmp = multivariate_LSTM('AAPL', stocks, compound, volume, n_steps=5,
                      train_start='2018-01-10', train_end='2018-12-31',
                      test_start='2019-01-01', test_end='2019-07-01')

In [133]:
# Fit the multivariate model for each ticker with the default periods, keep only the
# rows for dates that have a price, and write them to Google Drive
for ticker in ['AAPL', 'AMZN', 'GOOG', 'NVDA', 'MSFT']:

  result, dates = multivariate_LSTM(ticker, stocks, compound, volume)
  result = result[result.index.isin(dates)].reset_index()

  result.to_csv('drive/My Drive/' + ticker + '_multivariateLSTM.csv')




# Tutorial

In [15]:
# multivariate data preparation
from numpy import array
from numpy import hstack
# define input sequence
# Toy data: two input series and an output series that is their element-wise sum
in_seq1 = array([10, 20, 30, 40, 50, 60, 70, 80, 90])
in_seq2 = array([15, 25, 35, 45, 55, 65, 75, 85, 95])
out_seq = array([in_seq1[i]+in_seq2[i] for i in range(len(in_seq1))])
# convert to [rows, columns] structure
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
out_seq = out_seq.reshape((len(out_seq), 1))
# horizontally stack columns: the two inputs first, the output last
dataset = hstack((in_seq1, in_seq2, out_seq))
print(dataset)

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
  """
  split a multivariate sequence into samples

  sequences: 2-D array; the last column is the output, the others are inputs
  n_steps: number of consecutive rows in each sample

  returns (X, y): X holds the input columns of every run of n_steps rows that
  fits in the data, y holds the output value of the last row of each run
  """
  total_rows = len(sequences)

  # window starts whose end still lies inside the dataset
  starts = [begin for begin in range(total_rows)
            if begin + n_steps <= total_rows]

  inputs = [sequences[begin:begin + n_steps, :-1] for begin in starts]
  targets = [sequences[begin + n_steps - 1, -1] for begin in starts]

  return array(inputs), array(targets)

 # choose a number of time steps
# window length used for the toy example
n_steps = 3
# convert into input/output
X, y = split_sequences(dataset, n_steps)
print(X.shape, y.shape)
# summarize the data: each input window next to its target
for i in range(len(X)):
	print(X[i], y[i])

[[ 10  15  25]
 [ 20  25  45]
 [ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]]
(7, 3, 2) (7,)
[[10 15]
 [20 25]
 [30 35]] 65
[[20 25]
 [30 35]
 [40 45]] 85
[[30 35]
 [40 45]
 [50 55]] 105
[[40 45]
 [50 55]
 [60 65]] 125
[[50 55]
 [60 65]
 [70 75]] 145
[[60 65]
 [70 75]
 [80 85]] 165
[[70 75]
 [80 85]
 [90 95]] 185


In [16]:
# Display the toy input windows
X

array([[[10, 15],
        [20, 25],
        [30, 35]],

       [[20, 25],
        [30, 35],
        [40, 45]],

       [[30, 35],
        [40, 45],
        [50, 55]],

       [[40, 45],
        [50, 55],
        [60, 65]],

       [[50, 55],
        [60, 65],
        [70, 75]],

       [[60, 65],
        [70, 75],
        [80, 85]],

       [[70, 75],
        [80, 85],
        [90, 95]]])

In [17]:
# Display the toy targets
y

array([ 65,  85, 105, 125, 145, 165, 185])

In [18]:
# NOTE(review): AAPLx is not assigned anywhere in the visible notebook; this cell
# appears to rely on kernel state from a cell that is no longer here.
AAPLx

array([[ 40.54986462,  40.11137239,  40.11137239,  40.11137239,
         40.11137239],
       [ 40.11137239,  40.11137239,  40.11137239,  40.11137239,
         40.82955155],
       [ 40.11137239,  40.11137239,  40.11137239,  40.82955155,
         40.82244086],
       ...,
       [131.47662257, 131.47662257, 131.47662257, 131.47662257,
        129.65194321],
       [131.47662257, 131.47662257, 131.47662257, 129.65194321,
        125.67354397],
       [131.47662257, 131.47662257, 129.65194321, 125.67354397,
        129.23316435]])

In [19]:
# NOTE(review): AAPLy is not assigned anywhere in the visible notebook; this cell
# appears to rely on kernel state from a cell that is no longer here.
AAPLy

array([ 40.82955155,  40.82244086,  41.01205912, ..., 125.67354397,
       129.23316435, 129.55223396])

In [45]:
# split a univariate sequence into samples
def split_sequence(company, sentiment_df, n_steps,
                   start_date='2018-01-01', end_date='2023-07-01'):
  """
  cut one company's daily series into overlapping windows of n_steps values

  sentiment_df has company as columns and Datetime Index; it must contain
  every calendar day from start_date to end_date (both inclusive)

  prints the number of days in the selected range

  returns an array with one row per window. a window is kept only while at
  least one later value remains after it, so the final n_steps values of the
  series never form a window of their own
  """

  calendar = pd.date_range(start=start_date, end=end_date)

  series = sentiment_df.loc[calendar][company]
  n_obs = len(series)
  print(n_obs)

  windows = [series[first:first + n_steps]
             for first in range(n_obs)
             if first + n_steps <= n_obs - 1]

  return array(windows)

In [46]:
# 5-day windows of AAPL compound scores from the default start date through 2018-12-31
AAPLco = split_sequence('AAPL', compound, 5, end_date='2018-12-31')

365


In [43]:
# Display the AAPL compound-score windows
AAPLco

array([[ 0.2553    ,  0.06409286, -0.04851   ,  0.17344286, -0.0085    ],
       [ 0.06409286, -0.04851   ,  0.17344286, -0.0085    ,  0.11513333],
       [-0.04851   ,  0.17344286, -0.0085    ,  0.11513333,  0.        ],
       ...,
       [-0.0588    , -0.0588    ,  0.1668    ,  0.1909    ,  0.124275  ],
       [-0.0588    ,  0.1668    ,  0.1909    ,  0.124275  ,  0.20146667],
       [ 0.1668    ,  0.1909    ,  0.124275  ,  0.20146667,  0.        ]])

In [44]:
# Number of windows and window length
AAPLco.shape

(360, 5)

In [41]:
# NOTE(review): AAPLx is not assigned anywhere in the visible notebook — see whether the
# cell that created it should be restored.
AAPLx.shape

(251, 5)