## Multivariate Time Series Modeling

**1.Load data from local device using colab import files upload method**

In [1]:
# import colab files method and upload data
from google.colab import files
files.upload()

Saving stocks.txt to stocks.txt


In [42]:
# Check data upload to colab

# import libs
import numpy as np
import pandas as pd

# set numeric format
pd.options.display.float_format = '{:,.2f}'.format

# create data objects

stocks = pd.read_csv('stocks.txt', sep='\t')

# QC data
print(stocks.head())
print(stocks.tail())
print(stocks.describe().T)
print(stocks[stocks['DATE'].str.contains("JAN17|FEB17|MAR17|APR17|MAY17|JUN17|JUL17|AUG17|SEP17|OCT17|NOV17|DEC17", na=False)]).nunique()
# df[df['ids'].str.contains("ball")]
# print(stocks[['TICKER','DATE']].groupby('DATE')['TICKER'].nunique())

  TICKER      DATE  TREND  ...  THIRD DERIVATIVE  FOURTH DERIVATIVE  Unnamed: 10
0     A   06SEP16       1  ...             -0.00               0.00          nan
1     A   07SEP16       2  ...              0.01               0.01          nan
2     A   08SEP16       3  ...             -0.01              -0.02          nan
3     A   09SEP16       4  ...             -0.04              -0.03          nan
4     A   12SEP16       5  ...              0.11               0.15          nan

[5 rows x 11 columns]
       TICKER      DATE  ...  FOURTH DERIVATIVE  Unnamed: 10
598720  ZYNE   21JUL17   ...               0.06          nan
598721  ZYNE   24JUL17   ...              -0.06          nan
598722  ZYNE   25JUL17   ...              -0.05          nan
598723  ZYNE   26JUL17   ...               0.20          nan
598724  ZYNE   27JUL17   ...              -0.23          nan

[5 rows x 11 columns]
                       count   mean   std    min   25%    50%    75%      max
TREND             598,72

AttributeError: ignored

In [33]:
print(stocks[stocks['DATE'].str.contains("JAN|FEB")]) 

TICKER                 2661
DATE                     39
TREND                    39
RAWPRICE              16159
LOGPRICE              16159
RETURNS               79788
FIRST DERIVATIVE      77952
SECOND DERIVATIVE     98245
THIRD DERIVATIVE     102188
FOURTH DERIVATIVE    103456
Unnamed: 10               0
dtype: int64


**USER INPUTS**
1. csv training filename
2. csv test filename
3. column name to filter for single output

In [0]:
# ENTER USER INPUTS BELOW:

# 1. csv training filename (inside quotes)
train_csv = 'DailyStockDataSample_5Tickers_train.csv'

# 2. csv test filename (inside quotes)
test_csv = 'DailyStockDataSample_5Tickers_test.csv'

# 3. filter column name (for single output example)
filter_colname = 'AIZ'

# 4. train: inputs + output colnames in list with quotes
inputs_output_colnames_list = ['X1','X2','X3','X4','LOG PRICE']

# 5. test: inputs colnames in list with quotes
inputs_colnames_list = ['X1','X2','X3','X4']

# 6. number input time steps for model
n_in_steps = 3

# 7. number output time steps to predict
n_out_steps = 2

# 8. number inputs
n_in = 4


**2. Import libraries and set output format**

In [0]:
# import libs
import numpy as np
import pandas as pd

# set numeric format
pd.options.display.float_format = '{:,.2f}'.format

**3. Read csv files + QC check**

In [0]:
# create data objects

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# QC data
print(train.head())
print(test.head())

   LOG PRICE    X1    X2    X3    X4  AIZ  DUK  NML  SEAS  SMLP  DAY
0       4.51 -0.00 -0.01 -0.03 -0.05    1    0    0     0     0    1
1       4.51  0.00  0.01  0.02  0.05    1    0    0     0     0    2
2       4.51 -0.00 -0.01 -0.01 -0.03    1    0    0     0     0    3
3       4.50 -0.01 -0.01 -0.00  0.01    1    0    0     0     0    4
4       4.51  0.01  0.02  0.03  0.03    1    0    0     0     0    5
   LOG PRICE    X1    X2    X3    X4  AIZ  DUK  NML  SEAS  SMLP  DAY
0       4.51 -0.00  0.01  0.03  0.04    1    0    0     0     0   15
1       4.52  0.01  0.01  0.00 -0.03    1    0    0     0     0   16
2       4.53  0.01  0.00 -0.01 -0.01    1    0    0     0     0   17
3       4.53 -0.00 -0.01 -0.02 -0.01    1    0    0     0     0   18
4       4.54  0.01  0.01  0.03  0.04    1    0    0     0     0   19


**4. Multivariate mlp (multiple inputs / single output) - multistep prediction (multiple period output prediction without forecasting input series)**

### multivariate multi-step mlp vector model

**Description**

*1. Data prep step:*
* *Panel data must be split into samples*
* *Sample contains a sequence of time periods for inputs and the output*
* *Each sample represents the inputs/output sequence in the next time period* 

>  **time series example (3 time period sequences):** 

> **X1:** 0, 1, 2, 3, 4

> **X2:** 5, 6, 7, 8

> **y:** 9, 10, 11, 12

> * **1st sample:** 0, 1, 2

>  * **2nd sample:** 1, 2, 3

> * **3rd sample:** 2, 3, 4

**aiz as example of multivariate input/single output with multi time step prediction**

In [0]:
# aiz - multivariate input/single output - multi step

# training data prep

df_train = train.loc[train['AIZ']==1] [inputs_output_colnames_list]
print(df_train.head())

# convert pandas dataframe train to numpy array
ar_train = df_train.to_numpy()
print('numpy array dimensions: train')
print(ar_train.shape)


# test data prep

df_test = test.loc[test['AIZ']==1] [inputs_colnames_list]
print(df_test.head())

# convert pandas dataframe train to numpy array
ar_test = df_test.to_numpy()
print('numpy array dimensions: test')
print(ar_test.shape)

     X1    X2    X3    X4  LOG PRICE
0 -0.00 -0.01 -0.03 -0.05       4.51
1  0.00  0.01  0.02  0.05       4.51
2 -0.00 -0.01 -0.01 -0.03       4.51
3 -0.01 -0.01 -0.00  0.01       4.50
4  0.01  0.02  0.03  0.03       4.51
numpy array dimensions: train
(14, 5)
     X1    X2    X3    X4
0 -0.00  0.01  0.03  0.04
1  0.01  0.01  0.00 -0.03
2  0.01  0.00 -0.01 -0.01
3 -0.00 -0.01 -0.02 -0.01
4  0.01  0.01  0.03  0.04
numpy array dimensions: test
(6, 4)


In [0]:
# multivariate multi-step mlp example
from numpy import array
from numpy import hstack
from keras.models import Sequential
from keras.layers import Dense

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
	X, y = list(), list()
	for i in range(len(sequences)):
		# find the end of this pattern
		end_ix = i + n_steps_in
		out_end_ix = end_ix + n_steps_out-1
		# check if we are beyond the dataset
		if out_end_ix > len(sequences):
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1:out_end_ix, -1]
		X.append(seq_x)
		y.append(seq_y)
	return array(X), array(y)

# horizontally stack columns (ALREADY IN STACKED FORMAT FROM TOM)
dataset = ar_train

# choose a number of time steps (FOR MLP INPUT/OUTPUT SAMPLE)
n_steps_in, n_steps_out = n_in_steps, n_out_steps

# convert into input/output samples
X, y = split_sequences(dataset, n_steps_in, n_steps_out)

# flatten input (TO SPEED UP CALCULATIONS IN TF)
n_input = X.shape[1] * X.shape[2]  # - FLATTENED NUMBER OF COLUMNS IN INPUT SAMPLE
X = X.reshape((X.shape[0], n_input)) # CREATES FLATTENED INPUT SAMPLE

# define model
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=n_input))
model.add(Dense(n_steps_out))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit(X, y, epochs=2000, verbose=0)

# predict test data 
n_input = n_steps_in*n_in  # number input time steps (3) * number inputs (4) in test data
print('n_input: ', n_input)
x_input = ar_test  
print('len(x_input): ', len(x_input))
print('n_steps_in: ', n_steps_in)
n_samples_test = int(len(x_input) / n_steps_in)
print('n_samples_test: ', n_samples_test)
x_input = x_input.reshape((n_samples_test, n_input))  # number samples, number flattened inputs
yhat = model.predict(x_input, verbose=0)
print(yhat)

n_input:  12
len(x_input):  6
n_steps_in:  3
n_samples_test:  2
[[4.5046487 4.5124807]
 [4.497058  4.498165 ]]
