In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import plotly.express as px
import yfinance as yf
from sklearn.linear_model import LinearRegression
# from sklearn.svm import SVR
# from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score

In [2]:
# get data from yahoo finance
df = yf.download('GLD', start='2020-01-01')
df.head()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,143.860001,144.210007,143.399994,143.949997,143.949997,7733800
2020-01-03,145.75,146.320007,145.399994,145.860001,145.860001,12272800
2020-01-06,148.440002,148.479996,146.949997,147.389999,147.389999,14403300
2020-01-07,147.570007,148.139999,147.429993,147.970001,147.970001,7978500
2020-01-08,148.490005,148.610001,146.139999,146.860001,146.860001,22248500


In [3]:
# drop unnecessary columns
df.drop(columns=['Open', 'High', 'Low', 'Adj Close', 'Volume'], inplace=True)
df.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2020-01-02,143.949997
2020-01-03,145.860001
2020-01-06,147.389999
2020-01-07,147.970001
2020-01-08,146.860001


In [4]:
# Info
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 924 entries, 2020-01-02 to 2023-09-01
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   924 non-null    float64
dtypes: float64(1)
memory usage: 14.4 KB


In [5]:
# compute Log Return
df['Log_Ret'] = np.log(df['Close']).diff()
df.head()

Unnamed: 0_level_0,Close,Log_Ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,143.949997,
2020-01-03,145.860001,0.013181
2020-01-06,147.389999,0.010435
2020-01-07,147.970001,0.003927
2020-01-08,146.860001,-0.00753


In [6]:
# Split data into train and test
Ntest = 10
train = df.iloc[:-Ntest]
test = df.iloc[-Ntest:]
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 914 entries, 2020-01-02 to 2023-08-18
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Close    914 non-null    float64
 1   Log_Ret  913 non-null    float64
dtypes: float64(2)
memory usage: 21.4 KB


In [7]:
# Make Supervised Dataset
# Let's see if we can use T past values to predict the next value!
series = df['Log_Ret'].to_numpy()[1:] # [1:] Because first value is NaN
T = 120
X = []
Y = []
for t in range(len(series) - T):
  x = series[t: t+T]
  X.append(x)
  y = series[t+T]
  Y.append(y)

X = np.array(X).reshape(-1,T)
Y = np.array(Y)

print('X shape:', X.shape, 'Y shape:', Y.shape)

X shape: (803, 120) Y shape: (803,)


In [8]:
# split supervised data into train and test
Xtrain, Ytrain = X[:-Ntest], Y[:-Ntest]
Xtest, Ytest = X[-Ntest:], Y[-Ntest:]

In [9]:
# shape X
Xtest.shape

(10, 120)

In [10]:
# shape Y
Ytest.shape

(10,)

In [11]:
# Create and Fit LinearRegression Model
model = LinearRegression()
model.fit(Xtrain, Ytrain)

In [12]:
# Train and Test R^2 Score
print('Train Score: ', model.score(Xtrain, Ytrain))
print('Test Score: ', model.score(Xtest, Ytest))

Train Score:  0.15018782441290413
Test Score:  -0.776795244520535


In [13]:
# Index Train and Test sets
train_idx = df.index <= train.index[-1]
test_idx = ~train_idx

In [14]:
# since first T values are not predictable:
train_idx[:T+1] = False

In [15]:
# predict in-sample and out-of-sample values
df.loc[train_idx, 'Train_Pred'] = model.predict(Xtrain)
df.loc[test_idx, 'Test_Pred'] = model.predict(Xtest)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 924 entries, 2020-01-02 to 2023-09-01
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Close       924 non-null    float64
 1   Log_Ret     923 non-null    float64
 2   Train_Pred  793 non-null    float64
 3   Test_Pred   10 non-null     float64
dtypes: float64(4)
memory usage: 36.1 KB


In [17]:
# plot
px.line(df, x=df.index, y=['Log_Ret', 'Train_Pred', 'Test_Pred'])

In [18]:
# convert predicted values to the original scale
df['Log_Close'] = np.log(df['Close'])
df['Shifted_Log_Close'] = df['Log_Close'].shift(1)

# Undo Differencing
df['Train_Pred_Un_Diff'] = df['Shifted_Log_Close'] + df['Train_Pred']
df['Test_Pred_Un_Diff'] = df['Shifted_Log_Close'] + df['Test_Pred']

# Undo Log
df['Train_Pred_Un_Log'] = np.e ** (df['Train_Pred_Un_Diff'])
df['Test_Pred_Un_Log'] = np.e ** (df['Test_Pred_Un_Diff'])

In [19]:
# 1-Step Forecast
df.loc[train_idx, '1_Step_Train'] = df['Train_Pred_Un_Log']
df.loc[test_idx, '1_Step_Test'] = df['Test_Pred_Un_Log']

In [20]:
# Plot 1 Step Forecast
px.line(df, x= df.index, y=['Close', '1_Step_Train', '1_Step_Test'])

In [21]:
# Multi-Step Forecast
multistep_predictions = []

# last train row for predicting the first test row
last_x = Xtrain[-1]

# We have to call reshape method because scikit-learn only uses 2D array
while len(multistep_predictions) < Ntest:
  p = model.predict(last_x.reshape(1,-1))[0] # [0] Returns just a number otherwise we'll have a bunch of array.
  # Update the predictions list
  multistep_predictions.append(p)
  # Make the new input
  last_x = np.roll(last_x, -1) # Roll function spins our list 1 unit.
  last_x[-1] = p # P value will replace with last value of our list.

multistep_predictions

[0.002552458142262303,
 -0.000130295209069151,
 0.0004158025943159708,
 0.002567091555442455,
 -0.0008786857491682808,
 -0.0004999565057559523,
 -0.0035295057245531575,
 -0.0007057301477800578,
 -0.004213548694360537,
 0.001109702661270305]

In [22]:
# last train value
last_train = train.iloc[-1]['Log_Ret']

In [23]:
# Save multi-step forecast to dataframe
df.loc[test_idx, 'Multi_Step_Pred'] = last_train + np.cumsum(multistep_predictions)

In [24]:
# convert predicted values to the original scale
# Undo Differencing
df['Test_Pred_Un_Diff_Multi'] = df['Shifted_Log_Close'] + df['Multi_Step_Pred']
# Undo Log
df['Test_Pred_Un_Log_Multi'] = np.e ** (df['Test_Pred_Un_Diff_Multi'])
# for short
df.loc[test_idx, 'Multi_Step'] = df['Test_Pred_Un_Log_Multi']

In [25]:
# Plot 1-Step and Molti-Step Forecast
px.line(df, x=df.index, y=['Close', '1_Step_Test', 'Multi_Step'])

In [26]:
# Make Multi-Output Supervised Dataset
Tx = T # Tx represent the number of time steps in the input
Ty = Ntest # Ty represent the number of time steps in the output
X = []
Y = []
for t in range(len(series) - Tx - Ty + 1):
  x = series[t: t+Tx]
  X.append(x)
  y = series[t+Tx: t+Tx+Ty]
  Y.append(y)

X = np.array(X).reshape(-1, Tx)
Y = np.array(Y).reshape(-1, Ty)

print('X shape: ', X.shape, 'Y shape: ', Y.shape)

X shape:  (794, 120) Y shape:  (794, 10)


In [27]:
# Split Multi-Output Supervised dataset into train and test
Xtrain_m, Ytrain_m = X[:-1], Y[:-1]
Xtest_m, Ytest_m = X[-1:], Y[-1:]

In [28]:
# create and Fit LinearRegression Model
model_m = LinearRegression()
model_m.fit(Xtrain_m, Ytrain_m)

In [29]:
# Train and Test Score
print('Train_m Score: ', model_m.score(Xtrain_m, Ytrain_m))
print('Test_m Score: ', model_m.score(Xtest_m, Ytest_m)) # R^2 score is not well-defined with less than two samples.

Train_m Score:  0.1508372018925553
Test_m Score:  nan



R^2 score is not well-defined with less than two samples.



In [30]:
# R^2 Score
r2_score(model_m.predict(Xtest_m).flatten(), Ytest_m.flatten()) # flatten() Return a copy of the array collapsed into one dimension.

-6.931233864042985

In [31]:
# Save Multi-Step Forecast to DataFrame
df.loc[test_idx, 'Multi_OutPut_Pred'] = last_train + np.cumsum(model_m.predict(Xtest_m).flatten())

In [32]:
# convert predicted values to the original scale
# Undo Differencing
df['Test_Pred_Un_Diff_Multi_OutPut'] = df['Shifted_Log_Close'] + df['Multi_OutPut_Pred']
# Undo Log
df['Test_Pred_Un_Log_Multi_OutPut'] = np.e ** (df['Test_Pred_Un_Diff_Multi_OutPut'])
# for short
df.loc[test_idx, 'Multi_OutPut'] = df['Test_Pred_Un_Log_Multi_OutPut']

In [33]:
# Plot all Forecasts
px.line(df, x=df.index, y=['Close', '1_Step_Test', 'Multi_Step', 'Multi_OutPut'])

In [34]:
# MAPE
close_test = df.iloc[-Ntest:]['Close']
mape_1 = mean_absolute_percentage_error(close_test, df.loc[test_idx, '1_Step_Test'])
print('1 Step MAPE: ', mape_1)
mape_2 = mean_absolute_percentage_error(close_test, df.loc[test_idx, 'Multi_Step'])
print('Multi Step MAPE: ', mape_2)
mape_3 = mean_absolute_percentage_error(close_test, df.loc[test_idx, 'Multi_OutPut'])
print('Multi OutPut MAPE: ', mape_3)

1 Step MAPE:  0.003866491399234044
Multi Step MAPE:  0.0038956196319840904
Multi OutPut MAPE:  0.005470462759813827


In [35]:
# R^2
close_test = df.iloc[-Ntest:]['Close']
r2_1 = r2_score(close_test, df.loc[test_idx, '1_Step_Test'])
print('1 Step r2: ', r2_1)
r2_2 = r2_score(close_test, df.loc[test_idx, 'Multi_Step'])
print('Multi Step r2: ', r2_2)
r2_3 = r2_score(close_test, df.loc[test_idx, 'Multi_OutPut'])
print('Multi OutPut r2: ', r2_3)

1 Step r2:  0.6467883600610553
Multi Step r2:  0.7011482754005773
Multi OutPut r2:  0.4694397530198364
