# Load ETF Overview

In [1]:
! pip install etfpy ta pmdarima -q

In [2]:
import numpy as np
import pandas as pd

import ta
from etfpy import ETF, load_etf, get_available_etfs_list

import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

import logging 
logger = logging.getLogger() 
logger.setLevel(logging.CRITICAL)

# Get Specific ETF Data

In [3]:
spy = ETF("SPY")
spy_tabular = spy.to_tabular()
data = spy_tabular.get_quotes(interval="daily", periods=365*40)

In [4]:
data["date"] = pd.to_datetime(data.date)

# Feature Engineering

* Moving Averages (SMA_20 and SMA_50): Simple Moving Averages over 20 and 50 days.
* RSI (Relative Strength Index): Momentum oscillator that measures the speed and change of price movements.
* MACD (Moving Average Convergence Divergence): A trend-following momentum indicator that shows the relationship between two moving averages of a security’s price.
* Bollinger Bands: Consist of a middle band (20-day SMA) and two outer bands (standard deviations above and below the SMA).

In [5]:
# Calculate Moving Averages
data['SMA_20'] = data['close'].rolling(window=20).mean()
data['SMA_50'] = data['close'].rolling(window=50).mean()

# Calculate RSI
data['RSI'] = ta.momentum.rsi(data['close'], window=14)

# Calculate MACD
data['MACD'] = ta.trend.macd(data['close'])
data['MACD_Signal'] = ta.trend.macd_signal(data['close'])
data['MACD_Diff'] = ta.trend.macd_diff(data['close'])

# Calculate Bollinger Bands
bollinger = ta.volatility.BollingerBands(close=data['close'], window=20, window_dev=2)
data['BB_High'] = bollinger.bollinger_hband()
data['BB_Low'] = bollinger.bollinger_lband()
data['BB_Mid'] = bollinger.bollinger_mavg()

In [6]:
data.tail()

Unnamed: 0,symbol,date,open,high,low,close,volume,SMA_20,SMA_50,RSI,MACD,MACD_Signal,MACD_Diff,BB_High,BB_Low,BB_Mid
7915,SPY,2024-07-09,556.26,557.18,555.52,555.82,27314125,546.08,531.89,75.72,6.2,5.82,0.37,556.62,535.53,546.08
7916,SPY,2024-07-10,557.07,561.67,556.77,561.32,38701200,547.36,532.95,80.04,6.69,6.0,0.7,558.73,535.98,547.36
7917,SPY,2024-07-11,561.44,562.33,555.83,556.48,53054100,548.34,533.88,68.49,6.62,6.12,0.5,559.31,537.36,548.34
7918,SPY,2024-07-12,557.63,563.67,557.15,559.99,53084400,549.27,535.04,71.68,6.77,6.25,0.52,560.86,537.67,549.27
7919,SPY,2024-07-15,562.03,564.84,561.39,563.54,13626089,550.32,536.31,74.49,7.09,6.42,0.67,563.03,537.61,550.32


# Train-Test Split

In [7]:
mask = (data['date'] > '2020-01-01')

train = data.loc[~mask]
test = data.loc[mask]

train_size = len(train)
test_size = len(test)

train_size, test_size

(6780, 1140)

# Model Training

Choose a Model: Several models can be used for time series forecasting, including:
* ARIMA/SARIMA: Good for linear time series data.
* Exponential Smoothing (ETS): Simple and efficient for trend and seasonality.
* LSTM/GRU (Neural Networks): Capable of capturing complex patterns.
* Prophet (by Facebook): User-friendly and handles seasonality well.

## ARIMA Model

In [8]:
from statsmodels.tsa.arima.model import ARIMA

In [9]:
# Train ARIMA model
arima_model = ARIMA(train['close'], order=(5, 1, 0))
arima_model_fit = arima_model.fit()

# ARIMA Example
predictions = arima_model_fit.forecast(steps=len(test))
print(arima_model_fit.summary())

# Calculate performance metrics
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test['close'], predictions)
print(f'Mean Squared Error: {mse}')

                               SARIMAX Results                                
Dep. Variable:                  close   No. Observations:                 6780
Model:                 ARIMA(5, 1, 0)   Log Likelihood              -12346.235
Date:                Mon, 15 Jul 2024   AIC                          24704.470
Time:                        16:15:06   BIC                          24745.400
Sample:                             0   HQIC                         24718.597
                               - 6780                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0497      0.008     -6.610      0.000      -0.064      -0.035
ar.L2         -0.0485      0.007     -7.227      0.000      -0.062      -0.035
ar.L3         -0.0038      0.007     -0.509      0.6

In [10]:
import joblib

# Save the model to a file
joblib.dump(arima_model, 'arima_model.pkl')

['arima_model.pkl']

## LSTM Model

In [11]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Scale data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data['close'].values.reshape(-1, 1))

# Prepare data for LSTM
def create_dataset(dataset, look_back=1):
    X, Y = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back), 0]
        X.append(a)
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)

look_back = 60
X_train, y_train = create_dataset(scaled_data[0:train_size, :], look_back)
X_test, y_test = create_dataset(scaled_data[train_size-look_back:, :], look_back)

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Create and fit the LSTM network
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(look_back, 1)))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(1))

lstm_model.compile(loss='mean_squared_error', optimizer='adam')
lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=2)

2024-07-15 16:15:09.992714: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 16:15:09.992894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 16:15:10.183384: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/50
210/210 - 15s - 73ms/step - loss: 7.1090e-04
Epoch 2/50
210/210 - 12s - 58ms/step - loss: 4.6444e-05
Epoch 3/50
210/210 - 12s - 56ms/step - loss: 4.2569e-05
Epoch 4/50
210/210 - 12s - 57ms/step - loss: 4.1195e-05
Epoch 5/50
210/210 - 20s - 96ms/step - loss: 4.1988e-05
Epoch 6/50
210/210 - 21s - 99ms/step - loss: 3.5811e-05
Epoch 7/50
210/210 - 12s - 56ms/step - loss: 3.5378e-05
Epoch 8/50
210/210 - 12s - 56ms/step - loss: 3.4438e-05
Epoch 9/50
210/210 - 12s - 56ms/step - loss: 2.9858e-05
Epoch 10/50
210/210 - 12s - 56ms/step - loss: 2.8931e-05
Epoch 11/50
210/210 - 12s - 57ms/step - loss: 2.9809e-05
Epoch 12/50
210/210 - 12s - 56ms/step - loss: 2.7436e-05
Epoch 13/50
210/210 - 20s - 97ms/step - loss: 2.7338e-05
Epoch 14/50
210/210 - 12s - 56ms/step - loss: 2.6516e-05
Epoch 15/50
210/210 - 12s - 58ms/step - loss: 2.2640e-05
Epoch 16/50
210/210 - 20s - 96ms/step - loss: 2.1775e-05
Epoch 17/50
210/210 - 12s - 57ms/step - loss: 2.2021e-05
Epoch 18/50
210/210 - 20s - 96ms/step - 

<keras.src.callbacks.history.History at 0x7a146d8e2b00>

In [12]:
# LSTM Example
predictions = lstm_model.predict(X_test)
predictions = scaler.inverse_transform(predictions)

# Calculate performance metrics
mse = mean_squared_error(test['close'][1:], predictions)
print(f'Mean Squared Error: {mse}')

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
Mean Squared Error: 113.00956091405698


In [13]:
# Save the model to a file
joblib.dump(lstm_model, 'lstm_model.pkl')

['lstm_model.pkl']