# Import Dependencies

In [2]:
import pandas as pd
import numpy as np
from pmdarima.arima import AutoARIMA
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from datetime import date, timedelta
import yfinance as yf

# Getting the Data

In [31]:
# Getting the date five years ago to download the current timeframe
years = (date.today() - timedelta(weeks=260)).strftime("%Y-%m-%d")

# Stocks to analyze
stocks = ['FB','AMZN']

# Getting the data for multiple stocks
df = yf.download(stocks, start=years).dropna()

# Storing the dataframes in a dictionary
stock_df = {}

for col in set(df.columns.get_level_values(0)):
    
    # Assigning the data for each stock in the dictionary
    stock_df[col] = df[col]

[*********************100%***********************]  2 of 2 completed


In [32]:
stock_df

{'Volume':                 AMZN        FB
 Date                          
 2016-02-01   6355100  46132700
 2016-02-02   6312000  59778600
 2016-02-03  10048700  56919300
 2016-02-04   6199100  38890200
 2016-02-05   9708900  76894700
 ...              ...       ...
 2021-01-15   4244000  24942900
 2021-01-19   3305100  28028500
 2021-01-20   5309800  25199900
 2021-01-21   4945100  20838700
 2021-01-22   2816300  21910000
 
 [1254 rows x 2 columns],
 'Close':                    AMZN          FB
 Date                               
 2016-02-01   574.809998  115.089996
 2016-02-02   552.099976  114.610001
 2016-02-03   531.070007  112.690002
 2016-02-04   536.260010  110.489998
 2016-02-05   502.130005  104.070000
 ...                 ...         ...
 2021-01-15  3104.250000  251.360001
 2021-01-19  3120.760010  261.100006
 2021-01-20  3263.379883  267.480011
 2021-01-21  3306.989990  272.869995
 2021-01-22  3292.229980  274.500000
 
 [1254 rows x 2 columns],
 'Low':                    A

In [33]:
# Finding the log returns
stock_df['LogReturns'] = stock_df['Adj Close'].apply(np.log).diff().dropna()
# Using Moving averages
stock_df['MovAvg'] = stock_df['Adj Close'].rolling(10).mean().dropna()
# Logarithmic scaling of the data and rounding the result
stock_df['Log'] = stock_df['MovAvg'].apply(np.log).apply(lambda x: round(x, 2))

In [34]:
stock_df

{'Volume':                 AMZN        FB
 Date                          
 2016-02-01   6355100  46132700
 2016-02-02   6312000  59778600
 2016-02-03  10048700  56919300
 2016-02-04   6199100  38890200
 2016-02-05   9708900  76894700
 ...              ...       ...
 2021-01-15   4244000  24942900
 2021-01-19   3305100  28028500
 2021-01-20   5309800  25199900
 2021-01-21   4945100  20838700
 2021-01-22   2816300  21910000
 
 [1254 rows x 2 columns],
 'Close':                    AMZN          FB
 Date                               
 2016-02-01   574.809998  115.089996
 2016-02-02   552.099976  114.610001
 2016-02-03   531.070007  112.690002
 2016-02-04   536.260010  110.489998
 2016-02-05   502.130005  104.070000
 ...                 ...         ...
 2021-01-15  3104.250000  251.360001
 2021-01-19  3120.760010  261.100006
 2021-01-20  3263.379883  267.480011
 2021-01-21  3306.989990  272.869995
 2021-01-22  3292.229980  274.500000
 
 [1254 rows x 2 columns],
 'Low':                    A

# Training the Data

In [35]:
# Days in the past to train on
days_to_train = 180 

# Days in the future to predict
days_to_predict = 5

In [36]:
# Establishing a new DF for predictions
stock_df['Predictions'] = pd.DataFrame(index=stock_df['Log'].index,
                                       columns=stock_df['Log'].columns)

In [38]:
# Iterate through each stock
for stock in tqdm(stocks):
    
    # Current predicted value
    pred_val = 0
    
    # Training the model in a predetermined date range
    for day in tqdm(range(1000, 
                          stock_df['Log'].shape[0]-days_to_predict)):        

        # Data to use, containing a specific amount of days
        training = stock_df['Log'][stock].iloc[day-days_to_train:day+1].dropna()
        
        # Determining if the actual value crossed the predicted value
        cross = ((training[-1] >= pred_val >= training[-2]) or 
                 (training[-1] <= pred_val <= training[-2]))
        
        # Running the model when the latest training value crosses the predicted value or every other day 
        if cross or day % 2 == 0:

            # Finding the best parameters
            model    = AutoARIMA(start_p=0, start_q=0,
                                 start_P=0, start_Q=0,
                                 max_p=8, max_q=8,
                                 max_P=5, max_Q=5,
                                 error_action='ignore',
                                 information_criterion='bic',
                                 suppress_warnings=True)

            # Getting predictions for the optimum parameters by fitting to the training set            
            forecast = model.fit_predict(training,
                                         n_periods=days_to_predict)

            # Getting the last predicted value from the next N days
            stock_df['Predictions'][stock].iloc[day:day+days_to_predict] = np.exp(forecast[-1])


            # Updating the current predicted value
            pred_val = forecast[-1]


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))





KeyboardInterrupt: 

In [None]:
pred_val

In [46]:
# Shift ahead by 1 to compare the actual values to the predictions
pred_df = stock_df['Predictions'].shift(1).astype(float).dropna()

In [47]:
pred_df

Unnamed: 0_level_0,AMZN,FB
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-05,1964.852048,211.350732
2020-02-06,1964.852048,211.350732
2020-02-07,1969.444235,210.593524
2020-02-10,1969.444235,210.593524
2020-02-11,2066.192220,208.456084
...,...,...
2021-01-14,3146.231030,254.301636
2021-01-15,3146.231030,254.301636
2021-01-19,3146.231030,254.301636
2021-01-20,3146.231030,254.301636


In [49]:
for stock in stocks:
    
    fig = go.Figure()
    
    # Plotting the actual values
    fig.add_trace(go.Scatter(x=pred_df.index,
                             y=stock_df['MovAvg'][stock].loc[pred_df.index],
                             name='Actual Moving Average',
                             mode='lines'))
    
    # Plotting the predicted values
    fig.add_trace(go.Scatter(x=pred_df.index,
                             y=pred_df[stock],
                             name='Predicted Moving Average',
                             mode='lines'))
    
    # Setting the labels
    fig.update_layout(title=f'Predicting the Moving Average for the Next {days_to_predict} days for {stock}',
                      xaxis_title='Date',
                      yaxis_title='Prices')
    
    fig.show()


In [41]:
for stock in stocks:
    
    # Finding the root mean squared error
    rmse = mean_squared_error(stock_df['MovAvg'][stock].loc[pred_df.index], pred_df[stock], squared=False)
print(f"On average, the model is off by {rmse} for {stock}\n")

On average, the model is off by 51.58072749465081 for AMZN



In [42]:
def get_positions(difference, thres=3, short=True):
    """
    Compares the percentage difference between actual 
    values and the respective predictions.
    
    Returns the decision or positions to long or short 
    based on the difference.
    
    Optional: shorting in addition to buying
    """
    
    if difference > thres/100:
        
        return 1
    
    
    elif short and difference < -thres/100:
        
        return -1
    
    
    else:
        
        return 0

In [43]:
# Creating a DF dictionary for trading the model
trade_df = {}

# Getting the percentage difference between the predictions and the actual values
trade_df['PercentDiff'] = (stock_df['Predictions'].dropna() / 
                           stock_df['MovAvg'].loc[stock_df['Predictions'].dropna().index]) - 1

# Getting positions
trade_df['Positions'] = trade_df['PercentDiff'].applymap(lambda x: get_positions(x, 
                                                                                 thres=1, 
                                                                                 short=True) / len(stocks))

# Preventing lookahead bias by shifting the positions
trade_df['Positions'] = trade_df['Positions'].shift(2).dropna()

# Getting Log Returns
trade_df['LogReturns'] = stock_df['LogReturns'].loc[trade_df['Positions'].index]                                    


In [50]:
# Calculating Returns by multiplying the 
# positions by the log returns
returns = trade_df['Positions'] * trade_df['LogReturns']
# Calculating the performance as we take the cumulative 
# sum of the returns and transform the values back to normal
performance = returns.cumsum().apply(np.exp)
# Plotting the performance per stock
px.line(performance,
        x=performance.index,
        y=performance.columns,
        title='Returns Per Stock Using ARIMA Forecast',
        labels={'variable':'Stocks',
                'value':'Returns'})

In [45]:
# Returns for the portfolio
returns = (trade_df['Positions'] * trade_df['LogReturns']).sum(axis=1)

# Returns for SPY
spy = yf.download('SPY', start=returns.index[0]).loc[returns.index]

spy = spy['Adj Close'].apply(np.log).diff().dropna().cumsum().apply(np.exp)

# Calculating the performance as we take the cumulative sum of the returns and transform the values back to normal
performance = returns.cumsum().apply(np.exp)

# Plotting the comparison between SPY returns and ARIMA returns
fig = go.Figure()

fig.add_trace(go.Scatter(x=spy.index,
                         y=spy,
                         name='SPY Returns',
                         mode='lines'))

fig.add_trace(go.Scatter(x=performance.index,
                         y=performance.values,
                         name='ARIMA Returns on Portfolio',
                         mode='lines'))

fig.update_layout(title='SPY vs ARIMA Overall Portfolio Returns',
                  xaxis_title='Date',
                  yaxis_title='Returns')

fig.show()

[*********************100%***********************]  1 of 1 completed
