In [18]:
%%javascript
// Replace the output area's `_should_scroll` hook with one that always answers
// "no", whatever line count it is given.
// NOTE(review): presumably this keeps long cell outputs from being collapsed
// into a scroll box in the classic notebook front end — confirm it is still
// needed (the `IPython` global does not exist in JupyterLab).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

## Import Libraries

In [42]:
import yfinance as yf
import pandas as pd
import numpy as np
import sys
import warnings
import plotly.express as px

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from plotnine import *
from sklearn.metrics import make_scorer, mean_squared_error
# from plotly.tools import mpl_to_plotly as ggplotly


## Add Data Wrappers

In [86]:
class StockDataWrapper:
    """Bundle a stock-price DataFrame with metadata about how it was collected.

    A StockDataWrapper requires, at the minimum, a pandas DataFrame (the primary
    object being wrapped) and the stock's symbol as a string, so that the stock
    can be referred to in later formatting.

    Some attributes must be set (from user input or a config file) before
    certain methods are called. Specifically:

      * ``granularity`` and ``num_days`` must be set before calling
        ``compute_maximal_lag_variables``.
      * ``lag_length`` must be set, and the matching ``Close_L<i>`` columns
        created with ``add_lag_variables``, before calling the moving-average,
        standard-deviation, Bollinger-band, EMA or MACD methods.

    Attributes:
        data_frame:    A pandas DataFrame object containing stocks data such as
            Open, Close, Volume, High, Low, etc. The methods of this class read
            the ``Close`` column and add derived columns next to it.

        stock_symbol:   A string representing the stock's symbol (i.e. AAPL for Apple)

        granularity:    The time interval between two data points
            [1m, 2m, 5m, 15m, 30m, 1h]; 0 while unset.

        num_days:    The number of days of stock data that we are looking at;
            0 while unset.

        lag_length:    The number of lagged ``Close`` columns (previous data
            points) available to the indicator methods; 0 while unset.
    """

    # Data points assumed per day of data for each supported granularity; used
    # to estimate the total number of rows from num_days.
    _ENTRIES_PER_DAY = {
        '1m': 389,
        '2m': 195,
        '5m': 78,
        '15m': 26,
        '30m': 13,
        '1h': 7,
    }

    def __init__(self, data_frame, stock_symbol):
        """Initializes the instance based on a pandas DataFrame.

        Args:
            data_frame: a pandas DataFrame object
            stock_symbol: a string representing the stock's symbol (i.e. AAPL for Apple)
        Raises:
            ValueError: if one of the arguments is not the specified data type
        """
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError("data_frame must be a pandas DataFrame object")
        if not isinstance(stock_symbol, str):
            raise ValueError("stock_symbol must be a string")

        self.data_frame = data_frame
        self.stock_symbol = stock_symbol
        # 0 is the "not set yet" sentinel for the three settings below.
        self.granularity = 0
        self.num_days = 0
        self.lag_length = 0

    def __str__(self):
        """Describe the wrapper: symbol, settings (or a note that a setting
        is missing), followed by the DataFrame's own string form.
        """
        my_str = f'Stock Name: {self.stock_symbol}\n'

        # Report each setting, or say that it has not been set yet.
        if self.granularity == 0:
            my_str += "Granularity of data not set\n"
        else:
            my_str += f"Granularity: {self.granularity}\n"

        if self.num_days == 0:
            my_str += "Number of days of stock data not set\n"
        else:
            my_str += f"Number of days of stock data: {self.num_days}\n"

        if self.lag_length == 0:
            my_str += "Number of lag variables not set\n"
        else:
            my_str += f"Number of lag variables: {self.lag_length}\n"

        my_str += "\n\n"
        # Delegate to the underlying pandas DataFrame's __str__
        my_str += str(self.data_frame)

        return my_str

    # ------------------------------------------------------------------
    # Lag columns
    # ------------------------------------------------------------------

    @staticmethod
    def _lag_column_names(length):
        """Return the names ``Close_L1`` .. ``Close_L<length>``."""
        return ['Close_L' + str(i) for i in range(1, length + 1)]

    def add_lag_variables(self, num_lags):
        """Add columns ``Close_L1`` .. ``Close_L<num_lags>`` to the data frame.

        ``Close_L<i>`` is the ``Close`` column shifted down by ``i`` rows. The
        whole frame is then back-filled so the leading rows of each lag column
        (which have no earlier value) do not contain NaN.

        This method does not touch ``lag_length``; the caller sets it.

        Args:
            num_lags: number of lag columns to create
        """
        for i, index_str in enumerate(self._lag_column_names(num_lags), start=1):
            self.data_frame[index_str] = self.data_frame['Close'].shift(i)

        # Backfill the entries to remove any NaN
        self.data_frame = self.data_frame.bfill(axis=0)

    def clean_lag_variables(self):
        """Drop every column whose name contains ``Close_L``."""
        existing_lag_names = list(self.data_frame.filter(regex='Close_L'))

        if existing_lag_names:
            self.data_frame = self.data_frame[
                self.data_frame.columns.drop(existing_lag_names)
            ]

    def compute_maximal_lag_variables(self, ratio):
        """Compute the largest number of lag variables that should be used.

        The number of data points is estimated from ``granularity`` and
        ``num_days``; the result is that estimate multiplied by ``ratio`` and
        truncated to an integer.

        Args:
            ratio: a float strictly between 0 and 1; the fraction of the
                estimated number of data points that may be used as lags
        Returns:
            The maximum number of lag variables, as an int.
        Raises:
            NameError: if granularity or num_days has not been set, or if
                granularity is not one of the supported values
            ValueError: if ratio is not a float strictly between 0 and 1
        """
        if self.granularity == 0 or self.num_days == 0:
            raise NameError("Both the granularity and number of days of stock data" +
                  " must be set prior to computing the maximum number of lag variables\n")
        if not isinstance(ratio, float):
            raise ValueError("Ratio must be a decimal between 0 and 1")
        if ratio <= 0 or ratio >= 1:
            raise ValueError("Ratio must be a decimal between 0 and 1")

        entries_per_day = None
        if isinstance(self.granularity, str):
            entries_per_day = self._ENTRIES_PER_DAY.get(self.granularity)
        if entries_per_day is None:
            raise NameError("Granularity is not set properly: must be set to " +
                  "one of: 1m, 2m, 5m, 15m, 30m, 1h\n")

        data_entries = entries_per_day * self.num_days

        # Honour the caller's ratio (previously a hard-coded 0.3 was used and
        # the validated argument was ignored).
        return int(ratio * data_entries)

    # ------------------------------------------------------------------
    # Indicators built on the lag columns
    # ------------------------------------------------------------------

    def _validate_lag_window(self, length, indicator, param_name="length"):
        """Shared argument checks for the indicator methods.

        Args:
            length: the window size requested by the caller
            indicator: human-readable indicator name used in the error text
            param_name: name of the caller's parameter, used in the error text
        Raises:
            ValueError: if length is not an int, is smaller than 1, or is
                larger than lag_length
            NameError: if lag_length has not been set
        """
        if not isinstance(length, int):
            raise ValueError(f"Parameter {param_name} must be integer")
        if self.lag_length == 0:
            raise NameError(f"Lag variables must be set prior to adding {indicator}")
        if length < 1:
            # A zero-width window has no lag columns to aggregate.
            raise ValueError(f"Parameter {param_name} must be at least 1")
        if length > self.lag_length:
            raise ValueError("Cannot take the average of more lag varaibles than are availible" +
                  f" (currently {self.lag_length})\n")

    def add_simple_moving_standard_deviation(self, length):
        """Add column ``SMSD_<length>``: the row-wise standard deviation of
        ``Close_L1`` .. ``Close_L<length>``.

        Args:
            length: number of lag columns to include (1 <= length <= lag_length)
        Raises:
            ValueError, NameError: see ``_validate_lag_window``
        """
        self._validate_lag_window(length, "simple moving standard deviation")

        lag_predictors = self._lag_column_names(length)
        column_label = "SMSD_" + str(length)

        self.data_frame[column_label] = self.data_frame[lag_predictors].std(axis=1, skipna=True)

    def add_simple_moving_average(self, length):
        """Add column ``SMA_<length>``: the row-wise sum of ``Close_L1`` ..
        ``Close_L<length>`` divided by ``length``.

        Args:
            length: number of lag columns to include (1 <= length <= lag_length)
        Raises:
            ValueError, NameError: see ``_validate_lag_window``
        """
        self._validate_lag_window(length, "simple moving average")

        lag_predictors = self._lag_column_names(length)
        column_label = "SMA_" + str(length)

        self.data_frame[column_label] = (
            self.data_frame[lag_predictors].sum(axis=1, skipna=True) / float(length)
        )

    def _add_bollinger_band(self, length, upper, num_std):
        """Add one Bollinger band column, creating SMA/SMSD columns if missing.

        Args:
            length: window size shared by the SMA and SMSD columns
            upper: True for ``upper_boll_<length>`` (SMA + offset), False for
                ``lower_boll_<length>`` (SMA - offset)
            num_std: how many standard deviations the band sits from the SMA
        """
        self._validate_lag_window(length, "Bollinger bands")

        SMSD_string = "SMSD_" + str(length)
        SMA_string = "SMA_" + str(length)

        # Reuse existing columns so repeated calls do not recompute them.
        if SMSD_string not in self.data_frame.columns:
            self.add_simple_moving_standard_deviation(length)
        if SMA_string not in self.data_frame.columns:
            self.add_simple_moving_average(length)

        offset = num_std * self.data_frame[SMSD_string]
        if upper:
            self.data_frame["upper_boll_" + str(length)] = self.data_frame[SMA_string] + offset
        else:
            self.data_frame["lower_boll_" + str(length)] = self.data_frame[SMA_string] - offset

    def add_upper_bollinger(self, length, num_std=1.0):
        """Add column ``upper_boll_<length>`` = SMA + num_std * SMSD.

        The ``SMA_<length>`` and ``SMSD_<length>`` columns are added first if
        they are not present yet.

        Args:
            length: window size (1 <= length <= lag_length)
            num_std: number of standard deviations above the moving average;
                the default of 1.0 reproduces the original behaviour. The
                column name does not encode num_std, so a second call with a
                different value overwrites the column.
        Raises:
            ValueError, NameError: see ``_validate_lag_window``
        """
        self._add_bollinger_band(length, upper=True, num_std=num_std)

    def add_lower_bollinger(self, length, num_std=1.0):
        """Add column ``lower_boll_<length>`` = SMA - num_std * SMSD.

        The ``SMA_<length>`` and ``SMSD_<length>`` columns are added first if
        they are not present yet.

        Args:
            length: window size (1 <= length <= lag_length)
            num_std: number of standard deviations below the moving average;
                the default of 1.0 reproduces the original behaviour. The
                column name does not encode num_std, so a second call with a
                different value overwrites the column.
        Raises:
            ValueError, NameError: see ``_validate_lag_window``
        """
        self._add_bollinger_band(length, upper=False, num_std=num_std)

    def _exponential_moving_average_helper_(self, input_vec, smoothing_factor):
        """Return the final value of an exponential moving average.

        The average is seeded with the first element of ``input_vec`` and then
        updated as ``smoothing_factor * x + (1 - smoothing_factor) * previous``
        for each following element.

        Args:
            input_vec: a non-empty pandas Series of values, oldest first
            smoothing_factor: weight given to each new value
        Returns:
            The average after the last element has been folded in.
        """
        values = input_vec.to_numpy()
        running = values[0]

        for value in values[1:]:
            running = smoothing_factor * value + (1 - smoothing_factor) * running

        return running

    def add_exponential_moving_average(self, lag_length):
        """Add column ``EMA_<lag_length>`` computed from earlier ``Close`` values.

        Row ``i`` is the EMA of the (up to) ``lag_length`` closes strictly
        before row ``i``. The first two rows simply repeat the first close,
        since they have at most one earlier value.

        Args:
            lag_length: window size (1 <= lag_length <= self.lag_length)
        Raises:
            ValueError, NameError: see ``_validate_lag_window``
        """
        self._validate_lag_window(lag_length, "exponential moving average",
                                  param_name="lag_length")

        column_label = "EMA_" + str(lag_length)
        closes = self.data_frame["Close"]
        length = closes.shape[0]
        smoothing_factor = float(2 / (lag_length + 1))

        # Built by appending so frames with zero or one row also work.
        buffer = []
        for i in range(length):
            if i < 2:
                buffer.append(closes.iloc[0])
            elif i <= lag_length:
                # Fewer than a full window of earlier rows: use all of them,
                # with the smoothing factor that matches that shorter window.
                buffer.append(self._exponential_moving_average_helper_(
                    closes.iloc[0:i],
                    2 / (i + 1)
                ))
            else:
                buffer.append(self._exponential_moving_average_helper_(
                    closes.iloc[i - lag_length:i],
                    smoothing_factor
                ))

        self.data_frame[column_label] = buffer

    def add_MACD(self, max_lag, min_lag):
        """Add column ``MACD_<min_lag>_<max_lag>`` = EMA_<min_lag> - EMA_<max_lag>.

        Either EMA column is added first if it is not present yet.

        Args:
            max_lag: window of the slower EMA
            min_lag: window of the faster EMA; must be strictly smaller than
                max_lag
        Raises:
            ValueError: if an argument is not an int or min_lag >= max_lag
            ValueError, NameError: from ``add_exponential_moving_average`` when
                an EMA column has to be created
        """
        if not isinstance(max_lag, int) or not isinstance(min_lag, int):
            raise ValueError("Parameters max_lag and min_lag must be integer")
        if min_lag >= max_lag:
            raise ValueError("min_lag must be strictly smaller than max_lag")

        EMA_string_min = "EMA_" + str(min_lag)
        EMA_string_max = "EMA_" + str(max_lag)
        column_label = "MACD_" + str(min_lag) + "_" + str(max_lag)

        if EMA_string_min not in self.data_frame.columns:
            self.add_exponential_moving_average(min_lag)

        if EMA_string_max not in self.data_frame.columns:
            self.add_exponential_moving_average(max_lag)

        self.data_frame[column_label] = (self.data_frame[EMA_string_min] - self.data_frame[EMA_string_max])


## Import Data from User Input

In [82]:
def create_stock_data_from_input():
    """Interactively download intraday stock data and wrap it.

    Prompts for (1) the number of days of data, 1 through 7, (2) the
    granularity, one of 1m/2m/5m/15m/30m/1h, and (3) the stock symbol, then
    downloads the data with ``yf.download`` for the window that starts that
    many calendar days before today and ends today.

    Returns:
        A StockDataWrapper around the downloaded frame, with ``num_days`` and
        ``granularity`` already set.
    Raises:
        SystemExit: if the user types 'Exit' at one of the first two prompts
        ValueError: if the download returned no rows
    """
    # --- number of days -------------------------------------------------
    while True:
        days_answer = input("How many days of intraday stock market data should " +
                            " we use to build our model? Enter a value between " +
                            " 1 and 7:\n(Type 'Exit' to quit)\n").strip()

        # isdecimal() only accepts characters int() can parse; isnumeric()
        # also accepts e.g. '½' or '²', which would make int() raise.
        if days_answer.isdecimal():
            num_days_to_build = int(days_answer)
            if 0 < num_days_to_build < 8:
                break
            print("The number of days must be between 1 and 7 " +
                  "— please retry. \n (Type 'Exit' to quit)\n")
        elif "exit" in days_answer.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry. \n(Type 'Exit' to quit)\n")

    today = date.today()
    num_days_prior = today - timedelta(days=num_days_to_build)

    # --- granularity ----------------------------------------------------
    granularity_options = ['1m', '2m', '5m', '15m', '30m', '1h']

    while True:
        granularity_input = input("How often should our model look at" +
                                  " stock prices? Choose from 1m, 2m, 5m," +
                                  " 15m, 30m or 1h.\n(Type 'Exit' to quit)\n"
                                  ).strip().lower()

        if granularity_input in granularity_options:
            break
        if "exit" in granularity_input:
            sys.exit("Exiting program")
        print("Input was not among the options 1m, 2m," +
              " 5m, 15m, 30m, or 1h — please retry." +
              "\n(Type 'Exit' to quit)\n")

    # --- symbol and download -------------------------------------------
    stock_symbol = input(
        "Please input the stock symbol you would like to examine: (e.g. AAPL)\n"
    ).strip().upper()

    df = pd.DataFrame(yf.download(stock_symbol,
                                  start=num_days_prior,
                                  end=today,
                                  interval=granularity_input))

    # Fail here with a clear message instead of letting an empty frame reach
    # the plotting / indicator code and fail there with an unrelated error.
    if df.empty:
        raise ValueError(
            f"No {granularity_input} data was returned for '{stock_symbol}' "
            f"between {num_days_prior} and {today}; check the symbol and make "
            "sure the window contains a trading day"
        )

    stock_data = StockDataWrapper(df, stock_symbol)
    stock_data.num_days = num_days_to_build
    stock_data.granularity = granularity_input

    return stock_data



def add_lags_from_input(stock_data):
    """Interactively choose how many lag variables to add to ``stock_data``.

    Any existing lag columns are removed first. The user is then asked for a
    number of previous data points until a value between 1 and the maximum
    reported by ``stock_data.compute_maximal_lag_variables`` is entered; that
    value is stored in ``stock_data.lag_length`` and the lag columns are
    created with ``stock_data.add_lag_variables``.

    Args:
        stock_data: an object offering ``compute_maximal_lag_variables``,
            ``clean_lag_variables``, ``add_lag_variables`` and a writable
            ``lag_length`` attribute (a StockDataWrapper)
    Raises:
        SystemExit: if the user types 'Exit'
    """
    # Largest share of the data points that may be used as lags.
    max_lag_ratio = 0.3

    max_lag_len = stock_data.compute_maximal_lag_variables(max_lag_ratio)
    if max_lag_len == -1:
        print("Cannot set lag variables without knowing granularity or number of days of data")
        return

    # Clean up previous entries
    stock_data.clean_lag_variables()

    while True:
        answer = input(
            "How many previous data points should our model look at?\n (Type 'Exit' to quit)\n"
        ).strip()

        # isdecimal() only accepts characters that int() can parse.
        if answer.isdecimal():
            lag_length = int(answer)
            # Zero lags would leave lag_length at its "unset" value and make
            # every indicator method refuse to run, so require at least one.
            if 1 <= lag_length <= max_lag_len:
                stock_data.lag_length = lag_length
                stock_data.add_lag_variables(lag_length)
                return
            print("The number of previous data points considered should be at least 1" +
                  f" and not exceed {max_lag_ratio:.0%} \nof the total number of data points" +
                  f" (in this case, {max_lag_len}) -- please" +
                  " retry. \n\n(Type 'Exit' to quit)")
        elif "exit" in answer.lower():
            # Test the answer just read; the previous code referenced an
            # undefined name here and raised NameError on any non-numeric input.
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry. \n(Type 'Exit' to quit)\n")

# Exploratory Data Analysis

In [85]:
stonks = create_stock_data_from_input()

# Plot 1: the raw closing prices, before any derived columns exist.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y='Close',
        title=f"{stonks.stock_symbol} Stock Prices",
    )
    fig.show()

add_lags_from_input(stonks)

# The indicators use a window of 80% of the chosen lag count; the second EMA
# (and the fast side of the MACD) uses half of that window.
display_days = int(0.8 * stonks.lag_length)
half_display_days = display_days // 2

SMA_string = f'SMA_{display_days}'
lower_boll_str = f'lower_boll_{display_days}'
upper_boll_str = f'upper_boll_{display_days}'
EMA_string = f'EMA_{display_days}'
EMA_string_2 = f'EMA_{half_display_days}'
MACD_string = f'MACD_{half_display_days}_{display_days}'

stonks.add_simple_moving_average(display_days)
stonks.add_upper_bollinger(display_days)
stonks.add_lower_bollinger(display_days)
stonks.add_exponential_moving_average(display_days)
stonks.add_exponential_moving_average(half_display_days)
stonks.add_MACD(display_days, half_display_days)

# Plot 2: close price with its moving average and both bands (bands dotted).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=[SMA_string, upper_boll_str, lower_boll_str, 'Close'],
        color_discrete_map={
            SMA_string: "#85deb1",
            upper_boll_str: "#b6d1c3",
            lower_boll_str: "#b6d1c3",
            "Close": "#0059ff",
        },
        title=f"{stonks.stock_symbol} Stock Prices w/ Bollinger Bands",
    )
    for band_name in (upper_boll_str, lower_boll_str):
        fig.update_traces(selector={"name": band_name}, line={"dash": "dot"})

    fig.show()

# Plot 3: the two exponential moving averages.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=[EMA_string, EMA_string_2],
        color_discrete_map={
            EMA_string: "#38b9ff",
            EMA_string_2: "#44fcd1",
        },
        title=f"{stonks.stock_symbol} Exponential Moving Averages",
    )
    fig.show()

# Plot 4: the MACD column.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=MACD_string,
        color_discrete_map={MACD_string: "#edff47"},
        title=f"{stonks.stock_symbol} MACD",
    )
    fig.show()


How many days of intraday stock market data should  we use to build our model? Enter a value between  1 and 7:
(Type 'Exit' to quit)
1
How often should our model look at stock prices? Choose from 1m, 2m, 5m, 15m, 30m or 1h.
(Type 'Exit' to quit)
1m
Please input the stock symbol you would like to examine: (e.g. AAPL)
aapl


[*********************100%%**********************]  1 of 1 completed


How many previous data points should our model look at?
 (Type 'Exit' to quit)20


# Setting Up Models

In [49]:
# Predictors
# Predictors: every lagged close column. Outcome: the unlagged close.
predictors = stonks.data_frame.filter(regex='Close_L')
outcome = stonks.data_frame["Close"]

# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows — confirm that is intended for this
# time-ordered data.
trainX, testX, trainY, testY = train_test_split(
    predictors,
    outcome,
    test_size=0.25,
    random_state=43,
)

# Fit the scaler on the training predictors only, then apply it to both sets.
scaler = preprocessing.StandardScaler()
trainX = scaler.fit_transform(trainX)
testX = scaler.transform(testX)

In [53]:
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
l1_ratios = np.arange(0.0, 1.0, 0.1)

eNet = ElasticNet()

# Cross-validated search over every (alpha, l1_ratio) pair, scored by R^2.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid = GridSearchCV(
        eNet,
        {"alpha": alphas, "l1_ratio": l1_ratios},
        scoring='r2',
        cv=10,
    )
    grid.fit(trainX, trainY)

# One row per alpha, one column per l1_ratio.
scores_mean = grid.cv_results_['mean_test_score'].reshape(
    len(alphas), len(l1_ratios)
)

# Wide table: the l1 ratios plus one score column per alpha.
elastic_net_performance = pd.DataFrame({"L1_Ratio": l1_ratios})
for val, alpha_scores in zip(alphas, scores_mean):
    elastic_net_performance[f"alpha_{val}"] = alpha_scores

fig = px.line(
    elastic_net_performance,
    x="L1_Ratio",
    y=[col for col in elastic_net_performance.columns if 'alpha' in col],
    title="Elastic Net Performance",
)
fig.update_layout(yaxis_title="R^2 value")
fig.show()


Datetime
2024-06-06 09:30:00-04:00    196.130005
2024-06-06 09:31:00-04:00    196.259995
2024-06-06 09:32:00-04:00    196.108200
2024-06-06 09:33:00-04:00    195.899994
2024-06-06 09:34:00-04:00    195.770004
                                ...    
2024-06-06 15:55:00-04:00    194.320007
2024-06-06 15:56:00-04:00    194.175003
2024-06-06 15:57:00-04:00    194.410004
2024-06-06 15:58:00-04:00    194.345001
2024-06-06 15:59:00-04:00    194.479996
Name: Close, Length: 390, dtype: float64
Datetime
2024-06-06 09:30:00-04:00    196.130005
2024-06-06 09:31:00-04:00    196.259995
2024-06-06 09:32:00-04:00    196.108200
2024-06-06 09:33:00-04:00    195.899994
2024-06-06 09:34:00-04:00    195.770004
2024-06-06 09:35:00-04:00    195.486801
2024-06-06 09:36:00-04:00    195.576096
2024-06-06 09:37:00-04:00    195.529999
2024-06-06 09:38:00-04:00    195.529999
2024-06-06 09:39:00-04:00    195.520004
Name: Close, dtype: float64
