In [1]:
%%javascript
// Override the notebook front-end's output-area scroll check so that it always
// answers "no": the `lines` argument is ignored and `false` is returned for
// every output, whatever its length.
// NOTE(review): presumably this keeps long outputs (e.g. the plotly figures
// below) from being collapsed into a scroll box -- confirm against the
// notebook front-end in use; `_should_scroll` is a private API.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

## Import Libraries

In [6]:
import yfinance as yf
import pandas as pd
import numpy as np
import sys
import warnings
import plotly.express as px

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from plotnine import *
from sklearn.metrics import make_scorer, mean_squared_error
# from plotly.tools import mpl_to_plotly as ggplotly


## Add Data Wrappers

In [7]:
class StockDataWrapper:
    """
    A wrapper class to bundle stock data (stored in a pandas DataFrame) together with information
    about how the data was collected (such as what the time frame is between two data points).

    A StockDataWrapper at the minimum requires a pandas DataFrame (the primary object we are
    interested in wrapping) as well as the respective stock symbol passed as a string so that we
    are able to easily refer to the specific stock in later formatting.

        IMPORTANT: granularity and num_days are expected to be set by whoever loads the stock
                data into the data_frame object. lag_length is recorded by add_lag_variables
                (it may still be assigned by hand).

    Lag variables must exist (lag_length != 0) before calling any method that uses prior data
    (add_simple_moving_average, add_simple_moving_standard_deviation, the Bollinger bands,
    add_exponential_moving_average and, through it, add_MACD).


    Attributes:
        data_frame:    A pandas DataFrame object containing stocks data such as Open, Close,
            Volume, High, Low, etc. Every method here reads the 'Close' column.

        stock_symbol:   A string representing the stock's symbol (i.e. AAPL for Apple)

        granularity:    The time interval between two data points [1m, 2m, 5m, 15m, 30m, 1h];
            0 means "not set"

        num_days:    The number of days of stock data that have been pulled; 0 means "not set"

        lag_length:    The number of previous data points that are available to use for
            computations (such as moving averages); 0 means "no lag variables"
    """

    # Column-name prefix shared by every lag variable (Close_L1, Close_L2, ...)
    _LAG_PREFIX = "Close_L"

    def __init__(self, data_frame, stock_symbol):
        """
        Initializes the instance based on a pandas DataFrame.

        The frame is stored by reference (it is not copied), and granularity, num_days and
        lag_length all start at 0, which the rest of the class treats as "not set".

        Args:
            data_frame: a pandas DataFrame object
            stock_symbol: a string representing the stock's symbol (i.e. AAPL for Apple)
        Raises:
            ValueError: if one of the arguments is not the specified data type
        """

        #################
        #  ERROR HANDLING
        #################
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError("data_frame must be a pandas DataFrame object")
        if not isinstance(stock_symbol, str):
            raise ValueError("stock_symbol must be a string")

        self.data_frame = data_frame
        self.stock_symbol = stock_symbol
        # 0 is the "not set" sentinel for all three of these
        self.granularity = 0
        self.num_days = 0
        self.lag_length = 0

    def __str__(self):
        """
        Indicates how to represent a StockDataWrapper object when
        passed as a string to print

        Returns:
            String concatenating information on the StockDataWrapper's variables
            (each shown as NOT SET while it still holds the sentinel 0) together
            with the __str__ of the pandas DataFrame
        """
        pieces = [f'Stock Name: {self.stock_symbol}\n']

        described_fields = (
            ("Granularity", self.granularity),
            ("Number of days of stock data", self.num_days),
            ("Number of lag variables", self.lag_length),
        )
        for label, value in described_fields:
            shown = "NOT SET" if value == 0 else value
            pieces.append(f"{label}: {shown}\n")

        # Call the underlying Pandas DataFrame object's __str__
        pieces.append("\n\n")
        pieces.append(str(self.data_frame))

        return "".join(pieces)

    def _lag_column_names(self, length):
        """Returns the names of the first `length` lag columns: Close_L1 ... Close_L<length>."""
        return [self._LAG_PREFIX + str(i) for i in range(1, length + 1)]

    def _require_valid_window(self, length, indicator, param_name="length"):
        """
        Shared argument check for every indicator that is computed from lag variables.

        Args:
            length: the window length requested by the caller
            indicator: human readable name of the indicator, used in the error message
            param_name: the caller's name for `length`, used in the error message
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        if not isinstance(length, int):
            raise ValueError(f"Parameter {param_name} must be integer")
        if self.lag_length == 0:
            raise NameError(f"Lag variables must be set prior to adding {indicator}")
        # A window of 0 used to divide by zero (SMA) or index an empty slice (EMA)
        if length < 1:
            raise ValueError(f"Parameter {param_name} must be at least 1")
        if length > self.lag_length:
            raise ValueError("Cannot take the average of more lag varaibles than are availible" +
                  f" (currently {self.lag_length})\n")

    def add_lag_variables(self, num_lags):
        """
        Adds precisely num_lags lag variables to the underlying data frame.
        The lag variables are simply additional columns holding the 'Close' column
        shifted down by 1, 2, ..., num_lags rows. The holes this leaves at the top of
        each lag column are back-filled (note that the back-fill is applied to the
        whole frame, not only to the lag columns).

        On success lag_length is raised to num_lags (it is never lowered here, since
        lag columns created by an earlier, larger call are still present).

            WARNING: A large number of lag variables can lead to fragmentation in the
                DataFrame

        The (column) names of the lag variables that are added are of the form:
                            Close_L#
        where # is how far back that particular lag variable is looking.

        Args:
            num_lags: the number of lag variables we wish to add
        Raises:
            ValueError: if num_lags is not an integer, is smaller than 1, or is not
                smaller than the number of observations
        """

        #################
        #  ERROR HANDLING
        #################
        if not isinstance(num_lags, int):
            raise ValueError("Parameter num_lags must be integer")
        if num_lags < 1 or num_lags >= self.data_frame.shape[0]:
            raise ValueError("The number of lag variables must be between" +
                            " 1 and the total number of observations")

        closing_prices = self.data_frame['Close']
        for offset, column_name in enumerate(self._lag_column_names(num_lags), start=1):
            self.data_frame[column_name] = closing_prices.shift(offset)

        # Backfill the entries to remove any NaN
        self.data_frame = self.data_frame.bfill(axis=0)

        # Record what is now available so the indicator methods can be used directly
        self.lag_length = max(self.lag_length, num_lags)

    def delete_lag_variables(self):
        """
        Removes any possible lag variables from the data frame and resets
        lag_length to 0. This method implicitly assumes that all lag variables
        follow the naming convention

                        Close_L#

        where # is how far back that particular lag variable is looking. Any
        column whose name contains 'Close_L' is dropped.
        """
        existing_lag_names = list(self.data_frame.filter(regex=self._LAG_PREFIX))

        if existing_lag_names:
            self.data_frame = self.data_frame.drop(columns=existing_lag_names)

        # Reset lag_length (even when nothing had to be dropped) so that other
        # methods know not to compute data from lag variables
        self.lag_length = 0

    def add_simple_moving_average(self, length):
        """
        Computes the simple moving average of the stock's closing price (i.e.
        the mean of the first 'length number' of lag variables) and adds it to a
        new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            SMA_(length)

        where length is the parameter passed.

        Args:
            length: the number of data points to take the simple moving average
                over (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_valid_window(length, "simple moving average")

        # Sum the selected lag columns row by row, then divide by the window size
        window = self.data_frame[self._lag_column_names(length)]
        self.data_frame["SMA_" + str(length)] = window.sum(axis=1, skipna=True) / float(length)

    def add_simple_moving_standard_deviation(self, length):
        """
        Computes the simple moving standard deviation of the stock's closing
        price (i.e. the row-wise standard deviation of the first 'length number' of
        lag variables) and adds it to a new column in our pandas DataFrame. The
        column that is added follows the naming convention:

                            SMSD_(length)

        where length is the parameter passed.

        Args:
            length: the number of data points to take the simple moving standard
                deviation over (equivalently, the number of lag variables we are
                considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_valid_window(length, "simple moving standard deviation")

        window = self.data_frame[self._lag_column_names(length)]
        self.data_frame["SMSD_" + str(length)] = window.std(axis=1, skipna=True)

    def _ensure_sma_and_smsd(self, length):
        """
        Makes sure the SMA_(length) and SMSD_(length) columns exist, computing
        whichever is missing, and returns their names as (SMA name, SMSD name).
        """
        SMSD_string = "SMSD_" + str(length)
        SMA_string = "SMA_" + str(length)

        if SMSD_string not in self.data_frame.columns:
            self.add_simple_moving_standard_deviation(length)
        if SMA_string not in self.data_frame.columns:
            self.add_simple_moving_average(length)

        return SMA_string, SMSD_string

    def add_upper_bollinger(self, length):
        """
        Computes the upper Bollinger band of the stock's closing price, which is just the
        simple moving average + the simple moving standard deviation and adds it to a new
        column in our pandas DataFrame. The SMA_(length) and SMSD_(length) columns are
        added first if they are not present yet. The column that is added follows the
        naming convention:

                            upper_boll_(length)

        where length is the parameter passed.

        Args:
            length: the number of data points to take the bollinger band over
                (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_valid_window(length, "upper Bollinger band")
        SMA_string, SMSD_string = self._ensure_sma_and_smsd(length)

        self.data_frame["upper_boll_" + str(length)] = (
            self.data_frame[SMA_string] + self.data_frame[SMSD_string]
        )

    def add_lower_bollinger(self, length):
        """
        Computes the lower Bollinger band of the stock's closing price, which is just the
        simple moving average - the simple moving standard deviation and adds it to a new
        column in our pandas DataFrame. The SMA_(length) and SMSD_(length) columns are
        added first if they are not present yet. The column that is added follows the
        naming convention:

                            lower_boll_(length)

        where length is the parameter passed.

        Args:
            length: the number of data points to take the bollinger band over
                (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_valid_window(length, "lower Bollinger band")
        SMA_string, SMSD_string = self._ensure_sma_and_smsd(length)

        self.data_frame["lower_boll_" + str(length)] = (
            self.data_frame[SMA_string] - self.data_frame[SMSD_string]
        )

    def _exponential_moving_average_helper_(self, input_vec, smoothing_factor):
        """
        Folds a window of values into a single exponentially weighted average.

        The running value starts at the first element of the window and, for every
        following element x, is replaced by

                smoothing_factor * x + (1 - smoothing_factor) * running value

        Args:
            input_vec: a non-empty pandas Series, oldest value first
            smoothing_factor: weight given to each new value

        Returns:
            The running value after the last element has been folded in
        """
        running = input_vec.iloc[0]
        for position in range(1, input_vec.shape[0]):
            running = smoothing_factor * input_vec.iloc[position] + (1 - smoothing_factor) * running

        return running

    def add_exponential_moving_average(self, lag_length):
        """
        Computes an exponential moving average of the stock's closing price and adds
        it to a new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            EMA_(lag_length)

        For row i the average is taken over the closing prices strictly before row i:
            * rows 0 and 1 simply hold the first closing price;
            * while i <= lag_length, all earlier rows (0 .. i-1) are used with a
              smoothing factor of 2 / (i + 1);
            * afterwards the previous lag_length rows (i-lag_length .. i-1) are used
              with a smoothing factor of 2 / (lag_length + 1).

        Args:
            lag_length: the number of previous data points each average looks at
        Raises:
            ValueError: if lag_length is not an integer, is smaller than 1, or is
                larger than the number of lag variables available
            NameError: if self.lag_length has not been set yet (i.e. no lag variables
                have been added)
        """
        # The indicator name keeps the original message's spelling so the text is unchanged
        self._require_valid_window(lag_length, "exponental moving average",
                                   param_name="lag_length")

        closing_prices = self.data_frame["Close"]
        num_rows = closing_prices.shape[0]
        smoothing_factor = float(2 / (lag_length + 1))

        # Built row by row so frames with fewer than two rows need no special casing
        ema_values = []
        for i in range(num_rows):
            if i < 2:
                # Not enough history yet: fall back to the first closing price
                ema_values.append(closing_prices.iloc[0])
            elif i <= lag_length:
                # If there are fewer than lag_length data points previous to the
                # current one, average over everything that is available
                ema_values.append(self._exponential_moving_average_helper_(
                    closing_prices.iloc[0:i],
                    2 / (i + 1)
                ))
            else:
                ema_values.append(self._exponential_moving_average_helper_(
                    closing_prices.iloc[i - lag_length:i],
                    smoothing_factor
                ))

        self.data_frame["EMA_" + str(lag_length)] = ema_values

    def add_MACD(self, max_lag, min_lag):
        """
        Computes the MACD (moving average convergence/divergence) of the closing price,
        i.e. EMA_(min_lag) - EMA_(max_lag), and adds it to a new column in our pandas
        DataFrame. Either exponential moving average is computed first if its column is
        not present yet. The column that is added follows the naming convention:

                            MACD_(min_lag)_(max_lag)

        Args:
            max_lag: window of the slower exponential moving average
            min_lag: window of the faster exponential moving average
        Raises:
            ValueError: if either argument is not an integer or min_lag is not strictly
                smaller than max_lag; also raised by add_exponential_moving_average for
                a window it rejects
            NameError: raised by add_exponential_moving_average if no lag variables
                have been added
        """
        if not isinstance(max_lag, int) or not isinstance(min_lag, int):
            raise ValueError("Parameters max_lag and min_lag must be integer")
        if min_lag >= max_lag:
            raise ValueError("min_lag must be strictly smaller than max_lag")

        EMA_string_min = "EMA_" + str(min_lag)
        EMA_string_max = "EMA_" + str(max_lag)

        for ema_column, window in ((EMA_string_min, min_lag), (EMA_string_max, max_lag)):
            if ema_column not in self.data_frame.columns:
                self.add_exponential_moving_average(window)

        column_label = "MACD_" + str(min_lag) + "_" + str(max_lag)
        self.data_frame[column_label] = (self.data_frame[EMA_string_min] - self.data_frame[EMA_string_max])


## Import Data from User Input

In [8]:
def create_stock_data_from_input():
    """
    Interactively builds a StockDataWrapper from three console prompts.

    The user is asked for
        1. the number of days of intraday data to pull (an integer from 1 to 7),
        2. the granularity of the data (1m, 2m, 5m, 15m, 30m or 1h), and
        3. the stock symbol (upper-cased before use).
    The first two prompts repeat until a valid answer is given; answering anything
    containing 'exit' (case-insensitive) stops the program through sys.exit.

    Data is downloaded with yfinance from `today - number of days` up to `today`.

    Returns:
        A StockDataWrapper around the downloaded frame with num_days and granularity
        set from the answers.
    Raises:
        SystemExit: if the user asks to exit
        ValueError: if the download produced no rows for the requested symbol
    """
    # --- 1. number of days -------------------------------------------------
    while True:
        days_answer = input("How many days of intraday stock market data should " +
                            " we use to build our model? Enter a value between " +
                            " 1 and 7:\n(Type 'Exit' to quit)\n").strip()

        # isdecimal() (unlike isnumeric()) only accepts characters int() can parse,
        # so input such as '²' or '½' is reported instead of crashing int()
        if days_answer.isdecimal():
            num_days_to_build = int(days_answer)
            if 0 < num_days_to_build < 8:
                break
            print("The number of days must be between 1 and 7 "+
                  "— please retry. \n (Type 'Exit' to quit)\n")
        elif "exit" in days_answer.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry. \n(Type 'Exit' to quit)\n")

    today = date.today()
    num_days_prior = today - timedelta(num_days_to_build)

    # --- 2. granularity ----------------------------------------------------
    granularity_options = ['1m', '2m', '5m', '15m', '30m', '1h']

    while True:
        granularity_input = input("How often should our model look at" +
                                  " stock prices? Choose from 1m, 2m, 5m," +
                                  " 15m, 30m or 1h.\n(Type 'Exit' to quit)\n"
                                 ).strip().lower()

        if granularity_input in granularity_options:
            break
        if "exit" in granularity_input:
            sys.exit("Exiting program")
        print("Input was not among the options 1m, 2m," +
              " 5m, 15m, 30m, or 1h — please retry."+
              "\n(Type 'Exit' to quit)\n")

    # --- 3. symbol and download --------------------------------------------
    stock_symbol = input(
        "Please input the stock symbol you would like to examine: (e.g. AAPL)\n"
    ).strip().upper()

    # Wrapping in pd.DataFrame keeps the original behavior of always handing a
    # DataFrame to StockDataWrapper
    df = pd.DataFrame(yf.download(stock_symbol,
                                  start=num_days_prior,
                                  end=today,
                                  interval=granularity_input)
                     )

    # Fail here with a clear message rather than later inside plotting / lag code
    if df.empty:
        raise ValueError(
            f"No stock data was downloaded for symbol '{stock_symbol}' between "
            f"{num_days_prior} and {today} at interval {granularity_input}"
        )

    stock_data = StockDataWrapper(df, stock_symbol)
    stock_data.num_days = num_days_to_build
    stock_data.granularity = granularity_input

    return stock_data



def add_lags_from_input(stock_data, threshold):
    """
    Interactively asks how many lag variables the model should use and adds them.

    Any lag variables already present on stock_data are removed first. The prompt
    repeats until the user enters an integer between 1 and threshold (inclusive);
    answering anything containing 'exit' (case-insensitive) stops the program
    through sys.exit. On success the lag columns are created with
    stock_data.add_lag_variables and stock_data.lag_length is set to the answer.

    Args:
        stock_data: the object to modify; must provide delete_lag_variables(),
            add_lag_variables(n) and a writable lag_length attribute
        threshold: the largest number of lag variables that will be accepted
    Raises:
        ValueError: if threshold is smaller than 1 (no answer could be accepted)
        SystemExit: if the user asks to exit
    """
    if threshold < 1:
        raise ValueError("threshold must be at least 1: there are not enough data "
                         "points to add any lag variables")

    # Clean up previous entries
    stock_data.delete_lag_variables()

    while True:
        answer = input("How many previous data points should our model look at?\n(Type 'Exit' to quit)\n").strip()

        # isdecimal() (unlike isnumeric()) only accepts characters int() can parse
        if answer.isdecimal():
            lag_length = int(answer)

            if lag_length < 1:
                print("\nThe number of previous data points considered must be" +
                      " at least 1 -- please retry. \n")
            elif lag_length <= threshold:
                # Create the columns first so lag_length is only recorded on success
                stock_data.add_lag_variables(lag_length)
                stock_data.lag_length = lag_length
                return
            else:
                print("\nThe number of previous data points considered should" +
                      f" not exceed 30% \nof the total number of data points (in this case, {threshold}) -- please" +
                       " retry. \n")
        elif "exit" in answer.lower():
            sys.exit("Exiting program")
        else:
            print("""Non-integer passed as input — please retry.
                  \n
                  (Type 'Exit' to quit)\n""")

# Exploratory Data Analysis

In [10]:
# Pull the stock data interactively, then show the raw closing price.
stonks = create_stock_data_from_input()

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y='Close',
        title=f"{stonks.stock_symbol} Stock Prices",
    )
    fig.show()

# The number of lag variables should not exceed 30% of the
# number of observations
threshold = int(0.3 * stonks.data_frame.shape[0])

add_lags_from_input(stonks, threshold)

# Indicators are displayed over 80% of the chosen lag length; the second
# (faster) EMA and the MACD use half of that window.
display_days = int(0.8 * stonks.lag_length)
half_display_days = display_days // 2

SMA_string = f'SMA_{display_days}'
lower_boll_str = f'lower_boll_{display_days}'
upper_boll_str = f'upper_boll_{display_days}'
EMA_string = f'EMA_{display_days}'
EMA_string_2 = f'EMA_{half_display_days}'
MACD_string = f'MACD_{half_display_days}_{display_days}'

# Add every indicator column that the figures below read
stonks.add_simple_moving_average(display_days)
stonks.add_upper_bollinger(display_days)
stonks.add_lower_bollinger(display_days)
stonks.add_exponential_moving_average(display_days)
stonks.add_exponential_moving_average(half_display_days)
stonks.add_MACD(display_days, half_display_days)


# Closing price with its simple moving average and dotted Bollinger bands
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    bollinger_colors = {
        SMA_string: "#85deb1",
        upper_boll_str: "#b6d1c3",
        lower_boll_str: "#b6d1c3",
        "Close": "#0059ff",
    }
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=[SMA_string, upper_boll_str, lower_boll_str, 'Close'],
        color_discrete_map=bollinger_colors,
        title=f"{stonks.stock_symbol} Stock Prices w/ Bollinger Bands",
    )
    for band_name in (upper_boll_str, lower_boll_str):
        fig = fig.update_traces(selector={"name": band_name}, line={"dash": "dot"})

    fig.show()


# The two exponential moving averages
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    ema_colors = {EMA_string: "#38b9ff", EMA_string_2: "#44fcd1"}
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=[EMA_string, EMA_string_2],
        color_discrete_map=ema_colors,
        title=f"{stonks.stock_symbol} Exponential Moving Averages",
    )
    fig.show()


# MACD (difference of the two exponential moving averages)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=MACD_string,
        color_discrete_map={MACD_string: "#edff47"},
        title=f"{stonks.stock_symbol} MACD",
    )
    fig.show()


How many days of intraday stock market data should  we use to build our model? Enter a value between  1 and 7:
(Type 'Exit' to quit)
1
How often should our model look at stock prices? Choose from 1m, 2m, 5m, 15m, 30m or 1h.
(Type 'Exit' to quit)
1m
Please input the stock symbol you would like to examine: (e.g. AAPL)
INTC


[*********************100%%**********************]  1 of 1 completed


How many previous data points should our model look at?
(Type 'Exit' to quit)
30


# Setting Up Models

In [11]:
# Outcome: the closing price; predictors: every lag column (name contains 'Close_L')
outcome = stonks.data_frame["Close"]
predictors = stonks.data_frame.filter(regex='Close_L')

# Hold out 25% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): no shuffle argument is passed, so presumably the rows are shuffled
# before splitting -- confirm this is intended for time-ordered data.
trainX, testX, trainY, testY = train_test_split(
    predictors,
    outcome,
    test_size=0.25,
    random_state=43,
)

# Standardize with statistics learned from the training rows only, then apply
# the same transformation to both splits
scaler = preprocessing.StandardScaler().fit(trainX)
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

In [13]:
# Hyper-parameter grid: 7 regularization strengths x 10 L1/L2 mixing ratios
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
l1_ratios = np.arange(0.0, 1.0, 0.1)

eNet = ElasticNet()
elastic_net_grid = {"alpha": alphas, "l1_ratio": l1_ratios}

# 10-fold cross-validated search scored by R^2; warnings raised while fitting
# are silenced
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid = GridSearchCV(eNet, elastic_net_grid, scoring='r2', cv=10)
    grid.fit(trainX, trainY)

# predY = grid.predict(testX)

# One row per alpha, one column per l1_ratio
# NOTE(review): this reshape assumes the search enumerates l1_ratio fastest
# within each alpha -- confirm against GridSearchCV's parameter ordering.
mean_scores_flat = grid.cv_results_['mean_test_score']
scores_mean = mean_scores_flat.reshape(len(alphas), len(l1_ratios))

# Wide table: the L1 ratio plus one "alpha_<value>" score column per alpha
elastic_net_performance = pd.DataFrame({
    "L1_Ratio": l1_ratios,
    **{
        f"alpha_{alpha_value}": scores_mean[row, :]
        for row, alpha_value in enumerate(alphas)
    },
})

alpha_columns = [col for col in elastic_net_performance.columns if 'alpha' in col]

fig = px.line(
    elastic_net_performance,
    x="L1_Ratio",
    y=alpha_columns,
    title="Elastic Net Performance",
)
fig = fig.update_layout(yaxis_title="R^2 value")
fig.show()
