In [1]:
%%javascript
// Override the output area's scroll predicate so it always answers "no";
// presumably this keeps long outputs fully expanded instead of boxed in a
// scrollable frame (classic Notebook behaviour -- confirm for your front-end).
// The guard makes the cell a harmless no-op in front-ends that do not expose
// the global `IPython` object, instead of raising a ReferenceError.
if (typeof IPython !== "undefined" && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        return false;
    };
}

<IPython.core.display.Javascript object>

## Import Libraries

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import sys
import warnings
import plotly.express as px
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mercury as mr # for widgets

from math import ceil, floor
from plotly.subplots import make_subplots
from datetime import date, datetime, timedelta
from sklearn import preprocessing
# from sklearn import linear_model
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.exceptions import ConvergenceWarning
from plotnine import *
from sklearn.metrics import make_scorer, mean_squared_error
# from plotly.tools import mpl_to_plotly as ggplotly


# Random seed passed as `random_state` to train_test_split so the split is reproducible.
SEED = 3234
# Number of cross-validation folds handed to GridSearchCV (`cv=`).
CROSS_FOLDS = 10
# Scoring metric name handed to GridSearchCV (`scoring=`); "r2" is the coefficient of determination.
PERFORMANCE_METRIC = "r2"
# NOTE(review): presumably intended as an `n_jobs` value (-1 = all cores); it is not
# referenced by any grid search visible in this part of the notebook -- confirm usage.
PARALLELIZATION = -1


### Interactive Content

In [3]:
# Checkbox widget that lets the viewer toggle whether code cells are shown.
show_code = mr.Checkbox(label="Show code", value=True)

# Application-level Mercury settings; the checkbox's current value decides
# whether the code is rendered.
app = mr.App(
    title="yfinance Test",
    description="Testing Mercury functionality with yfinance",
    show_code=show_code.value,
    continuous_update=False,
)

mercury.Checkbox

## Add Data Wrappers

An important step in engineering our program is deciding the lifespan of each piece of information passed to us. It will ultimately be convenient to keep certain variables around, especially when they dictate the nature of the time series we examine (e.g. the time between observations). There are a few different ways we could go about accessing this data, though the easiest is to bundle the stock data together with its metadata in a small wrapper class, as we do below.

In [4]:
class StockDataWrapper:
    """
    A wrapper class to bundle stock data (stored in a pandas DataFrame) together with information
    about how the data was collected (such as what the time frame is between two data points)

    A StockDataWrapper class at the minimum requires a pandas DataFrame (the primary object we are
    interested in wrapping) as well as the respective stock symbol passed as a string so that we are
    able to easily refer to the specific stock in later formatting.

        IMPORTANT: It is expected that lag_length and num_days are set by the caller whenever the
                stock data is loaded into the data_frame object. add_lag_variables() does NOT
                set lag_length on its own.

    Certain variables must be set (either from user input or a configuration file) prior to calling
    a handful of methods. Specifically, the lag length must be set prior to calling any functions which
    utilize prior data (e.g. add_MACD, add_simple_moving_average,
    add_exponential_moving_average, etc.)


    Attributes:
        data_frame:    A pandas DataFrame object containing stocks data such as Open, Close,
            Volume, High, Low, etc. Every indicator method reads the 'Close' column.

        stock_symbol:   A string representing the stock's symbol (i.e. AAPL for Apple)

        granularity:    The time interval between two data points [1m, 2m, 5m, 15m, 30m, 1h].
            Initialised to 0, which __str__ reports as "NOT SET".

        num_days:    The number of days of stock data that have been pulled.
            Initialised to 0, which __str__ reports as "NOT SET".

        lag_length:    The number of previous data points that are available to use for computations
            (such as moving averages). 0 means "no lag variables".
    """

    def __init__(self, data_frame, stock_symbol):
        """
        Initializes the instance based on a pandas DataFrame.

        Args:
            data_frame: a pandas DataFrame object
            stock_symbol: a string representing the stock's symbol (i.e. AAPL for Apple)
        Raises:
            ValueError: if one of the arguments is not the specified data type
        """
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError("data_frame must be a pandas DataFrame object")
        if not isinstance(stock_symbol, str):
            raise ValueError("stock_symbol must be a string")

        self.data_frame = data_frame
        self.stock_symbol = stock_symbol
        # 0 is the "not set yet" sentinel for all three; the caller overwrites them
        self.granularity = 0
        self.num_days = 0
        self.lag_length = 0

    def __str__(self):
        """
        Indicates how to represent a StockDataWrapper object when
        passed as a string to print

        Returns:
            String listing the wrapper's metadata (unset values are shown as
            "NOT SET") followed by the __str__ of the pandas DataFrame
        """
        described = (
            ("Granularity", self.granularity),
            ("Number of days of stock data", self.num_days),
            ("Number of lag variables", self.lag_length),
        )
        lines = [f'Stock Name: {self.stock_symbol}']
        for label, value in described:
            lines.append(f"{label}: {'NOT SET' if value == 0 else value}")

        # Metadata block, two blank lines, then the underlying DataFrame's own __str__
        return "\n".join(lines) + "\n\n\n" + str(self.data_frame)

    # ------------------------------------------------------------------
    #  Private helpers
    # ------------------------------------------------------------------
    def _validate_length(self, length, indicator):
        """
        Shared argument checks for every indicator that consumes lag variables.

        Args:
            length: the window length requested by the caller
            indicator: human-readable indicator name used in the error message
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is
                larger than lag_length
            NameError: if lag_length has not been set yet
        """
        if not isinstance(length, int):
            raise ValueError("Parameter length must be integer")
        if self.lag_length == 0:
            raise NameError(f"Lag variables must be set prior to adding {indicator}")
        # A window of zero (or fewer) points has no mean / deviation / EMA
        if length < 1:
            raise ValueError("Parameter length must be at least 1")
        if length > self.lag_length:
            raise ValueError("Cannot take the average of more lag varaibles than are availible" +
                             f" (currently {self.lag_length})\n")

    @staticmethod
    def _lag_column_names(length):
        """Returns the column names Close_L1 ... Close_L(length)."""
        return ["Close_L" + str(i) for i in range(1, length + 1)]

    def _add_bollinger_band(self, length, prefix, signed_width):
        """
        Writes SMA_(length) + signed_width * SMSD_(length) into the column
        (prefix)(length), computing the SMA / SMSD columns first if missing.
        """
        self._validate_length(length, "Bollinger bands")

        SMSD_string = "SMSD_" + str(length)
        SMA_string = "SMA_" + str(length)

        # Existing SMA / SMSD columns are reused as-is, not recomputed
        if SMSD_string not in self.data_frame.columns:
            self.add_simple_moving_standard_deviation(length)
        if SMA_string not in self.data_frame.columns:
            self.add_simple_moving_average(length)

        self.data_frame[prefix + str(length)] = (
            self.data_frame[SMA_string] + signed_width * self.data_frame[SMSD_string]
        )

    # ------------------------------------------------------------------
    #  Lag variables
    # ------------------------------------------------------------------
    def add_lag_variables(self, num_lags):
        """
        Adds precisely num_lags lag variables to the underlying data frame.
        The lag variables are simply additional columns where the 'Close' entries
        are shifted forward by 1, 2, ..., num_lags rows. The leading NaN values
        created by the shift are back-filled.

        All lag columns are built first and attached with a single concatenation,
        which avoids the DataFrame fragmentation caused by inserting many columns
        one at a time. As a consequence the DataFrame originally passed to the
        constructor is not modified; self.data_frame is rebound to a new frame.

        The (column) names of the lag variables that are added are of the form:
                            Close_L#
        where # is how far back that particular lag variable is looking.
        Existing columns with those names are replaced.

        Args:
            num_lags: the number of lag variables we wish to add
        Raises:
            ValueError: if num_lags is not an integer, is smaller than 1, or is
                not smaller than the number of observations
        """
        if not isinstance(num_lags, int):
            raise ValueError("Parameter num_lags must be integer")
        if num_lags < 1 or num_lags >= self.data_frame.shape[0]:
            raise ValueError("The number of lag variables must be between" +
                            " 1 and the total number of observations")

        close = self.data_frame['Close']
        lag_frame = pd.DataFrame(
            {name: close.shift(i)
             for i, name in enumerate(self._lag_column_names(num_lags), start=1)},
            index=self.data_frame.index,
        )

        # Drop stale columns of the same name so a repeated call replaces them
        stale = [name for name in lag_frame.columns if name in self.data_frame.columns]
        combined = pd.concat([self.data_frame.drop(columns=stale), lag_frame], axis=1)

        # Backfill the entries to remove any NaN
        self.data_frame = combined.bfill(axis=0)

    def delete_lag_variables(self):
        """
        Removes any possible lag variables from the data frame. This method
        implicitly assumes that all lag variables follow the naming convention

                        Close_L#

        where # is how far back that particular lag variable is looking.
        If at least one such column was removed, lag_length is reset to 0.
        """
        existing_lag_names = list(self.data_frame.filter(regex='Close_L'))

        if existing_lag_names:
            self.data_frame = self.data_frame.drop(columns=existing_lag_names)
            # Reset lag_length so that other methods know not to compute
            # data from lag variables
            self.lag_length = 0

    # ------------------------------------------------------------------
    #  Simple moving statistics
    # ------------------------------------------------------------------
    def add_simple_moving_average(self, length):
        """
        Computes the simple moving average of the stock's closing price (i.e.
        the mean of the first 'length' lag variables) and adds it to a
        new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            SMA_(length)

        where length is the parameter passed.

        Args:
            length: the number of lag variables to average over
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is
                larger than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._validate_length(length, "simple moving average")

        lag_predictors = self._lag_column_names(length)
        # sum / length (rather than .mean) so a NaN lag still counts in the divisor
        self.data_frame["SMA_" + str(length)] = (
            self.data_frame[lag_predictors].sum(axis=1, skipna=True) / float(length)
        )

    def add_simple_moving_standard_deviation(self, length):
        """
        Computes the simple moving standard deviation of the stock's closing
        price (i.e. the standard deviation of the first 'length' lag variables)
        and adds it to a new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            SMSD_(length)

        where length is the parameter passed. pandas' default (sample) standard
        deviation is used.

        Args:
            length: the number of lag variables to take the standard deviation over
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is
                larger than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._validate_length(length, "simple moving standard deviation")

        lag_predictors = self._lag_column_names(length)
        self.data_frame["SMSD_" + str(length)] = (
            self.data_frame[lag_predictors].std(axis=1, skipna=True)
        )

    # ------------------------------------------------------------------
    #  Bollinger bands
    # ------------------------------------------------------------------
    def add_upper_bollinger(self, length, num_std=1):
        """
        Computes the upper Bollinger band of the stock's closing price, which is the
        simple moving average + num_std simple moving standard deviations, and adds it
        to a new column in our pandas DataFrame. The column that is added follows the
        naming convention:

                            upper_boll_(length)

        where length is the parameter passed. SMA_(length) and SMSD_(length) are
        added first if they are not already present.

        Args:
            length: the number of lag variables the band is computed over
            num_std: how many standard deviations the band sits above the
                moving average (defaults to 1)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is
                larger than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._add_bollinger_band(length, "upper_boll_", num_std)

    def add_lower_bollinger(self, length, num_std=1):
        """
        Computes the lower Bollinger band of the stock's closing price, which is the
        simple moving average - num_std simple moving standard deviations, and adds it
        to a new column in our pandas DataFrame. The column that is added follows the
        naming convention:

                            lower_boll_(length)

        where length is the parameter passed. SMA_(length) and SMSD_(length) are
        added first if they are not already present.

        Args:
            length: the number of lag variables the band is computed over
            num_std: how many standard deviations the band sits below the
                moving average (defaults to 1)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is
                larger than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._add_bollinger_band(length, "lower_boll_", -num_std)

    # ------------------------------------------------------------------
    #  Exponential moving average / MACD
    # ------------------------------------------------------------------
    def _exponential_moving_average_helper_(self, series, smoothing_factor):
        """
        Helper function for add_exponential_moving_average() which iterates the
        recursion

            EMA[0]   = data[0]
            EMA[n+1] = β * data[n+1] + (1 - β) * EMA[n]

        over the whole series and returns the final value. β is the smoothing
        factor: it dictates how heavily recent data points are weighted, and how
        quickly previous data points lose weight.

        Args:
            series: a pandas Series, typically passed as
                dataframe["column"].iloc[a:b]

            smoothing_factor: a float which dictates how heavily recent
                data points are weighted
        Returns:
            The EMA value at the last position of the series
        Raises:
            ValueError: if an argument has the wrong type or the series is empty
        """
        if not isinstance(series, pd.Series):
            raise ValueError("Parameter series must be pandas.Series data type")
        if not isinstance(smoothing_factor, float):
            raise ValueError("Parameter smoothing factor must be a floating point decimal")
        if series.empty:
            raise ValueError("Parameter series must contain at least one data point")

        # Positional array access; only the running value is needed, not the history
        values = series.to_numpy()
        running = values[0]
        for value in values[1:]:
            running = smoothing_factor * value + (1 - smoothing_factor) * running
        return running

    def add_exponential_moving_average(self, length):
        """
        Computes the exponential moving average of the stock's closing
        price and adds it to a new column in our pandas DataFrame. For row i the
        average is taken over the 'length' closing prices strictly before row i
        (the current row's own price is not used). The column that is added follows
        the naming convention:

                            EMA_(length)

        where length is the parameter passed. The smoothing factor is
        2 / (length + 1).

        Rows 0 and 1 are both seeded with the first closing price. For rows with
        fewer than 'length' earlier observations, all earlier observations are
        used with the smoothing factor 2 / (i + 1).

        Args:
            length: the number of previous data points the average looks back over
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is
                larger than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._validate_length(length, "exponental moving average")

        close = self.data_frame["Close"]
        n_rows = close.shape[0]
        smoothing_factor = float(2 / (length + 1))

        # Seed the (at most two) leading rows; frames shorter than two rows are fine
        averages = [close.iloc[0]] * min(n_rows, 2) if n_rows else []

        for i in range(2, n_rows):
            if i <= length:
                # Not enough history yet: use everything before row i
                averages.append(self._exponential_moving_average_helper_(
                    close.iloc[0:i],
                    2 / (i + 1)
                ))
            else:
                averages.append(self._exponential_moving_average_helper_(
                    close.iloc[i - length:i],
                    smoothing_factor
                ))

        self.data_frame["EMA_" + str(length)] = averages

    def add_MACD(self, max_lag, min_lag):
        """
        Computes the moving average convergence-divergence of the stock's closing
        price (i.e. the exponential moving average taken over min_lag minus
        the exponential moving average taken over max_lag)
        and adds it to a new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            MACD_(min_lag)_(max_lag)

        EMA_(min_lag) and EMA_(max_lag) are added first if they are not already present.

        Args:
            max_lag: window length of the slow exponential moving average
            min_lag: window length of the fast exponential moving average;
                must be at least 1 and strictly smaller than max_lag
        Raises:
            ValueError: if either argument is not an integer, if min_lag is
                smaller than 1, if min_lag >= max_lag, or if a required EMA
                needs more lag variables than are available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        if not isinstance(max_lag, int) or not isinstance(min_lag, int):
            raise ValueError("Parameters max_lag and min_lag must be integer")
        if self.lag_length == 0:
            raise NameError("Lag variables must be set prior to adding MACD")
        if min_lag >= max_lag:
            raise ValueError("min_lag must be strictly smaller than max_lag")
        if min_lag < 1:
            raise ValueError("min_lag must be at least 1")

        EMA_string_min = "EMA_" + str(min_lag)
        EMA_string_max = "EMA_" + str(max_lag)

        # Ensure that both exponential moving averages are available
        # in the DataFrame
        if EMA_string_min not in self.data_frame.columns:
            self.add_exponential_moving_average(min_lag)
        if EMA_string_max not in self.data_frame.columns:
            self.add_exponential_moving_average(max_lag)

        column_label = "MACD_" + str(min_lag) + "_" + str(max_lag)
        self.data_frame[column_label] = (
            self.data_frame[EMA_string_min] - self.data_frame[EMA_string_max]
        )

# Exploratory Data Analysis

In [1]:
##################################
# STEP 1: OBTAIN DESIRED STOCK + INTERVALS
##################################

# --- User inputs (Mercury widgets) ---
stock_symbol = mr.Text(label="Please input the stock symbol you would like to examine: (e.g. AAPL)",
                       value="AAPL")
num_days_to_build = mr.Slider(label="How many days of intraday stock market data should we use to build our model?",
                              value=1,
                              min=1,
                              max=7)
granularity_input = mr.Select(label="How often should our model look at stock prices?",
                                value="1m",
                                choices=["1m", "2m", "5m", "15m", "30m", "60m"])

# Apply num_days information to create the date range
# [num_days_prior, most_recent_stock_day) that is requested from yfinance.
# NOTE: despite its name, `yesterday` holds today's date; the name is kept
# because it is a notebook-level variable.
yesterday = date.today()
most_recent_stock_day = yesterday

# On Sunday (weekday 6) and Monday (weekday 0) the end of the range is moved
# back to the preceding Saturday, so the request ends with Friday's session.
if yesterday.weekday() == 6:
    most_recent_stock_day = yesterday - timedelta(days=1)
elif yesterday.weekday() == 0:
    most_recent_stock_day = yesterday - timedelta(days=2)

num_days_prior = most_recent_stock_day - timedelta(days=num_days_to_build.value)

df = pd.DataFrame(yf.download(stock_symbol.value,
                              start=num_days_prior,
                              end=most_recent_stock_day,
                              interval=granularity_input.value))

# Some yfinance versions return (field, ticker) MultiIndex columns even for a
# single symbol; the rest of the notebook indexes plain names such as 'Close'.
# NOTE(review): assumes level 0 carries the price-field names -- confirm
# against the installed yfinance version.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

# Fail early with a readable message instead of a KeyError / empty plot later on
# (unknown symbol, holiday-only range, network failure, ...).
if df.empty:
    raise ValueError(f"No {granularity_input.value} price data was returned for "
                     f"'{stock_symbol.value}' between {num_days_prior} and {most_recent_stock_day}")

# add user input information to StockDataWrapper object
stonks = StockDataWrapper(df, stock_symbol.value)
stonks.num_days = num_days_to_build.value
stonks.granularity = granularity_input.value

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(stonks.data_frame, x=stonks.data_frame.index,
                  y='Close',
                  title=f"{stonks.stock_symbol} Stock Prices")
    fig.show()

    
##################################
# STEP 2: SET UP TIME-SERIES DATA
##################################

# The largest lag offered is 30% of the observations, but never below 2: the MACD
# below needs a "half" window that is at least 1 and strictly smaller than the lag.
max_lag_choice = max(2, int(0.3 * stonks.data_frame.shape[0]))

lag_length = mr.Slider(label=f"How many previous {stonks.granularity}in intervals should our machine learning models look at?",
                       # Clamp the default so it never exceeds the slider's maximum
                       # (coarse granularities yield only a handful of rows)
                       value=min(15, max_lag_choice),
                       min=2,
                       max=max_lag_choice
                      )
stonks.lag_length = lag_length.value
stonks.add_lag_variables(stonks.lag_length)

# Window of the "fast" exponential moving average (half the lag, at least 1)
half_lag = max(1, stonks.lag_length // 2)

# Column names produced by the StockDataWrapper indicator methods
SMA_string = 'SMA_' + str(stonks.lag_length)
lower_boll_str = 'lower_boll_' + str(stonks.lag_length)
upper_boll_str = 'upper_boll_' + str(stonks.lag_length)
EMA_string = 'EMA_' + str(stonks.lag_length)
EMA_string_2 = 'EMA_' + str(half_lag)
MACD_string = 'MACD_' + str(half_lag) + '_' + str(stonks.lag_length)

stonks.add_simple_moving_average(stonks.lag_length)
stonks.add_upper_bollinger(stonks.lag_length)
stonks.add_lower_bollinger(stonks.lag_length)
stonks.add_exponential_moving_average(stonks.lag_length)
stonks.add_exponential_moving_average(half_lag)
stonks.add_MACD(stonks.lag_length, half_lag)


# --- Closing price with moving average and (dotted) Bollinger bands ---
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(stonks.data_frame, x=stonks.data_frame.index,
                  y=[SMA_string, upper_boll_str, lower_boll_str, 'Close',],
                  color_discrete_map={
                      SMA_string: "#85deb1",
                      upper_boll_str: "#b6d1c3",
                      lower_boll_str: "#b6d1c3",
                      "Close": "#0059ff",
                  },
                  title=f"{stonks.stock_symbol} Stock Prices w/ Bollinger Bands")
    for band_name in (upper_boll_str, lower_boll_str):
        fig.update_traces(selector={"name": band_name}, line={"dash": "dot"})
    fig.show()


# --- Slow vs. fast exponential moving averages ---
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(stonks.data_frame, x=stonks.data_frame.index,
                  y=[EMA_string, EMA_string_2],
                  color_discrete_map={
                      EMA_string: "#38b9ff",
                      EMA_string_2: "#44fcd1"
                  },
                  title=f"{stonks.stock_symbol} Exponential Moving Averages")
    fig.show()


# --- MACD (fast EMA minus slow EMA) ---
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(stonks.data_frame, x=stonks.data_frame.index,
                  y=MACD_string,
                  color_discrete_map={
                      MACD_string: "#edff47"
                  },
                  title=f"{stonks.stock_symbol} MACD").update_layout(height=220)
    fig.show()
    

NameError: name 'mr' is not defined

# Setting Up Models

After examining the data over our fixed period, it is now time to construct several machine learning models which will ultimately be used to forecast future prices.

In [17]:
# Predictors: every lag column (name contains 'Close_L').
predictors = stonks.data_frame.filter(like='Close_L')
# Outcome: the closing price at the current time step.
outcome = stonks.data_frame["Close"]

# Hold out 25% of the rows for testing; the split is seeded for reproducibility.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows -- confirm this is acceptable for
# time-series data whose predictors are lagged copies of the outcome.
X_train, X_test, Y_train, Y_test = train_test_split(
    predictors,
    outcome,
    test_size=0.25,
    random_state=SEED,
)

# Widget selecting which model family the following cells will fit.
model_choice = mr.Select(
    label="What statistical learning model would you like to use to predict stock prices?",
    value="Gradient-Boosted Trees",
    choices=[
        "Linear Regression",
        "k-Nearest Neighbors",
        "Random Forest",
        "Gradient-Boosted Trees",
    ],
)



mercury.Select

In [18]:
# Elastic Net grid search; only runs when the user selected "Linear Regression".
if model_choice.value == "Linear Regression":

    mr.Md("## Linear Regression Models")
    mr.Md("---------")
    mr.Md("The simplest models we wish to consider in this analysis are modifications of the usual OLS (ordinary least squares) regression algorithms. ")
    mr.Md("Similar to above, we provide two different means for users to tune the models: direct keyboard input, or reading from a ` .config` file.")

    mr.Md("Similar to before, parsing user input data from scratch actually requires significantly more code than setting up the models themselves. With the parameters in hand, we can run")

    ############################
    # Step 1: Construct Pipeline
    ############################

    # Standardise the lag predictors, then fit an Elastic Net regressor
    pipe_elastic_net = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('classifier', ElasticNet())
    ])


    #################################
    # Step 2: Gather user input
    #################################
    alpha_exponents = mr.Range(label="Provide an exponent range for N such that 2^N is our penalty terms", value=[-5,3], min=-12, max=10)
    num_l1_ratios = mr.Slider(value=10, min=1, max=20, label="How many variations of Lasso / Ridge regression should we consider?", step=1)

    # Both ends of the selected exponent range are included (hence the + 1);
    # this also keeps the grid non-empty when the two slider handles coincide.
    lowest_exponent, highest_exponent = alpha_exponents.value[0], alpha_exponents.value[1]
    alphas = [(2**exponent) for exponent in range(lowest_exponent, highest_exponent + 1)]
    # Evenly spaced mixing ratios from 0 (pure Ridge) to 1 (pure Lasso)
    l1_ratios = np.linspace(start=0, stop=1, num=num_l1_ratios.value, endpoint=True)

    params = {"classifier__alpha": alphas,
              "classifier__l1_ratio": l1_ratios}

    #################################
    # Step 3: Grid Search
    #################################

    # NOTE: When parallelization is applied, the parallel tasks do NOT
    # have warnings suppressed
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        mr.Md(f"\n\nRunning {CROSS_FOLDS * len(alphas) * len(l1_ratios)} models .....\n\n" )
        search = GridSearchCV(pipe_elastic_net,
                              params,
                              scoring=PERFORMANCE_METRIC,
                              cv=CROSS_FOLDS)
        search.fit(X_train, Y_train)

    #################################
    # Step 4: Plot Grid Performance
    #################################

    # One row per alpha, one column per L1 ratio.
    # NOTE(review): relies on the grid being enumerated with alpha as the outer
    # loop and l1_ratio as the inner loop -- confirm against cv_results_['params'].
    scores_mean = search.cv_results_['mean_test_score'].reshape(len(alphas),
                                                              len(l1_ratios))

    elastic_net_performance = pd.DataFrame({
        "L1_Ratio" : l1_ratios
    })

    # One performance column per penalty coefficient
    for idx, val in enumerate(alphas):
        alpha_str = "alpha_" + str(val)
        elastic_net_performance[alpha_str] = scores_mean[idx,:]

    fig = px.line(elastic_net_performance, x="L1_Ratio",
                      y=[col for col in elastic_net_performance.columns if 'alpha' in col],
                     title="Elastic Net Performance").update_layout(
        yaxis_title="R^2 value"
    )
    fig.show()


    mr.Md(f"The best performing Elastic Net regression model had an L1 ratio of {search.best_params_['classifier__l1_ratio']}" +
         f"\n(i.e. {100*search.best_params_['classifier__l1_ratio']}% Lasso regression)\n" +
         f"together with penalty coefficient of {search.best_params_['classifier__alpha']}" +
         "\nThis gave a mean R^2 score of " +
         f"{search.best_score_} across the {CROSS_FOLDS} folds of the training data.\n")


    mr.Md("\n\n")
    mr.Md(f'Training set score: {str(search.score(X_train, Y_train))}')
    mr.Md(f'Test set score: {str(search.score(X_test, Y_test))}' )

In [19]:
if model_choice.value == "k-Nearest Neighbors":
    # k-Nearest Neighbors section: read the hyper-parameter ranges from the Mercury
    # widgets, grid-search a scaled KNeighborsRegressor pipeline with cross-validation,
    # plot the mean CV score of every (n_neighbors, p) pair and report the best model.

    mr.Md("## $k$-Nearest Neighbors")
    mr.Md("----------")
    mr.Md("A potential downside to regression based models is that they are highly mechanistic: we make a huge jump in assuming that the relationship between the predictors (i.e. previous day's prices) and the outcome (current price) follows a linear trend. By instead looking at empirically-driven models, we do not make any underlying assumptions about the relationship between our previous prices and current prices.")
    mr.Md('For $k$-Nearest neighbors specifically, we require some sort of notion of "distance" between our points')
    mr.Md("However, if one wishes to run the code using a pre-determined `.config` file, the following values must be set:")
    mr.Md("```\nclassifier__n_neighbors: min_neighbors, max_neighbors, step_size\nclassifier__p: p_val\n```")
    mr.Md("Upon loading the necessary user data, we proceed as before and construct a `sklearn.pipeline.Pipeline` object which we will then pass to a grid search:")

    ############################
    # Step 1: Construct Pipeline
    ############################
    pipe_knn = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('classifier', KNeighborsRegressor())
    ])

    #################################
    # Step 2: Gather user input
    #################################

    neighbors_input = mr.Range(label="Give a range for the number of nearby data points our model should look at:", value=[1,10],
                                   min=1,
                                   max=min(stonks.data_frame.shape[0], 200))

    p_val_input = mr.Slider(value=5, min=1, max=10, label="Whats the largest L^p distance we should examine?", step=0.5)

    # linspace with dtype=int produces duplicates on narrow ranges, hence the set();
    # sorting keeps the grid (and therefore the plot's x-axis) in ascending order,
    # which a bare list(set(...)) does not guarantee.
    num_neighbors = sorted(set(np.linspace(neighbors_input.value[0],
                                           neighbors_input.value[1],
                                           20,
                                           dtype=int,
                                           endpoint=True).tolist()))
    p_values = np.arange(1, p_val_input.value + 0.5, 0.5)

    params = {"classifier__n_neighbors": num_neighbors,
              "classifier__p": p_values}

    #################################
    # Step 3: Grid Search
    #################################

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        print(f"\n\nRunning {CROSS_FOLDS * len(p_values) * len(num_neighbors)} models .....\n\n" )

        search = GridSearchCV(estimator=pipe_knn,
                              param_grid=params,
                              scoring = PERFORMANCE_METRIC,
                              n_jobs = PARALLELIZATION,
                              cv = CROSS_FOLDS)

        search.fit(X_train, Y_train)

    #################################
    # Step 4: Plot Grid Performance
    #################################

    # The grid is enumerated with parameter names in sorted order and the last name
    # varying fastest, so rows are n_neighbors and columns are p.
    scores_mean = search.cv_results_['mean_test_score'].reshape(len(num_neighbors),
                                                              len(p_values))
    knn_performance = pd.DataFrame({
        "num_neighbors" : num_neighbors
    })

    # Track the score columns explicitly rather than filtering column names by substring.
    p_columns = []
    for idx, val in enumerate(p_values):
        p_str = "p_" + str(val)
        knn_performance[p_str] = scores_mean[:, idx]
        p_columns.append(p_str)

    fig = px.line(knn_performance, x="num_neighbors",
                  y=p_columns,
                  title="k-Nearest Neighbors").update_layout(
        yaxis_title="R^2 value"
    )
    fig.show()

    best_k = search.best_params_['classifier__n_neighbors']
    best_p = search.best_params_['classifier__p']

    # The L^p (Minkowski) distance sums |x_i - y_i|^p over the coordinates before
    # taking the p-th root; the earlier wording |x^p - y^p|^(1/p) was incorrect.
    mr.Md(f"The best performing k-Nearest Neighbors model looked at {best_k}" +
          f" neighboring data points\n" +
          f"using the L{best_p} metric (i.e. the distance between points x and y " +
          f" is (sum over i of |x_i - y_i|^{best_p})^(1/{best_p}))" +
          "\nThis gave a mean R^2 score of " +
          f"{search.best_score_} across the {CROSS_FOLDS} folds of the training data.\n")

    mr.Md("\n\n")
    mr.Md(f'Training set score: {str(search.score(X_train, Y_train))}' )
    mr.Md(f'Test set score: {str(search.score(X_test, Y_test))}' )

In [20]:
if model_choice.value == "Random Forest":
    # Random Forest section: read the hyper-parameter ranges from the Mercury widgets,
    # grid-search a scaled RandomForestRegressor pipeline with cross-validation, draw one
    # subplot per min_samples_leaf value (score vs. number of trees, one line per
    # max_features value) and report the best model.

    mr.Md("## Tree-Based Algorithms")
    mr.Md("------")
    mr.Md("Similar to $k$-Nearest neighbors, tree-based algorithms are empirically-driven models which determine an " +
         "outcome based off previous data points in the training set. However, tree-based models wind up being " +
         "significantly more robust as they do not ultimately depend on the metric structure of the predictor space " +
         "(though they do depend on the underlying topology, as they are ultimately dependent on decision-trees).")

    mr.Md("<br/>")
    mr.Md("### Random Forest")
    mr.Md("--------")
    mr.Md("Some description of random forest here")

    ############################
    # Step 1: Construct Pipeline
    ############################
    pipe_random_forest = Pipeline([
            ('scaler', preprocessing.StandardScaler()),
            ('classifier', RandomForestRegressor(random_state=SEED))
        ])

    #################################
    # Step 2: Gather user input
    #################################

    # Clamp the default range to the widget minimum: for lag_length < 4 the integer
    # division would otherwise yield 0, which is below `min` and is not a valid
    # max_features value.
    features_input = mr.Range(label=f"How many random {stonks.granularity}in intervals each tree in our forest can see?",
                              value=[max(1, int(stonks.lag_length/4)), max(1, int(stonks.lag_length/2))],
                              min=1,
                              max=int(stonks.lag_length))

    estimators_input = mr.Range(label=f"How many trees should be in our forest?",
                              value=[10, 20],
                              min=1,
                              max=150)
    leaf_input = mr.Range(label=f"How many samples should be in a tree branch before it splits?",
                              value=[1, 10],
                              min=1,
                              max=min(stonks.data_frame.shape[0], 200))

    # linspace with dtype=int produces duplicates on narrow ranges, hence the set();
    # sorting keeps every grid ascending so the x-axis and subplot order are
    # deterministic (a bare list(set(...)) has no guaranteed order).
    n_features = sorted(set(np.linspace(features_input.value[0],
                                        features_input.value[1],
                                        10,
                                        dtype=int,
                                        endpoint=True).tolist()))
    n_estimators = sorted(set(np.linspace(estimators_input.value[0],
                                          estimators_input.value[1],
                                          10,
                                          dtype=int,
                                          endpoint=True).tolist()))
    n_leaf = sorted(set(np.linspace(leaf_input.value[0],
                                    leaf_input.value[1],
                                    5,
                                    dtype=int,
                                    endpoint=True).tolist()))

    params={'classifier__max_features': n_features,
            'classifier__n_estimators': n_estimators,
            'classifier__min_samples_leaf':n_leaf
            }

    #################################
    # Step 3: Grid Search
    #################################

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        print(f"\n\nRunning {CROSS_FOLDS * len(n_features) * len(n_leaf) * len(n_estimators)} models .....\n\n" )
        search = GridSearchCV(pipe_random_forest,
                              params,
                              scoring = PERFORMANCE_METRIC,
                              cv=CROSS_FOLDS,
                              n_jobs=PARALLELIZATION,
                             )
        search.fit(X_train, Y_train)

    #################################
    # Step 4: Plot Grid Performance
    #################################

    # The grid is enumerated with parameter names in sorted order and the last name
    # varying fastest: max_features, then min_samples_leaf, then n_estimators.
    scores_mean = search.cv_results_['mean_test_score'].reshape(len(n_features),
                                                                len(n_leaf),
                                                                len(n_estimators))

    # Wide table: one row per tree count, one column per (max_features, min_samples_leaf) pair.
    random_forest_performance = pd.DataFrame({
        "Trees" : n_estimators
    })

    titles=[f'min_samples_leaf={leaf}' for leaf in n_leaf]
    subs= make_subplots(rows=len(n_leaf),
                        cols=1,
                        subplot_titles=titles
                       )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for idx_1, val_1 in enumerate(n_leaf):
            # Remember exactly which columns belong to this leaf value; matching on the
            # substring "leaf_1" would also pick up "leaf_10" columns.
            leaf_columns = []
            for idx_2, val_2 in enumerate(n_features):
                feature_str = "features_" + str(val_2)  + "_leaf_" + str(val_1)
                random_forest_performance[feature_str] = scores_mean[idx_2,idx_1,:]
                leaf_columns.append(feature_str)

            # px.line is only used to build styled traces; they are then moved into the
            # matching subplot row (add_trace supersedes the deprecated append_trace).
            fig = px.line(random_forest_performance, x="Trees", y=leaf_columns)
            for trace in fig.data:
                subs.add_trace(trace, row=(idx_1 + 1), col=1)

            subs.update_xaxes(title_text="# Trees", row=(idx_1 + 1), col=1)
            subs.update_yaxes(title_text="R^2 value", row=(idx_1 + 1), col=1)

        subs.update_layout(height=(220 * len(n_leaf)), title_text="Random Forest Performance")
        subs.show()

    mr.Md(f"\nThe best performing Random Forest had {search.best_params_['classifier__n_estimators']} trees;" +
         "\nin each tree, a split point is only considered if it leaves " +
          f"{search.best_params_['classifier__min_samples_leaf']} samples in each of the left & right branches." +
         f"\nEach decision tree only considered {search.best_params_['classifier__max_features']}" +
         f" {stonks.granularity}in intervals when looking for the best split.\nThis gave a mean R^2 score of " +
         f"{search.best_score_} across the {CROSS_FOLDS} folds of the training data.\n")

    mr.Md("\n\n")
    mr.Md(f'Training set score: {str(search.score(X_train, Y_train))}' )
    mr.Md(f'Test set score: {str(search.score(X_test, Y_test))}' )



In [None]:
if model_choice.value == "Gradient-Boosted Trees":
    # Gradient-Boosted Trees section: read the hyper-parameter ranges from the Mercury
    # widgets, grid-search a scaled GradientBoostingRegressor pipeline with
    # cross-validation, draw one subplot per max_features value (score vs. number of
    # trees, one line per learning rate) and report the best model.

    mr.Md("## Tree-Based Algorithms")
    mr.Md("------")
    mr.Md("Similar to $k$-Nearest neighbors, tree-based algorithms are empirically-driven models which determine an " +
         "outcome based off previous data points in the training set. However, tree-based models wind up being " +
         "significantly more robust as they do not ultimately depend on the metric structure of the predictor space " +
         "(though they do depend on the underlying topology, as they are ultimately dependent on decision-trees).")

    mr.Md("<br/>")
    mr.Md("### Gradient Boosted Trees")
    mr.Md("--------")
    mr.Md("Some description of boosted trees here")

    ############################
    # Step 1: Construct Pipeline
    ############################
    pipe_boosted_trees = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('classifier', GradientBoostingRegressor(random_state=SEED))
    ])

    #################################
    # Step 2: Gather user input
    #################################

    # Clamp the default range to the widget minimum: for lag_length < 4 the integer
    # division would otherwise yield 0, which is below `min` and is not a valid
    # max_features value.
    features_input = mr.Range(label=f"How many random {stonks.granularity}in intervals each tree in our forest can see?",
                              value=[max(1, int(stonks.lag_length/4)), max(1, int(stonks.lag_length/2))],
                              min=1,
                              max=int(stonks.lag_length))

    estimators_input = mr.Range(label=f"How many trees should be in our forest?",
                                value=[10, 20],
                                min=1,
                                max=150)
    learning_input = mr.Range(label="Provide an exponent range for N such that 2^N is our learning rate",
                              value=[-8,-1],
                              min=-12,
                              max=4)

    # linspace with dtype=int produces duplicates on narrow ranges, hence the set();
    # sorting keeps every grid ascending so the x-axis and subplot order are
    # deterministic (a bare list(set(...)) has no guaranteed order).
    n_features = sorted(set(np.linspace(features_input.value[0],
                                        features_input.value[1],
                                        8,
                                        dtype=int,
                                        endpoint=True).tolist()))
    n_estimators = sorted(set(np.linspace(estimators_input.value[0],
                                          estimators_input.value[1],
                                          10,
                                          dtype=int,
                                          endpoint=True).tolist()))
    # Learning rates are the powers of two 2^N for every integer N in the chosen range (inclusive).
    n_learning = [(2**exp) for exp in range(learning_input.value[0], learning_input.value[1] + 1)]

    params={'classifier__max_features': n_features,
            'classifier__n_estimators': n_estimators,
            'classifier__learning_rate':n_learning
            }

    #################################
    # Step 3: Grid Search
    #################################

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        print(f"\n\nRunning {CROSS_FOLDS * len(n_features) * len(n_learning) * len(n_estimators)} models .....\n\n" )
        search = GridSearchCV(pipe_boosted_trees,
                              params,
                              scoring = PERFORMANCE_METRIC,
                              cv=CROSS_FOLDS,
                              n_jobs=PARALLELIZATION,
                             )
        search.fit(X_train, Y_train)

    #################################
    # Step 4: Plot Grid Performance
    #################################

    # The grid is enumerated with parameter names in sorted order and the last name
    # varying fastest: learning_rate, then max_features, then n_estimators.
    scores_mean = search.cv_results_['mean_test_score'].reshape(len(n_learning),
                                                                len(n_features),
                                                                len(n_estimators)
                                                               )

    # Wide table: one row per tree count, one column per (learning_rate, max_features) pair.
    boosted_trees_performance = pd.DataFrame({
        "Trees" : n_estimators
    })

    titles=[f'n_features={feature}' for feature in n_features]
    subs= make_subplots(rows=len(n_features),
                        cols=1,
                        subplot_titles=titles
                       )

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for idx_1, val_1 in enumerate(n_features):
            # Remember exactly which columns belong to this max_features value; matching
            # on the substring "features_1" would also pick up "features_10" columns.
            feature_columns = []
            for idx_2, val_2 in enumerate(n_learning):
                learning_rate_str = "learning_rate_" + str(val_2)  + "_features_" + str(val_1)
                boosted_trees_performance[learning_rate_str] = scores_mean[idx_2,idx_1,:]
                feature_columns.append(learning_rate_str)

            # px.line is only used to build styled traces; they are then moved into the
            # matching subplot row (add_trace supersedes the deprecated append_trace).
            fig = px.line(boosted_trees_performance, x="Trees", y=feature_columns)
            for trace in fig.data:
                subs.add_trace(trace, row=(idx_1 + 1), col=1)

            subs.update_xaxes(title_text="# Trees", row=(idx_1 + 1), col=1)
            subs.update_yaxes(title_text="R^2 value", row=(idx_1 + 1), col=1)

        # One subplot row per max_features value, so the height scales with
        # len(n_features) (it was previously scaled by the number of tree counts).
        subs.update_layout(height=(220 * len(n_features)), title_text="Boosted Trees Performance")
        subs.show()

    mr.Md(f"\nThe best performing Boosted Trees model had {search.best_params_['classifier__n_estimators']} trees;" +
         f"\neach tree influenced the construction of the next tree with a learning rate of {search.best_params_['classifier__learning_rate']}." +
         f"\nEach decision tree only considered {search.best_params_['classifier__max_features']}" +
         f" {stonks.granularity}in intervals when looking for the best split.\nThis gave a mean R^2 score of " +
         f"{search.best_score_} across the {CROSS_FOLDS} folds of the training data.\n")

    mr.Md("\n\n")
    mr.Md('Training set score: ' + str(search.score(X_train, Y_train)))
    mr.Md('Test set score: ' + str(search.score(X_test, Y_test)))


## Tree-Based Algorithms

------

Similar to $k$-Nearest neighbors, tree-based algorithms are empirically-driven models which determine an outcome based off previous data points in the training set. However, tree-based models wind up being significantly more robust as they do not ultimately depend on the metric structure of the predictor space (though they do depend on the underlying topology, as they are ultimately dependent on decision-trees).

<br/>

### Gradient Boosted Trees

--------

Some description of boosted trees here

mercury.Range

mercury.Range

mercury.Range



Running 4000 models .....


