In [1]:
%%javascript
// Replace the output area's `_should_scroll` check with one that always answers
// "no", regardless of how many lines the output has. Presumably this keeps long
// cell outputs fully expanded instead of collapsing them into a scroll box
// (depends on the front end exposing the global `IPython` object — TODO confirm).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

## Import Libraries

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import sys
import warnings
import plotly.express as px
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

from math import ceil, floor
from plotly.subplots import make_subplots
from datetime import date, datetime, timedelta
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.exceptions import ConvergenceWarning
from plotnine import *
from sklearn.metrics import make_scorer, mean_squared_error
# from plotly.tools import mpl_to_plotly as ggplotly


# Notebook-wide tuning knobs. Their consumers are not in this part of the
# notebook, so the descriptions below are assumptions to confirm against usage.

# Seed value; presumably passed as `random_state` so results are reproducible.
SEED = 3234
# Presumably the number of cross-validation folds used during model selection.
CROSS_FOLDS = 10
# Presumably an iteration budget for the random-forest hyper-parameter search.
OPTIMIZATION_RANDOM_FOREST_ITER = 15000
# Presumably the number of trees used while optimizing the random forest.
OPTIMIZATION_RANDOM_FOREST_TREES = 300
# Presumably a scikit-learn scoring name ("r2" = coefficient of determination).
PERFORMANCE_METRIC = "r2"
# Presumably forwarded as `n_jobs`; in scikit-learn -1 means "use all cores".
PARALLELIZATION = -1


## Add Data Wrappers

In [3]:
class StockDataWrapper:
    """
    A wrapper class to bundle stock data (stored in a pandas DataFrame) together with
    information about how the data was collected (such as the time frame between two
    data points), plus methods that add technical-indicator columns to that data.

    A StockDataWrapper at the minimum requires a pandas DataFrame (the primary object we
    are interested in wrapping) as well as the respective stock symbol passed as a string
    so that we are able to easily refer to the specific stock in later formatting.

        IMPORTANT: granularity and num_days are NOT inferred from the data; whoever
                loads the stock data is expected to set them on the wrapper.

    The indicator methods (simple moving average/standard deviation, Bollinger bands,
    exponential moving average, MACD) require lag variables to exist first. Calling
    add_lag_variables() creates the lag columns and records lag_length.


    Attributes:
        data_frame:    A pandas DataFrame object containing stocks data such as Open, Close,
            Volume, High, Low, etc. Every method here reads the 'Close' column.

        stock_symbol:   A string representing the stock's symbol (i.e. AAPL for Apple)

        granularity:    The time interval between two data points (e.g. '1m', '5m', '60m').
            0 means "not set".

        num_days:    The number of days of stock data that have been pulled.
            0 means "not set".

        lag_length:    The number of previous data points (Close_L1 ... Close_L<lag_length>)
            that are available to use for computations such as moving averages.
            0 means "no lag variables".
    """

    def __init__(self, data_frame, stock_symbol):
        """
        Initializes the instance based on a pandas DataFrame.

        Args:
            data_frame: a pandas DataFrame object
            stock_symbol: a string representing the stock's symbol (i.e. AAPL for Apple)
        Raises:
            ValueError: if one of the arguments is not the specified data type
        """

        #################
        #  ERROR HANDLING
        #################
        if not isinstance(data_frame, pd.DataFrame):
            raise ValueError("data_frame must be a pandas DataFrame object")
        if not isinstance(stock_symbol, str):
            raise ValueError("stock_symbol must be a string")

        self.data_frame = data_frame
        self.stock_symbol = stock_symbol
        # 0 is the "NOT SET" sentinel that __str__ and the indicator methods test for.
        self.granularity = 0
        self.num_days = 0
        self.lag_length = 0

    def __str__(self):
        """
        Indicates how to represent a StockDataWrapper object when
        passed as a string to print

        Returns:
            String with one header line per metadata field ("NOT SET" when the field
            still holds its 0 sentinel) followed by the __str__ of the pandas DataFrame
        """
        parts = [f'Stock Name: {self.stock_symbol}\n']

        metadata = (
            ("Granularity", self.granularity),
            ("Number of days of stock data", self.num_days),
            ("Number of lag variables", self.lag_length),
        )
        for label, value in metadata:
            shown = "NOT SET" if value == 0 else value
            parts.append(f"{label}: {shown}\n")

        # Blank lines, then the underlying Pandas DataFrame object's __str__
        parts.append("\n\n")
        parts.append(str(self.data_frame))

        return "".join(parts)

    @staticmethod
    def _lag_column_names(length):
        """Returns the column names Close_L1 ... Close_L<length>."""
        return ['Close_L' + str(i) for i in range(1, length + 1)]

    def _require_lag_window(self, length, indicator):
        """
        Shared argument check for every indicator that averages over lag variables.

        Args:
            length: the requested window size
            indicator: human-readable name of the indicator, used in the error message
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if no lag variables have been added yet (lag_length == 0)
        """
        if not isinstance(length, int):
            raise ValueError("Parameter length must be integer")
        if self.lag_length == 0:
            raise NameError(f"Lag variables must be set prior to adding {indicator}")
        # A window of zero (or fewer) points has no average; reject it here rather than
        # letting it surface later as a division by zero or an empty slice.
        if length < 1:
            raise ValueError("Parameter length must be at least 1")
        if length > self.lag_length:
            raise ValueError("Cannot take the average of more lag varaibles than are availible" +
                  f" (currently {self.lag_length})\n")

    def add_lag_variables(self, num_lags):
        """
        Adds precisely num_lags lag variables to the underlying data frame and records
        them in lag_length. The lag variables are simply additional columns where the
        'Close' entries are shifted down by 1 ... num_lags rows (so row t holds the close
        of row t - i).

            WARNING: A large number of lag variables can lead to fragmentation in the
                DataFrame, since the columns are inserted one at a time.

        The (column) names of the lag variables that are added are of the form:
                            Close_L#
        where # is how far back that particular lag variable is looking.

        After the columns are added the WHOLE frame is back-filled, so the leading
        rows of each lag column (and any other NaN in the frame) take the next
        available value in their column.

        Args:
            num_lags: the number of lag variables we wish to add
        Raises:
            ValueError: if num_lags is not an integer, is smaller than 1, or is not
                smaller than the number of observations
        """

        #################
        #  ERROR HANDLING
        #################
        if not isinstance(num_lags, int):
            raise ValueError("Parameter length must be integer")
        if num_lags < 1 or num_lags >= self.data_frame.shape[0]:
            raise ValueError("The number of lag variables must be between" +
                            " 1 and the total number of observations")

        for i in range(1, num_lags + 1):
            self.data_frame["Close_L" + str(i)] = self.data_frame['Close'].shift(i)

        # Backfill the entries to remove any NaN
        self.data_frame = self.data_frame.bfill(axis=0)

        # Keep the metadata in step with the columns that now exist, so the indicator
        # methods work without the caller having to set lag_length by hand. max() keeps
        # a larger, earlier set of lag columns usable.
        self.lag_length = max(self.lag_length, num_lags)

    def delete_lag_variables(self):
        """
        Removes any possible lag variables from the DataFrame and resets lag_length
        to 0. This method implicitly assumes that all lag variables follow the naming
        convention

                        Close_L#

        where # is how far back that particular lag variable is looking. Any column
        whose name contains 'Close_L' is removed.
        """
        existing_lag_names = list(self.data_frame.filter(regex='Close_L'))

        if existing_lag_names:
            self.data_frame = self.data_frame[
                self.data_frame.columns.drop(existing_lag_names)
            ]

        # With no lag columns left, lag_length must read 0 so that other methods
        # know not to compute data from lag variables.
        self.lag_length = 0

    def add_simple_moving_average(self, length):
        """
        Computes the simple moving average of the stock's closing price (i.e.
        the mean of the first 'length number' of lag variables) and adds it to a
        new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            SMA_(length)

        where length is the parameter passed.

        Args:
            length: the number of data points to take the simple moving average
                over (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_lag_window(length, "simple moving average")

        lag_columns = self._lag_column_names(length)

        # Sum / length (rather than .mean()) so a missing value still counts in the
        # denominator, exactly as the sum over all requested lag columns would.
        lag_total = self.data_frame[lag_columns].sum(axis=1, skipna=True)
        self.data_frame["SMA_" + str(length)] = lag_total / float(length)

    def add_simple_moving_standard_deviation(self, length):
        """
        Computes the simple moving standard deviation of the stock's closing
        price (i.e. the standard deviation of the first 'length number' of lag variables)
        and adds it to a new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            SMSD_(length)

        where length is the parameter passed. pandas' default (sample) standard
        deviation is used, so length == 1 produces NaN.

        Args:
            length: the number of data points to take the simple moving standard deviation
                over (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_lag_window(length, "simple moving standard deviation")

        lag_columns = self._lag_column_names(length)
        self.data_frame["SMSD_" + str(length)] = (
            self.data_frame[lag_columns].std(axis=1, skipna=True)
        )

    def _ensure_band_inputs(self, length):
        """
        Makes sure the SMA_(length) and SMSD_(length) columns exist (computing
        whichever is missing) and returns their names as (sma_name, smsd_name).
        """
        smsd_name = "SMSD_" + str(length)
        sma_name = "SMA_" + str(length)

        if smsd_name not in self.data_frame.columns:
            self.add_simple_moving_standard_deviation(length)
        if sma_name not in self.data_frame.columns:
            self.add_simple_moving_average(length)

        return sma_name, smsd_name

    def add_upper_bollinger(self, length):
        """
        Computes the upper Bollinger band of the stock's closing price, which here is
        the simple moving average PLUS one simple moving standard deviation, and adds it
        to a new column in our pandas DataFrame. The column that is added follows the
        naming convention:

                            upper_boll_(length)

        where length is the parameter passed. The SMA_(length) and SMSD_(length)
        columns are added as a side effect if they are not present yet.

        Args:
            length: the number of data points to take the bollinger band over
                (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_lag_window(length, "upper Bollinger band")

        sma_name, smsd_name = self._ensure_band_inputs(length)
        self.data_frame["upper_boll_" + str(length)] = (
            self.data_frame[sma_name] + self.data_frame[smsd_name]
        )

    def add_lower_bollinger(self, length):
        """
        Computes the lower Bollinger band of the stock's closing price, which here is
        the simple moving average MINUS one simple moving standard deviation, and adds it
        to a new column in our pandas DataFrame. The column that is added follows the
        naming convention:

                            lower_boll_(length)

        where length is the parameter passed. The SMA_(length) and SMSD_(length)
        columns are added as a side effect if they are not present yet.

        Args:
            length: the number of data points to take the bollinger band over
                (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_lag_window(length, "lower Bollinger band")

        sma_name, smsd_name = self._ensure_band_inputs(length)
        self.data_frame["lower_boll_" + str(length)] = (
            self.data_frame[sma_name] - self.data_frame[smsd_name]
        )

    def _exponential_moving_average_helper_(self, series, smoothing_factor):
        """
        Helper function for add_exponential_moving_average() which evaluates the
        recursion

            EMA[0]   = data[0]
            EMA[n+1] = β * data[n+1] + (1 - β) * EMA[n]

        over the whole series and returns the final EMA value. β is the smoothing
        factor: it dictates how heavily recent data points are weighted, and how
        quickly previous data points lose weight.

        Args:
            series: a pandas.core.series.Series object. Typically passed as
                dataframe["column"].iloc[a:b]. Elements are read positionally with
                .iloc because integer indexing of a Series is deprecated

            smoothing_factor: a float which dictates how heavily recent
                data points are weighted
        Returns:
            The EMA value at the last element of series
        Raises:
            ValueError: if series is not a pandas Series, is empty, or if
                smoothing_factor is not a float
        """

        #################
        #  ERROR HANDLING
        #################
        if not isinstance(series, pd.Series):
            raise ValueError("Parameter series must be pandas.Series data type")
        if not isinstance(smoothing_factor, float):
            raise ValueError("Parameter smoothing factor must be a floating point decimal")

        n_points = series.shape[0]
        if n_points == 0:
            raise ValueError("Parameter series must contain at least one data point")

        # Only the previous EMA value is needed at each step, so carry a single
        # running value instead of materialising the whole sequence.
        running = series.iloc[0]
        for position in range(1, n_points):
            running = smoothing_factor * series.iloc[position] + (1 - smoothing_factor) * running

        return running

    def add_exponential_moving_average(self, length):
        """
        Computes the exponential moving average of the stock's closing price and adds
        it to a new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            EMA_(length)

        where length is the parameter passed.

        The value in row i is the EMA of the `length` closes that PRECEDE row i (the
        close of row i itself is not used), with smoothing factor 2 / (length + 1).
        While fewer than `length` earlier rows exist, all earlier rows are used with
        smoothing factor 2 / (i + 1). Row 0 has no earlier data and simply holds its
        own close.

        Args:
            length: the number of data points to take the exponential moving average
                over (equivalently, the number of lag variables we are considering)
        Raises:
            ValueError: if length is not an integer, is smaller than 1, or is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """
        self._require_lag_window(length, "exponential moving average")

        close = self.data_frame["Close"]
        n_rows = close.shape[0]

        # Built incrementally so frames with zero or one row work too.
        ema_values = []
        if n_rows > 0:
            ema_values.append(close.iloc[0])

        for row in range(1, n_rows):
            # Number of earlier closes actually available, capped at `length`.
            span = min(row, length)
            ema_values.append(
                self._exponential_moving_average_helper_(
                    close.iloc[row - span:row],
                    float(2 / (span + 1))
                )
            )

        # Add new column to data frame
        self.data_frame["EMA_" + str(length)] = ema_values

    def add_MACD(self, max_lag, min_lag):
        """
        Computes the moving average convergence-divergence of the stock's closing
        price (i.e. the exponential moving average taken over min_lag MINUS
        the exponential moving average taken over max_lag)
        and adds it to a new column in our pandas DataFrame. The column that is added follows
        the naming convention:

                            MACD_(min_lag)_(max_lag)

        The EMA_(min_lag) and EMA_(max_lag) columns are added as a side effect if they
        are not present yet. All arguments are validated before any column is added, so
        a failed call leaves the DataFrame untouched.

        Args:
            max_lag: window of the slow exponential moving average (NOTE: this is
                the FIRST positional parameter)
            min_lag: window of the fast exponential moving average; must be
                strictly smaller than max_lag
        Raises:
            ValueError: if either argument is not an integer, if min_lag is smaller
                than 1 or not strictly smaller than max_lag, or if max_lag is larger
                than the number of lag variables available
            NameError: if lag_length has not been set yet (i.e. no lag variables have
                been added)
        """

        #################
        #  ERROR HANDLING
        #################
        if not isinstance(max_lag, int) or not isinstance(min_lag, int):
            raise ValueError("Parameters max_lag and min_lag must be integer")
        if self.lag_length == 0:
            raise NameError("Lag variables must be set prior to adding MACD")
        if min_lag >= max_lag:
            raise ValueError("min_lag must be strictly smaller than max_lag")
        if min_lag < 1:
            raise ValueError("min_lag must be at least 1")
        # Checked here (not only inside the EMA call) so we never add EMA_(min_lag)
        # and then fail on EMA_(max_lag), leaving a half-updated frame.
        if max_lag > self.lag_length:
            raise ValueError("Cannot take the average of more lag varaibles than are availible" +
                      f" (currently {self.lag_length})\n")

        fast_name = "EMA_" + str(min_lag)
        slow_name = "EMA_" + str(max_lag)

        # Ensure that both exponential moving averages are available
        # in the DataFrame
        if fast_name not in self.data_frame.columns:
            self.add_exponential_moving_average(min_lag)
        if slow_name not in self.data_frame.columns:
            self.add_exponential_moving_average(max_lag)

        # Add new column to data frame
        column_label = "MACD_" + str(min_lag) + "_" + str(max_lag)
        self.data_frame[column_label] = self.data_frame[fast_name] - self.data_frame[slow_name]

## User Input Functions

In [4]:
def create_stock_data_from_input():
    """
    Helper function for creating a StockDataWrapper class from user input. Primarily
    cleans the user input and ensures that the user input is within the range of
    intended values.

    The user is asked, in order, for the number of days of history (1-7), the
    granularity (one of 1m, 2m, 5m, 15m, 30m, 60m) and the stock symbol. Typing
    anything containing 'exit' at the first two prompts terminates the program
    through sys.exit.

    This function only establishes:

        - the underlying pandas.DataFrame object (which is the primary data that
                StockDataWrapper is wrapping), downloaded with yfinance
        - the granularity parameter
        - the num_days parameter

    This function does NOT set:
        - any lag variable data.


    Returns:
        A StockDataWrapper object that, when a correct stock symbol is typed, will
        contain a pandas DataFrame object as well as the number of days and granularity
        of the stock data. The stock symbol itself is not validated here; an unknown
        symbol is left to yfinance and may yield an empty DataFrame.
    """
    # ---- number of days -------------------------------------------------
    while True:
        days_reply = input("How many days of intraday stock market data should " +
                           " we use to build our model? Enter a value between " +
                           " 1 and 7:\n(Type 'Exit' to quit)\n")

        # isdecimal() (not isnumeric()) so that characters such as '²' or '½',
        # which int() cannot parse, are treated as invalid input instead of
        # crashing the prompt.
        if days_reply.isdecimal():
            num_days_to_build = int(days_reply)
            if 0 < num_days_to_build < 8:
                break
            print("The number of days must be between 1 and 7 "+
                  "— please retry. \n (Type 'Exit' to quit)\n")
        elif "exit" in days_reply.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")

    # Apply num_days information to create date variables which span
    # precisely (num_days) calendar days back from the end of the window.
    today = date.today()
    most_recent_stock_day = today

    # weekday() == 6 is Sunday: end the window on Saturday instead.
    # NOTE(review): yfinance documents `end` as exclusive, so the last day fetched
    # is the day BEFORE most_recent_stock_day. That makes a 1-day request on a
    # Monday span only Sunday (no trading data) — confirm whether the window
    # should be expressed in trading days instead.
    if today.weekday() == 6:
        most_recent_stock_day = today - timedelta(1)

    num_days_prior = most_recent_stock_day - timedelta(num_days_to_build)

    # ---- granularity ----------------------------------------------------
    granularity_options = ['1m', '2m', '5m', '15m', '30m', '60m']
    while True:
        granularity_input = input("How often should our model look at" +
                                  " stock prices? Choose from 1m, 2m, 5m," +
                                  " 15m, 30m or 60m.\n(Type 'Exit' to quit)\n"
                                 ).lower()

        if granularity_input in granularity_options:
            break
        if "exit" in granularity_input:
            sys.exit("Exiting program")
        print("Input was not among the options 1m, 2m," +
              " 5m, 15m, 30m, or 60m — please retry."+
              "\n(Type 'Exit' to quit)\n")

    # ---- stock symbol + download ---------------------------------------
    # utilize yfinance's error handling here — if the user inputs the
    # wrong stock symbol, yfinance reports it and returns no rows.
    stock_symbol = input("Please input the stock symbol you would like" +
                             " to examine: (e.g. AAPL)\n").upper()

    price_frame = pd.DataFrame(yf.download(stock_symbol,
                                           start=num_days_prior,
                                           end=most_recent_stock_day,
                                           interval=granularity_input)
                              )

    # Recent yfinance releases return (price field, ticker) MultiIndex columns even
    # for a single ticker. StockDataWrapper addresses columns by plain names such
    # as 'Close', so keep only the price-field level.
    if isinstance(price_frame.columns, pd.MultiIndex):
        price_frame.columns = price_frame.columns.get_level_values(0)

    # add user input information to StockDataWrapper object
    stock_data = StockDataWrapper(price_frame, stock_symbol)
    stock_data.num_days = num_days_to_build
    stock_data.granularity = granularity_input

    return stock_data



def add_lags_from_input(stock_data, threshold):
    """
    Helper function which finishes loading the lag data information into an already
    existing StockDataWrapper object. Any existing lag variables are removed first;
    the user is then prompted for the number of lag variables, which are added to the
    wrapper's DataFrame and recorded in stock_data.lag_length. No value is returned.

    The prompt repeats until the user enters an integer between 1 and threshold that
    is also smaller than the number of observations. Typing anything containing
    'exit' terminates the program through sys.exit.

    Args:
        stock_data: the StockDataWrapper to add lag variables to
        threshold: the largest number of lag variables the user may request
            (the prompt describes it as 30% of the number of data points)

    Raises:
        ValueError: if stock_data is not a StockDataWrapper, if threshold is not a
                positive integer, or if the DataFrame has fewer than two entries
        TypeError: if there are no entries in the DataFrame
    """
    #################
    #  ERROR HANDLING
    #################
    if not isinstance(stock_data, StockDataWrapper):
        raise ValueError("stock_data variable must be a StockDataWrapper class")
    if not isinstance(threshold, int):
        raise ValueError("threshold must be a positive integer")
    n_observations = stock_data.data_frame.shape[0]
    if n_observations == 0:
        raise TypeError("Data has not been correctly loaded into the StockDataWrapper object")
    # Without these two checks no answer could ever satisfy the prompt below.
    if threshold < 1:
        raise ValueError("threshold must be a positive integer")
    if n_observations < 2:
        raise ValueError("At least two observations are required to add lag variables")

    # add_lag_variables() requires fewer lags than observations, so honour that
    # limit as well as the caller's threshold.
    max_lags = min(threshold, n_observations - 1)

    # Clean up previous lag variables so that preexisting data
    # does not affect future computations
    stock_data.delete_lag_variables()

    while True:
        reply = input(f"How many previous {stock_data.granularity}in intervals should ALL of" +
                      " our machine learning models look at?\n(Type 'Exit' to quit)\n")

        # isdecimal() (not isnumeric()) so that characters such as '²', which
        # int() cannot parse, are rejected instead of crashing the prompt.
        if reply.isdecimal():
            lag_length = int(reply)

            if lag_length < 1:
                print("\nThe number of previous data points considered must be" +
                      " at least 1 -- please retry. \n")
            elif lag_length > max_lags:
                print("\nThe number of previous data points considered should" +
                      f" not exceed 30% \nof the total number of data points (in this case, {max_lags}) -- please" +
                       " retry. \n")
            else:
                # Record lag_length only once the columns really exist.
                stock_data.add_lag_variables(lag_length)
                stock_data.lag_length = lag_length
                return
        elif "exit" in reply.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")
            
            
            
            
            
def _prompt_for_int(prompt, complaint_for):
    """
    Prompts with `prompt` until the user types a non-negative decimal integer that
    `complaint_for` accepts, then returns it as an int.

    Args:
        prompt: the text shown to the user on every attempt
        complaint_for: callable taking the parsed integer and returning None when the
            value is acceptable, or the message to print before asking again
    """
    while True:
        reply = input(prompt)

        # isdecimal() (not isnumeric()) so that characters such as '²', which
        # int() cannot parse, are rejected instead of crashing the prompt.
        if reply.isdecimal():
            value = int(reply)
            complaint = complaint_for(value)
            if complaint is None:
                return value
            print(complaint)
        elif "exit" in reply.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")


def get_n_estimators_from_input():
    """
    Helper function which obtains the candidate numbers of trees for a grid search
    from user input data. Specifically, the function prompts the user for
        (1) the minimum number of trees (at least 1)
        (2) the maximum number of trees (not below the minimum)
        (3) how many forest sizes to examine between the two
    and spreads that many integers evenly over [minimum, maximum] with numpy.linspace
    (both end points included, fractional values truncated).

    Prompt (3) is skipped when the interval holds at most two integers; those
    integers are then returned directly. Typing anything containing 'exit' at any
    prompt terminates the program through sys.exit.


    Returns:
        A list (when prompt (3) is skipped) or integer numpy array of distinct values
        representing the possible number of trees in a random forest
    """
    min_estimators = _prompt_for_int(
        "What's the minimum number of trees that should be in our forest?\n(Type 'Exit' to quit)\n",
        lambda value: None if value >= 1 else
        "The minimum number of trees must be at least 1 — please retry\n",
    )

    max_estimators = _prompt_for_int(
        "What's the maximum number of trees that should be in our forest?\n(Type 'Exit' to quit)\n",
        lambda value: None if value >= min_estimators else
        ("The maximum number of trees must be more than the minimum number" +
         f" of trees (currently {min_estimators}) — please retry.\n"),
    )

    # no need to ask for number of steps when at most 2 integers are available;
    # when both bounds coincide return the value once so the grid search does not
    # evaluate the same forest size twice.
    if max_estimators - min_estimators < 2:
        if max_estimators == min_estimators:
            return [min_estimators]
        return [min_estimators, max_estimators]

    def complaint_for_steps(value):
        if value < 1:
            return "The number of values examined must be at least 1 — please retry\n"
        # More values than there are integers in the interval cannot be distinct.
        if value > max_estimators - min_estimators + 1:
            return (f"Cannot subdivide the interval [{min_estimators}, {max_estimators}]" +
                    " into that many integers — please retry\n")
        return None

    num_steps = _prompt_for_int(
        f"How many forest sizes between {min_estimators} trees and {max_estimators} trees should we look at?" +
        " \n(Type 'Exit' to quit)\n",
        complaint_for_steps,
    )

    return np.linspace(min_estimators, max_estimators, num_steps, dtype=int, endpoint=True)
            
            

def get_n_features_from_input(predictors_list, granularity):
    """
    Helper function which obtains the range of values for the number of features in a grid search from
    user input. Specifically, the function prompts the user for
        (1) the minimum value
        (2) the maximum value
        (3) how many values between the two should be examined
    and passes the answers to numpy.linspace (integer dtype, both endpoints included). The resulting values
    dictate the number of previous lag intervals that should be randomly chosen and applied to each tree
    in the forest. Typing 'Exit' at any prompt terminates the program through sys.exit.

    Args:
        predictors_list: pandas DataFrame holding the predictor columns; only its column count is used
        granularity:     sampling interval string, one of 1m, 2m, 5m, 15m, 30m, 60m (used in the prompts)

    Returns:
        A list holding the distinct bound(s) when the two bounds are fewer than 2 apart; otherwise an
        integer numpy array of evenly spaced values running from the lower bound to the upper bound

    Raises:
        ValueError: if predictors_list is not a pandas DataFrame or has no columns, or if granularity
                    is not one of the supported intervals
    """
    granularity_options = ['1m', '2m', '5m', '15m', '30m', '60m']
    #################
    #  ERROR HANDLING
    #################
    if not isinstance(predictors_list, pd.DataFrame):
        raise ValueError("predictors_list must be a pandas DataFrame object")
    if granularity not in granularity_options:
        raise ValueError("granularity must be one of: 1m, 2m, 5m, 15m, 30m, 60m")

    n_features = predictors_list.shape[1]
    if n_features < 1:
        # without this guard no answer could ever satisfy the lower-bound prompt
        raise ValueError("predictors_list must contain at least one column")

    # Get lower bound
    prompting_user_input = True
    while prompting_user_input:
        min_features = input(f"Give a lower bound for the number of random {granularity}in intervals each tree in " +
                             f"our forest can see (between 1 and {n_features})\n(Type 'Exit' to quit)\n")

        # isdecimal (not isnumeric): characters such as '²' are "numeric" but int() rejects them
        if min_features.isdecimal():
            min_features = int(min_features)

            if min_features < 1:
                print("The minimum number of features must be at least 1 — please retry\n")
            elif min_features > n_features:
                print("The minimum number of features should not exceed the total number of " +
                      f"{granularity}in intervals availible to the model \n" +
                      f" (currently {n_features}) — please retry\n")
            else:
                prompting_user_input = False

        elif "exit" in min_features.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")

    # Get upper bound
    prompting_user_input = True
    while prompting_user_input:
        max_features = input(f"Give an upper bound for the number of random {granularity}in intervals each tree in " +
                             f"our forest can see (between 1 and {n_features})\n(Type 'Exit' to quit)\n")

        if max_features.isdecimal():
            max_features = int(max_features)

            if max_features < min_features:
                print("The upper bound should be greater than the lower bound" +
                      f" (currently {min_features}) — please retry.\n")
            elif max_features > n_features:
                # the prompt promises a value between 1 and n_features, so enforce it
                print("The upper bound should not exceed the total number of " +
                      f"{granularity}in intervals available to the model" +
                      f" (currently {n_features}) — please retry\n")
            else:
                prompting_user_input = False

        elif "exit" in max_features.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")

    # no need to ask for number of steps when at most 2 integers are available;
    # the set drops the duplicate when both bounds are equal
    if max_features - min_features < 2:
        return sorted({min_features, max_features})

    # Get number of values to examine
    while True:
        num_steps = input(f"How many {granularity}in intervals between {min_features} and {max_features}" +
                          " should we consider?" +
                          " \n(Type 'Exit' to quit)\n")

        if num_steps.isdecimal():
            num_steps = int(num_steps)

            if num_steps < 1:
                print("The number of values examined must be at least 1 — please retry\n")
            elif int((max_features + 1 - min_features) / num_steps) < 1:
                # more values requested than there are integers in the interval
                print(f"Cannot subdivide the interval [{min_features}, {max_features}] into that many integers — please retry\n")
            else:
                return np.linspace(min_features, max_features, num_steps, dtype=int, endpoint=True)

        elif "exit" in num_steps.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")
            
            
def get_n_leaf_from_input():
    """
    Helper function which obtains the range of values for the branch sample size in a grid search from
    user input. Specifically, the function prompts the user for
        (1) the minimum value
        (2) the maximum value
        (3) how many values between the two should be examined
    and passes the answers to numpy.linspace (integer dtype, both endpoints included). The resulting values
    dictate the number of observations that should be present in both the left branch and right branch of
    a node in a decision tree before it can split. Typing 'Exit' at any prompt terminates the program
    through sys.exit.

    Returns:
        A list holding the distinct bound(s) when the two bounds are fewer than 2 apart; otherwise an
        integer numpy array of evenly spaced values running from the lower bound to the upper bound
    """
    # Get minimum value of branch sizes
    prompting_user_input = True
    while prompting_user_input:
        min_leaf = input("Give a lower bound for the number of samples in a tree branch before it should split" +
                         "\n(Type 'Exit' to quit)\n")

        # isdecimal (not isnumeric): characters such as '²' are "numeric" but int() rejects them
        if min_leaf.isdecimal():
            min_leaf = int(min_leaf)

            if min_leaf < 1:
                print("The minimum number of samples must be at least 1 — please retry\n")
            else:
                prompting_user_input = False

        elif "exit" in min_leaf.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")

    # Get maximum value of branch sizes
    prompting_user_input = True
    while prompting_user_input:
        max_leaf = input("Give an upper bound for the number of samples in a tree branch before it should split" +
                         "\n(Type 'Exit' to quit)\n")

        if max_leaf.isdecimal():
            max_leaf = int(max_leaf)

            if max_leaf < min_leaf:
                print("The upper bound should be greater than the lower bound" +
                      f" (currently {min_leaf}) — please retry.\n")
            else:
                prompting_user_input = False

        elif "exit" in max_leaf.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")

    # no need to ask for number of steps when at most 2 integers are available;
    # the set drops the duplicate when both bounds are equal
    if max_leaf - min_leaf < 2:
        return sorted({min_leaf, max_leaf})

    # Get number of values to examine
    while True:
        num_steps = input(f"How many values should we consider between {min_leaf} and {max_leaf}?" +
                          " \n(Type 'Exit' to quit)\n")

        if num_steps.isdecimal():
            num_steps = int(num_steps)

            if num_steps < 1:
                print("The number of values examined must be at least 1 — please retry\n")
            elif int((max_leaf + 1 - min_leaf) / num_steps) < 1:
                # more values requested than there are integers in the interval
                print(f"Cannot subdivide the interval [{min_leaf}, {max_leaf}] into that many integers — please retry\n")
            else:
                return np.linspace(min_leaf, max_leaf, num_steps, dtype=int, endpoint=True)

        elif "exit" in num_steps.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry.\n")
            
            
            
            
def user_input_random_forest(predictors_list, granularity):
    """
    Build the random forest pipeline and interactively collect the hyper-parameter grid to search.

    The user is prompted (through get_n_estimators_from_input, get_n_features_from_input and
    get_n_leaf_from_input) until the requested grid is small enough: no forest may exceed
    OPTIMIZATION_RANDOM_FOREST_TREES trees, and the number of grid combinations multiplied by
    CROSS_FOLDS may not exceed OPTIMIZATION_RANDOM_FOREST_ITER.

    Args:
        predictors_list: pandas DataFrame of predictor columns, forwarded to get_n_features_from_input
        granularity:     sampling interval string, forwarded to get_n_features_from_input

    Returns:
        A tuple (pipeline, params, [n_features, n_leaf, n_estimators]) where pipeline is a
        StandardScaler followed by a RandomForestRegressor seeded with SEED, params is the parameter
        grid keyed by the pipeline's 'classifier__*' names, and the list holds the three value ranges
    """
    ############################
    # Step 1: Construct Pipeline
    ############################
    pipe_random_forest = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('classifier', RandomForestRegressor(random_state=SEED))
    ])

    #################################
    # Step 2: Gather user input
    #################################
    complexity_check = True
    while complexity_check:
        n_estimators = get_n_estimators_from_input()
        if max(n_estimators) > OPTIMIZATION_RANDOM_FOREST_TREES:
            print(f"Forests with over {OPTIMIZATION_RANDOM_FOREST_TREES} decision trees will take too long to prune.\n")
            continue
        print("---------------------------------------\n")
        # use the argument, not a notebook-level global, so the function works for any caller
        n_features = get_n_features_from_input(predictors_list, granularity)
        print("---------------------------------------\n")
        n_leaf = get_n_leaf_from_input()
        computations_necessary = CROSS_FOLDS*len(n_features)*len(n_estimators)*len(n_leaf)

        if computations_necessary > OPTIMIZATION_RANDOM_FOREST_ITER:
            # show the breakdown of the budget overrun, then start the prompts over
            print(f"\t\t {CROSS_FOLDS} Cross-Folds")
            print(f"x\t{len(n_estimators)} forest sizes")
            print(f"x {len(n_features)} tree complexity choices")
            print(f"x {len(n_leaf)} branch size choices")
            print("---------------------------------------------")
            print(f"{computations_necessary} computations necessary ({OPTIMIZATION_RANDOM_FOREST_ITER} max)")
            print("\n\n Please try a smaller batch.")
        else:
            complexity_check = False

    params = {
        'classifier__max_features': n_features,
        'classifier__n_estimators': n_estimators,
        'classifier__min_samples_leaf': n_leaf
    }

    return (pipe_random_forest, params, [n_features, n_leaf, n_estimators])



def user_input_elastic_net():
    """
    Build the elastic net pipeline and interactively collect the hyper-parameter grid to search.

    The penalty coefficients (alpha) are the powers of a user-chosen integer base, one for every
    exponent from the lowest to the largest exponent entered (inclusive; negative exponents are
    allowed). The L1 ratios are evenly spaced values between 0 and 1 inclusive. Typing 'Exit' at
    any prompt terminates the program through sys.exit.

    Returns:
        A tuple (pipe_elastic_net, params) where pipe_elastic_net is a StandardScaler followed by an
        ElasticNet, and params is the parameter grid with keys 'classifier__alpha' (list of powers)
        and 'classifier__l1_ratio' (numpy array from numpy.linspace)
    """

    def ask_for_integer(prompt, find_problem, not_integer_message):
        """Prompt until an integer is typed for which find_problem returns None, then return it."""
        while True:
            answer = input(prompt)
            try:
                # int() is the single source of truth for what counts as an integer,
                # so inputs such as '5-' or '²' are re-prompted instead of crashing
                value = int(answer)
            except ValueError:
                if "exit" in answer.lower():
                    sys.exit("Exiting program")
                print(not_integer_message)
                continue

            problem = find_problem(value)
            if problem is None:
                return value
            print(problem)

    ############################
    # Step 1: Construct Pipeline
    ############################
    pipe_elastic_net = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('classifier', ElasticNet())
    ])

    #################################
    # Step 2: Gather user input
    #################################

    # Get base of the penalty terms
    base_int = ask_for_integer(
        "Enter a base integer (e.g. binary = base 2, decimal = base 10)" +
        "\n(Type 'Exit' to quit)\n",
        lambda value: "Base integer must be > 1 — please retry\n" if value <= 1 else None,
        "Non-decimal passed as input — please retry.\n")

    # Get minimum penalty term
    min_exponent = ask_for_integer(
        f"Give the lowest exponent value of {base_int} that our model should consider for the penalty coefficient" +
        "\n(Type 'Exit' to quit)\n",
        lambda value: None,
        "Non-decimal passed as input — please retry.\n")

    # Get maximum penalty term
    max_exponent = ask_for_integer(
        f"Give the largest exponent value of {base_int} that our model should consider for the penalty coefficient" +
        "\n(Type 'Exit' to quit)\n",
        lambda value: ("The maximum exponent must be larger than the minimum exponent — please retry.\n"
                       if value < min_exponent else None),
        "Non-decimal passed as input — please retry.\n")

    # Get range of lasso proportions
    num_variations = ask_for_integer(
        "How many variations of Lasso / Ridge regression should we consider?" +
        " \n(Type 'Exit' to quit)\n",
        lambda value: ("The number of values examined must be at least 1 — please retry\n"
                       if value < 1 else None),
        "Non-integer passed as input — please retry.\n")

    params = {"classifier__alpha": [(base_int**exp) for exp in range(min_exponent, max_exponent + 1)],
              "classifier__l1_ratio": np.linspace(0, 1, num_variations, endpoint=True)}

    return (pipe_elastic_net, params)



def user_input_knn_regression():
    """
    Build the scaling + k-nearest-neighbors regression pipeline.

    NOTE: this function is an unfinished stub. The pipeline is constructed, but no user input
    is gathered and nothing is returned (the function implicitly returns None).
    """
    ############################
    # Step 1: Construct Pipeline
    ############################
    knn_steps = [
        ('scaler', preprocessing.StandardScaler()),
        ('classifier', KNeighborsRegressor()),
    ]
    pipe_knn = Pipeline(knn_steps)

    #################################
    # Step 2: Gather user input
    #################################
    # TODO: not implemented yet
    
     
    
    

# Exploratory Data Analysis

In [22]:
##################################
# STEP 1: OBTAIN DESIRED STOCK + INTERVALS
##################################
stonks = create_stock_data_from_input()

# Raw closing prices, plotted with warnings silenced
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y='Close',
        title=f"{stonks.stock_symbol} Stock Prices",
    )
    fig.show()


##################################
# STEP 2: SET UP TIME-SERIES DATA
##################################
# The number of lag variables should not exceed 30% of the
# number of observations
threshold = int(0.3 * stonks.data_frame.shape[0])
add_lags_from_input(stonks, threshold)

# Indicator window: 80% of the chosen lag length, plus a half-length window
display_days = int(0.8 * stonks.lag_length)
half_display_days = int(display_days / 2)

# Column names used below to select the indicator columns for plotting
SMA_string = f"SMA_{display_days}"
lower_boll_str = f"lower_boll_{display_days}"
upper_boll_str = f"upper_boll_{display_days}"
EMA_string = f"EMA_{display_days}"
EMA_string_2 = f"EMA_{half_display_days}"
MACD_string = f"MACD_{half_display_days}_{display_days}"

stonks.add_simple_moving_average(display_days)
stonks.add_upper_bollinger(display_days)
stonks.add_lower_bollinger(display_days)
stonks.add_exponential_moving_average(display_days)
stonks.add_exponential_moving_average(half_display_days)
stonks.add_MACD(display_days, half_display_days)


with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Close price with its simple moving average and dotted Bollinger bands
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=[SMA_string, upper_boll_str, lower_boll_str, 'Close'],
        color_discrete_map={
            SMA_string: "#85deb1",
            upper_boll_str: "#b6d1c3",
            lower_boll_str: "#b6d1c3",
            "Close": "#0059ff",
        },
        title=f"{stonks.stock_symbol} Stock Prices w/ Bollinger Bands",
    )
    for band_name in (upper_boll_str, lower_boll_str):
        fig.update_traces(selector={"name": band_name}, line={"dash": "dot"})
    fig.show()

    # Full-length and half-length exponential moving averages
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=[EMA_string, EMA_string_2],
        color_discrete_map={
            EMA_string: "#38b9ff",
            EMA_string_2: "#44fcd1",
        },
        title=f"{stonks.stock_symbol} Exponential Moving Averages",
    )
    fig.show()

    # MACD in a shorter panel
    fig = px.line(
        stonks.data_frame,
        x=stonks.data_frame.index,
        y=MACD_string,
        color_discrete_map={MACD_string: "#edff47"},
        title=f"{stonks.stock_symbol} MACD",
    )
    fig.update_layout(height=220)
    fig.show()

How many days of intraday stock market data should  we use to build our model? Enter a value between  1 and 7:
(Type 'Exit' to quit)
4
How often should our model look at stock prices? Choose from 1m, 2m, 5m, 15m, 30m or 60m.
(Type 'Exit' to quit)
2m
Please input the stock symbol you would like to examine: (e.g. AAPL)
NVAX


[*********************100%%**********************]  1 of 1 completed


(IMPORTANT!)

How many previous 2min intervals should ALL of our machine learning models look at?
(Type 'Exit' to quit)
30


# Setting Up Models

In [23]:
# Predictors: every column whose name matches 'Close_L'
# (presumably the lag columns added during the EDA step — verify against add_lags_from_input)
predictors = stonks.data_frame.filter(regex='Close_L')
# Outcome: the closing price itself
outcome = stonks.data_frame["Close"]

# Seeded split that holds out 25% of the rows for testing
X_train, X_test, Y_train, Y_test = train_test_split(
    predictors,
    outcome,
    test_size=0.25,
    random_state=SEED,
)

### Random Forest

In [24]:
#################################
# Step 3: Grid Search
#################################

(pipe_random_forest, params, dim) = user_input_random_forest(predictors,
                                                        stonks.granularity)

n_features = dim[0]
n_leaf = dim[1]
n_estimators = dim[2]


with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    print(f"\n\nRunning {CROSS_FOLDS * len(n_features) * len(n_leaf) * len(n_estimators)} models .....\n\n" )
    search = GridSearchCV(pipe_random_forest,
                          params,
                          scoring=PERFORMANCE_METRIC,
                          cv=CROSS_FOLDS,
                          n_jobs=PARALLELIZATION,
                         )
    search.fit(X_train, Y_train)


#################################
# Step 4: Plot Grid Performance
#################################
# GridSearchCV enumerates the candidates with the parameter names sorted alphabetically and the
# last name varying fastest (max_features, min_samples_leaf, n_estimators), hence this axis order.
scores_mean = search.cv_results_['mean_test_score'].reshape(len(n_features),
                                                            len(n_leaf),
                                                            len(n_estimators))

# One row per forest size; one score column is added per (features, leaf) combination below
random_forest_performance = pd.DataFrame({
    "Trees" : n_estimators
})


# One subplot per min_samples_leaf value, stacked vertically
titles = [f'min_samples_leaf={leaf}' for leaf in n_leaf]
subs = make_subplots(rows=len(n_leaf),
                     cols=1,
                     subplot_titles=titles
                    )

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for idx_1, val_1 in enumerate(n_leaf):
        for idx_2, val_2 in enumerate(n_features):
            feature_str = "features_" + str(val_2) + "_leaf_" + str(val_1)
            random_forest_performance[feature_str] = scores_mean[idx_2, idx_1, :]

        # match on the exact suffix so that e.g. leaf 1 can never pick up the columns of leaf 12
        leaf_suffix = f"_leaf_{val_1}"
        fig = px.line(random_forest_performance, x="Trees",
                      y=[col for col in random_forest_performance.columns if col.endswith(leaf_suffix)],
                      title=f"Random Forest Performance (leaf={val_1})").update_layout(yaxis_title="R^2 value")

        # copy the traces of the per-leaf figure into its subplot row
        # (add_trace replaces the deprecated append_trace)
        for trace in fig.data:
            subs.add_trace(trace, row=(idx_1 + 1), col=1)

        subs.update_xaxes(title_text="# Trees", row=(idx_1 + 1), col=1)

    subs.update_layout(height=(220 * len(n_leaf)), title_text="Random Forest Performance")
    subs.show()


print(f"\nThe best performing Random Forest had {search.best_params_['classifier__n_estimators']} trees;" +
     "\nin each tree, a split point is only considered if it leaves at least " +
      f"{search.best_params_['classifier__min_samples_leaf']} samples in each of the left & right branches." +
     f"\nEach decision tree only considered {search.best_params_['classifier__max_features']}" +
     f" {stonks.granularity}in intervals when looking for the best split.\nThis gave a mean R^2 score of " +
     f"{search.best_score_} across the {CROSS_FOLDS} folds of the training data.\n")


print("\n\n")
print('Training set score: ' + str(search.score(X_train, Y_train)))
print('Test set score: ' + str(search.score(X_test, Y_test)))



What's the minimum number of trees that should be in our forest?
(Type 'Exit' to quit)
1
What's the maximum number of trees that should be in our forest?
(Type 'Exit' to quit)
60
How many forest sizes between 1 trees and 60 trees should we look at? 
(Type 'Exit' to quit)
10
---------------------------------------

Give a lower bound for the number of random 2min intervals each tree in our forest can see (between 1 and 30)
(Type 'Exit' to quit)
2
Give an upper bound for the number of random 2min intervals each tree in our forest can see (between 1 and 30)
(Type 'Exit' to quit)
30
How many 2min intervals between 2 and 30 should we consider? 
(Type 'Exit' to quit)
10
---------------------------------------

Give a lower bound for the number of samples in a tree branch before it should split
(Type 'Exit' to quit)
3
Give an upper bound for the number of samples in a tree branch before it should split
(Type 'Exit' to quit)
15
How many values should we consider between 3 and 15? 
(Type 'Exit'


The best performing Random Forest had 53 trees;
in each tree, a split point is only considered if it leaves 3 samples in each of the left & right branches.
Each decision tree only considered 23 2min intervals when looking for the best split.
This gave a mean R^2 score of 0.9895040781312696 across the 10 folds of the training data.




Training set score: 0.9966058942346923
Test set score: 0.9888944911017863


### Elastic Net

In [25]:

#################################
# Step 3: Grid Search
#################################
pipe_elastic_net, params = user_input_elastic_net()

alphas = params['classifier__alpha']
l1_ratios = params['classifier__l1_ratio']

# NOTE: n_jobs is deliberately not passed here. When parallelization is
# applied, the parallel tasks do NOT have warnings suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    print(f"\n\nRunning {CROSS_FOLDS * len(alphas) * len(l1_ratios)} models .....\n\n" )
    search = GridSearchCV(
        pipe_elastic_net,
        params,
        scoring=PERFORMANCE_METRIC,
        cv=CROSS_FOLDS,
    )
    search.fit(X_train, Y_train)


#################################
# Step 4: Plot Grid Performance
#################################
# Rows follow the alphas, columns follow the L1 ratios
grid_shape = (len(alphas), len(l1_ratios))
scores_mean = search.cv_results_['mean_test_score'].reshape(grid_shape)

# One row per L1 ratio, one score column per penalty coefficient
elastic_net_performance = pd.DataFrame({"L1_Ratio": l1_ratios})
for idx, val in enumerate(alphas):
    alpha_str = f"alpha_{val}"
    elastic_net_performance[alpha_str] = scores_mean[idx, :]

alpha_columns = [name for name in elastic_net_performance.columns if 'alpha' in name]
fig = px.line(
    elastic_net_performance,
    x="L1_Ratio",
    y=alpha_columns,
    title="Elastic Net Performance",
)
fig.update_layout(yaxis_title="R^2 value")
fig.show()


print(f"The best performing Elastic Net regression model had an L1 ratio of {search.best_params_['classifier__l1_ratio']}" + 
     f"\n(i.e. {100*search.best_params_['classifier__l1_ratio']}% Lasso regression)\n" +
     f"together with penalty coefficient of {search.best_params_['classifier__alpha']}" +
     "\nThis gave a mean R^2 score of " +
     f"{search.best_score_} across the {CROSS_FOLDS} folds of the training data.\n")


print("\n\n")
print('Training set score: ' + str(search.score(X_train, Y_train)))
print('Test set score: ' + str(search.score(X_test, Y_test)))

Enter a base integer (e.g. binary = base 2, decimal = base 10)
(Type 'Exit' to quit)
2
Give the lowest exponent value of 2 that our model should consider for the penalty coefficient
(Type 'Exit' to quit)
-10
Give the largest exponent value of 2 that our model should consider for the penalty coefficient
(Type 'Exit' to quit)
10
How many variations of Lasso / Ridge regression should we consider? 
(Type 'Exit' to quit)
10


Running 2100 models .....




The best performing Elastic Net regression model had an L1 ratio of 0.7777777777777777
(i.e. 77.77777777777777% Lasso regression)
together with penalty coefficient of 0.001953125
This gave a mean R^2 score of 0.9896799746292138 across the 10 folds of the training data.




Training set score: 0.990073965923783
Test set score: 0.9907639101030782


### k-Nearest Neighbors

In [26]:
# Scale the predictors, then fit a k-nearest-neighbors regressor
pipe_knn = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('classifier', KNeighborsRegressor())
])


# Candidate neighbor counts 1..60 and 20 evenly spaced Minkowski powers between 1 and 5
num_neighbors = np.arange(1, 61)
p_values = np.linspace(1, 5, 20, endpoint=True)


params = {
    'classifier__p': p_values,
    'classifier__n_neighbors': num_neighbors
}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    print(f"\n\nRunning {CROSS_FOLDS * len(p_values) * len(num_neighbors)} models .....\n\n" )

    search = GridSearchCV(estimator=pipe_knn,
                          param_grid=params,
                          scoring=PERFORMANCE_METRIC,
                          n_jobs=PARALLELIZATION,
                          cv=CROSS_FOLDS)

    search.fit(X_train, Y_train)


# GridSearchCV enumerates the candidates with the parameter names sorted alphabetically and the
# last name varying fastest (n_neighbors, then p), hence rows = neighbors and columns = p values.
scores_mean = search.cv_results_['mean_test_score'].reshape(len(num_neighbors),
                                                          len(p_values))

# One row per neighbor count, one score column per p value
knn_performance = pd.DataFrame({
    "num_neighbors" : num_neighbors
})

for idx, val in enumerate(p_values):
    p_str = "p_" + str(val)
    knn_performance[p_str] = scores_mean[:, idx]

fig = px.line(knn_performance, x="num_neighbors",
                  y=[col for col in knn_performance.columns if col.startswith('p_')],
                 title="k-Nearest Neighbors").update_layout(
    yaxis_title="R^2 value"
)
fig.show()


# The distance shown is the Minkowski distance that the p parameter actually selects
best_p = search.best_params_['classifier__p']
print(f"The best performing k-Nearest Neighbors model looked at {search.best_params_['classifier__n_neighbors']}" +
     " neighboring data points\n" +
     f"using the L{best_p} (Minkowski) metric (i.e. the distance between points x and y" +
     f" is (sum_i |x_i - y_i|^{best_p})^(1/{best_p}))" +
     "\nThis gave a mean R^2 score of " +
     f"{search.best_score_} across the {CROSS_FOLDS} folds of the training data.\n")


print("\n\n")
print('Training set score: ' + str(search.score(X_train, Y_train)))
print('Test set score: ' + str(search.score(X_test, Y_test)))



Running 12000 models .....




The best performing k-Nearest Neighbors model looked at 2 neighboring data points
using the L1.2105263157894737 metric (i.e. the distance between points x and y  is |x^1.2105263157894737 - y^1.2105263157894737|^(1/1.2105263157894737))
This gave a mean R^2 score of 0.9886726113303815 across the 10 folds of the training data.




Training set score: 0.9958684061632661
Test set score: 0.9918040159437335


In [None]:
# Scratch check: today's weekday as an integer (Monday is 0, Sunday is 6)
date.today().weekday()