## Import Libraries


In [117]:
import yfinance as yf
import pandas as pd
import numpy as np
import sys
import warnings

import matplotlib.dates as mdates
from datetime import date, datetime, timedelta
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from plotnine import *
from plotly.tools import mpl_to_plotly as ggplotly

## Add Data Wrappers

In [118]:
class StockDataWrapper:
    """Pair a stock's price table with the settings used to build it.

    The three settings (``granularity``, ``num_days``, ``lag_length``) are
    plain public attributes; the value ``0`` means "not set yet".
    """

    def __init__(self, data_frame, stock_symbol):
        """Store the price table and ticker symbol; settings start unset."""
        self.data_frame = data_frame
        self.stock_symbol = stock_symbol
        # 0 is the "unset" sentinel for each of the settings below.
        self.granularity = 0
        self.num_days = 0
        self.lag_length = 0

    def __str__(self):
        """Return the ticker, one line per setting, then the data frame."""
        # (current value, line used while unset, prefix used once set)
        settings = (
            (self.granularity,
             "Granularity of data not set\n",
             "Granularity: "),
            (self.num_days,
             "Number of days of stock data not set\n",
             "Number of days of stock data: "),
            (self.lag_length,
             "Number of lag variables not set\n",
             "Number of lag variables: "),
        )

        pieces = [f'Stock Name: {self.stock_symbol}\n']
        for value, unset_line, set_prefix in settings:
            if value == 0:
                pieces.append(unset_line)
            else:
                pieces.append(f"{set_prefix}{value}\n")

        pieces.append("\n\n")
        pieces.append(str(self.data_frame))
        return "".join(pieces)

    def compute_maximal_lag_variables(self):
        """Return the largest permitted number of lag variables.

        The result is 30% (truncated to an int) of the number of data points
        implied by ``granularity`` and ``num_days``.

        Returns:
            int: the lag limit, or ``-1`` (after printing an explanation)
            when a setting is missing or the granularity is not recognised.
        """
        if self.granularity == 0 or self.num_days == 0:
            print("Both the granularity and number of days of stock data"
                  " must be set prior to computing the maximum number of lag variables\n")
            return -1

        # Data points per day assumed for each supported interval.
        entries_per_day = (
            ('1m', 389),
            ('2m', 195),
            ('5m', 78),
            ('15m', 26),
            ('30m', 13),
            ('1h', 7),
        )
        for interval, per_day in entries_per_day:
            if self.granularity == interval:
                return int(0.3 * (per_day * self.num_days))

        print("Granularity is not set properly: must be set to "
              "one of: 1m, 2m, 5m, 15m, 30m, 1h\n")
        return -1

## Import Data

In [119]:
def create_stock_data_from_input():
    """Prompt for download settings, fetch the prices, and wrap the result.

    Asks on stdin for the number of days (1-7), the sampling interval (one of
    1m, 2m, 5m, 15m, 30m, 1h) and a ticker symbol.  The first two prompts are
    repeated until the answer is valid; an answer containing "exit" (any
    case) at either of them ends the program through ``sys.exit``.

    Returns:
        StockDataWrapper: wraps the frame produced by ``yf.download`` called
        with ``start = today - <days>``, ``end = today`` and the chosen
        interval, with ``num_days`` and ``granularity`` filled in.
    """
    while True:
        days_reply = input("How many days of intraday stock market data should " +
                           " we use to build our model? Enter a value between " +
                           " 1 and 7:\n(Type 'Exit' to quit)\n")

        # isdecimal() only accepts characters int() can parse; isnumeric()
        # also accepts e.g. '²' or '½', which would make int() raise.
        if days_reply.isdecimal():
            num_days_to_build = int(days_reply)
            if 0 < num_days_to_build < 8:
                break
            print("The number of days must be between 1 and 7 " +
                  "— please retry. \n (Type 'Exit' to quit)\n")
        elif "exit" in days_reply.lower():
            sys.exit("Exiting program")
        else:
            print("Non-integer passed as input — please retry. \n(Type 'Exit' to quit)\n")

    today = date.today()
    num_days_prior = today - timedelta(num_days_to_build)

    granularity_options = ['1m', '2m', '5m', '15m', '30m', '1h']

    while True:
        granularity_input = input("How often should our model look at" +
                                  " stock prices? Choose from 1m, 2m, 5m," +
                                  " 15m, 30m or 1h.\n(Type 'Exit' to quit)\n"
                                  ).lower()

        if granularity_input in granularity_options:
            break
        if "exit" in granularity_input:
            sys.exit("Exiting program")
        print("Input was not among the options 1m, 2m," +
              " 5m, 15m, 30m, or 1h — please retry." +
              "\n(Type 'Exit' to quit)\n")

    stock_symbol = input("Please input the stock symbol you would like to examine: (e.g. AAPL)").upper()

    df = pd.DataFrame(yf.download(stock_symbol,
                                  start=num_days_prior,
                                  end=today,
                                  interval=granularity_input))

    stock_data = StockDataWrapper(df, stock_symbol)
    stock_data.num_days = num_days_to_build
    stock_data.granularity = granularity_input

    return stock_data

## Create Lag Variables

In [120]:
def add_lag_variables_to_df(data_frame, num_lags):
    """Add ``Close_L1`` .. ``Close_L<num_lags>`` columns and back-fill NaNs.

    Column ``Close_L<k>`` is the ``Close`` column shifted down by ``k`` rows.
    The lag columns are written into ``data_frame`` itself; the value
    returned is the result of ``data_frame.bfill(axis=0)``, which fills the
    NaNs left at the top of the shifted columns.

    Args:
        data_frame: frame with a ``Close`` column.
        num_lags: how many lag columns to create.

    Returns:
        The back-filled frame.
    """
    for lag in range(1, num_lags + 1):
        data_frame[f"Close_L{lag}"] = data_frame['Close'].shift(lag)

    # Back-fill so that no NaN entries remain at the start of the lag columns.
    return data_frame.bfill(axis=0)


def add_lags_from_input(stock_data):
    """Ask how many lag variables to use and add them to ``stock_data``.

    Any existing ``Close_L*`` columns are dropped first.  The prompt repeats
    until the answer is an integer no larger than the limit returned by
    ``stock_data.compute_maximal_lag_variables()``; an answer containing
    "exit" (any case) ends the program through ``sys.exit``.

    On success ``stock_data.lag_length`` and ``stock_data.data_frame`` are
    updated.  If the limit cannot be computed (``-1``), a message is printed
    and nothing is changed.

    Returns:
        None
    """
    max_lag_len = stock_data.compute_maximal_lag_variables()
    if max_lag_len == -1:
        print("Cannot set lag variables with current settings")
        return

    # Clean up previous entries
    stock_data.data_frame = stock_data.data_frame[stock_data.data_frame.columns.drop(
        list(stock_data.data_frame.filter(regex='Close_L'))
        )]

    while True:
        reply = input("How many previous data points should our model look at?\n (Type 'Exit' to quit)")

        # isdecimal() only accepts characters int() can parse; isnumeric()
        # also accepts e.g. '²' or '½', which would make int() raise.
        if reply.isdecimal():
            lag_length = int(reply)
            if lag_length <= max_lag_len:
                stock_data.lag_length = lag_length
                stock_data.data_frame = add_lag_variables_to_df(stock_data.data_frame,
                                                                lag_length)
                return
            # A single '%' is correct here: this is an f-string, not
            # %-formatting, so '%%' would be printed literally.
            print("The number of previous data points considered should" +
                  f" not exceed 30% \nof the total number of data points (in this case, {max_lag_len}) -- please" +
                  " retry. \n\n(Type 'Exit' to quit)")
        elif "exit" in reply.lower():
            # Test the text just entered; the previous code referenced a
            # name that does not exist in this function.
            sys.exit("Exiting program")
        else:
            print("""Non-integer passed as input — please retry.
                  \n
                  (Type 'Exit' to quit)\n""")

## Add Simple Moving Average

In [121]:
def simple_moving_average(data_frame, lag_length):
    """Return the per-row average of the first ``lag_length`` lag columns.

    Sums columns ``Close_L1`` .. ``Close_L<lag_length>`` across each row
    (NaNs are skipped by the sum) and divides by ``lag_length``.

    Args:
        data_frame: frame containing the ``Close_L*`` columns.
        lag_length: number of lag columns to average.

    Returns:
        A Series with one value per row of ``data_frame``.
    """
    lag_columns = [f'Close_L{k}' for k in range(1, lag_length + 1)]
    row_totals = data_frame[lag_columns].sum(axis=1, skipna=True)
    return row_totals / lag_length


def add_simple_moving_average(stock_data, lag_length):
    """Add an ``SMA_<lag_length>`` column to ``stock_data.data_frame``.

    The column is produced by ``simple_moving_average``.  Nothing is added,
    and a message is printed, when ``lag_length`` is not an int, when no lag
    variables have been set on ``stock_data``, or when ``lag_length``
    exceeds ``stock_data.lag_length``.

    Returns:
        None
    """
    problem = None
    if not isinstance(lag_length, int):
        problem = "Parameter lag_length must be integer"
    elif stock_data.lag_length == 0:
        problem = "Lag variables must be set prior to adding simple moving average"
    elif lag_length > stock_data.lag_length:
        problem = ("Cannot take the average of more lag varaibles than are availible"
                   f" (currently {stock_data.lag_length})\n")

    if problem is not None:
        print(problem)
        return

    stock_data.data_frame[f"SMA_{lag_length}"] = simple_moving_average(
        stock_data.data_frame, lag_length)


## Add Simple Moving Standard Deviation

In [122]:
def simple_moving_standard_deviation(data_frame, lag_length):
    """Return the per-row standard deviation of the first lag columns.

    Applies ``DataFrame.std`` across columns ``Close_L1`` ..
    ``Close_L<lag_length>`` of each row, skipping NaNs.

    Args:
        data_frame: frame containing the ``Close_L*`` columns.
        lag_length: number of lag columns to include.

    Returns:
        A Series with one value per row of ``data_frame``.
    """
    lag_columns = [f'Close_L{k}' for k in range(1, lag_length + 1)]
    selected = data_frame[lag_columns]
    return selected.std(axis=1, skipna=True)

def add_simple_moving_standard_deviation(stock_data, lag_length):
    """Add an ``SMSD_<lag_length>`` column to ``stock_data.data_frame``.

    The column is produced by ``simple_moving_standard_deviation``.  Nothing
    is added, and a message is printed, when ``lag_length`` is not an int,
    when no lag variables have been set on ``stock_data``, or when
    ``lag_length`` exceeds ``stock_data.lag_length``.

    Returns:
        None
    """
    if not isinstance(lag_length, int):
        print("Parameter lag_length must be integer")
        return
    if stock_data.lag_length == 0:
        # The message names this indicator (it used to say "moving average").
        print("Lag variables must be set prior to adding simple moving standard deviation")
        return
    if lag_length > stock_data.lag_length:
        print("Cannot take the standard deviation of more lag variables than are available" +
              f" (currently {stock_data.lag_length})\n")
        return

    column_label = "SMSD_" + str(lag_length)
    stock_data.data_frame[column_label] = simple_moving_standard_deviation(
        stock_data.data_frame, lag_length)
    
def add_upper_bollinger(stock_data, lag_length):
    """Add an ``upper_boll_<lag_length>`` column to ``stock_data.data_frame``.

    The band is ``SMA_<lag_length> + SMSD_<lag_length>``; whichever of those
    two columns is missing is created first.  Nothing is added, and a
    message is printed, when ``lag_length`` is not an int, when no lag
    variables have been set, or when ``lag_length`` exceeds
    ``stock_data.lag_length``.

    Returns:
        None
    """
    problem = None
    if not isinstance(lag_length, int):
        problem = "Parameter lag_length must be integer"
    elif stock_data.lag_length == 0:
        problem = "Lag variables must be set prior to adding simple moving average"
    elif lag_length > stock_data.lag_length:
        problem = ("Cannot take the average of more lag varaibles than are availible"
                   f" (currently {stock_data.lag_length})\n")

    if problem is not None:
        print(problem)
        return

    deviation_col = f"SMSD_{lag_length}"
    average_col = f"SMA_{lag_length}"

    # Create the prerequisite columns on demand (deviation first, then mean).
    if deviation_col not in stock_data.data_frame.columns:
        add_simple_moving_standard_deviation(stock_data, lag_length)
    if average_col not in stock_data.data_frame.columns:
        add_simple_moving_average(stock_data, lag_length)

    frame = stock_data.data_frame
    frame[f"upper_boll_{lag_length}"] = frame[average_col] + frame[deviation_col]
    
def add_lower_bollinger(stock_data, lag_length):
    """Add a ``lower_boll_<lag_length>`` column to ``stock_data.data_frame``.

    The band is ``SMA_<lag_length> - SMSD_<lag_length>``; whichever of those
    two columns is missing is created first.  Nothing is added, and a
    message is printed, when ``lag_length`` is not an int, when no lag
    variables have been set, or when ``lag_length`` exceeds
    ``stock_data.lag_length``.

    Returns:
        None
    """
    problem = None
    if not isinstance(lag_length, int):
        problem = "Parameter lag_length must be integer"
    elif stock_data.lag_length == 0:
        problem = "Lag variables must be set prior to adding simple moving average"
    elif lag_length > stock_data.lag_length:
        problem = ("Cannot take the average of more lag varaibles than are availible"
                   f" (currently {stock_data.lag_length})\n")

    if problem is not None:
        print(problem)
        return

    deviation_col = f"SMSD_{lag_length}"
    average_col = f"SMA_{lag_length}"

    # Create the prerequisite columns on demand (deviation first, then mean).
    if deviation_col not in stock_data.data_frame.columns:
        add_simple_moving_standard_deviation(stock_data, lag_length)
    if average_col not in stock_data.data_frame.columns:
        add_simple_moving_average(stock_data, lag_length)

    frame = stock_data.data_frame
    frame[f"lower_boll_{lag_length}"] = frame[average_col] - frame[deviation_col]
    

# Exploratory Data Analysis

In [128]:
# Download the data interactively and plot the raw closing price.
stonks = create_stock_data_from_input()
p = ggplot(stonks.data_frame, aes(x=stonks.data_frame.index, y='Close')) +\
    geom_line() +\
    scale_x_date(date_labels =  '%m-%d %H:%M') +\
    theme_dark()
p1 = ggplotly(p.draw())
p1.show()

# Ask how many lag variables to build; the indicators below depend on them.
add_lags_from_input(stonks)

# The indicators are computed over 80% of the chosen number of lags.
display_days = int(0.8 * stonks.lag_length)
SMA_string = 'SMA_' + str(display_days)
lower_boll_str = 'lower_boll_' + str(display_days)
upper_boll_str = 'upper_boll_' + str(display_days)


add_simple_moving_average(stonks, display_days)
add_upper_bollinger(stonks, display_days)
add_lower_bollinger(stonks, display_days)


# Overlay the moving average (blue) and both bands (green) on the close (black).
p = ggplot(stonks.data_frame, aes(x=stonks.data_frame.index)) + \
    geom_line( aes(y=SMA_string), color="blue") + \
    geom_line( aes(y=upper_boll_str), color='green') + \
    geom_line( aes(y=lower_boll_str), color='green') + \
    geom_line( aes(y='Close'), color="black") + \
    scale_x_date(date_labels =  '%m-%d %H:%M') 
p2 = ggplotly(p.draw())
# (A dangling `p2.` line was removed here: it was a SyntaxError that stopped
# the whole cell from running.)
p2.show()



How many days of intraday stock market data should  we use to build our model? Enter a value between  1 and 7:
(Type 'Exit' to quit)
1
How often should our model look at stock prices? Choose from 1m, 2m, 5m, 15m, 30m or 1h.
(Type 'Exit' to quit)
1m
Please input the stock symbol you would like to examine: (e.g. AAPL)nvda


[*********************100%%**********************]  1 of 1 completed


How many previous data points should our model look at?
 (Type 'Exit' to quit)30


In [126]:
def _exponential_moving_average_helper(s, smoothing_factor):
    vec_len = len(s)
    output_vec = list(range(vec_len))
    output_vec[0] = s[0]
  
    for i in range(1,vec_len):
        output_vec[i] = smoothing_factor * s[i] + (1 - smoothing_factor) * output_vec[i-1]
  
  
    return(output_vec[vec_len-1])

def exponential_moving_average(s, lag_length):
    """Return a trailing exponential moving average for every position of ``s``.

    Entry ``i`` is computed by ``_exponential_moving_average_helper`` from
    the values *before* position ``i`` (the value at ``i`` itself is not
    included):

    * positions 0 and 1 are both set to the first value;
    * for ``2 <= i <= lag_length`` all ``i`` earlier values are used with a
      smoothing factor of ``2 / (i + 1)``;
    * afterwards the previous ``lag_length`` values are used with a
      smoothing factor of ``2 / (lag_length + 1)``.

    Args:
        s: iterable of numbers (list, pandas Series, ...).  Only the order of
            the values matters; any index labels are ignored.
        lag_length: window size.

    Returns:
        list: one value per element of ``s`` (empty when ``s`` is empty).
    """
    # Use plain positional values: ``s[0]`` on a pandas Series is a label
    # lookup (or a deprecated positional fallback), which previously had to
    # be hidden behind a blanket warning filter.
    values = list(s)
    length = len(values)
    if length == 0:
        return []

    # The first two outputs are seeded with the first observation; inputs
    # with a single element get just one output.
    output_vec = [values[0]] * min(length, 2)
    smoothing_factor = float(2 / (lag_length + 1))

    for i in range(2, length):
        # If there are fewer than lag_length points before the current one,
        # use all of them to get the closest thing to a running average.
        if i <= lag_length:
            output_vec.append(
                _exponential_moving_average_helper(values[0:i], 2 / (i + 1)))
        else:
            output_vec.append(
                _exponential_moving_average_helper(values[i - lag_length:i],
                                                   smoothing_factor))

    return output_vec


def add_exponential_moving_average(stock_data, lag_length):
    """Add an ``EMA_<lag_length>`` column to ``stock_data.data_frame``.

    The column is produced by ``exponential_moving_average`` applied to the
    ``Close`` column.  Nothing is added, and a message is printed, when
    ``lag_length`` is not an int, when no lag variables have been set on
    ``stock_data``, or when ``lag_length`` exceeds ``stock_data.lag_length``.

    Returns:
        None
    """
    if not isinstance(lag_length, int):
        print("Parameter lag_length must be integer")
        return
    if stock_data.lag_length == 0:
        # The message names this indicator (it used to say "simple moving average").
        print("Lag variables must be set prior to adding exponential moving average")
        return
    if lag_length > stock_data.lag_length:
        print("Cannot take the average of more lag variables than are available" +
              f" (currently {stock_data.lag_length})\n")
        return

    column_label = "EMA_" + str(lag_length)
    stock_data.data_frame[column_label] = exponential_moving_average(
        stock_data.data_frame["Close"], lag_length)

    
def add_MACD(stock_data, max_lag, min_lag):
    """Add a ``MACD_<min_lag>_<max_lag>`` column to ``stock_data.data_frame``.

    The column is ``EMA_<min_lag> - EMA_<max_lag>``.  Either EMA column is
    created through ``add_exponential_moving_average`` when it is missing.
    Nothing is added, and a message is printed, when the lags are not ints,
    when ``min_lag`` is not strictly smaller than ``max_lag``, or when an
    EMA column could not be created.

    Args:
        stock_data: wrapper whose ``data_frame`` is updated in place.
        max_lag: window of the slower EMA.
        min_lag: window of the faster EMA.

    Returns:
        None
    """
    if not isinstance(max_lag, int) or not isinstance(min_lag, int):
        print("Parameters max_lag and min_lag must be integer")
        return
    if min_lag >= max_lag:
        print("min_lag must be strictly smaller than max_lag")
        return

    EMA_string_min = "EMA_" + str(min_lag)
    EMA_string_max = "EMA_" + str(max_lag)
    column_label = "MACD_" + str(min_lag) + "_" + str(max_lag)

    for lag, ema_column in ((min_lag, EMA_string_min), (max_lag, EMA_string_max)):
        if ema_column not in stock_data.data_frame.columns:
            # Operate on the wrapper that was passed in, not on a notebook
            # global, so the function works for any StockDataWrapper.
            add_exponential_moving_average(stock_data, lag)
        if ema_column not in stock_data.data_frame.columns:
            # The EMA step declined (it prints its own reason); stop here
            # instead of failing with a KeyError on the subtraction below.
            print(f"Could not add {column_label}: column {ema_column} is unavailable")
            return

    stock_data.data_frame[column_label] = (stock_data.data_frame[EMA_string_min]
                                           - stock_data.data_frame[EMA_string_max])
    
    
# Column names for the slow (full window) and fast (half window) EMAs.
EMA_string = f'EMA_{display_days}'
EMA_string_2 = f'EMA_{int(display_days / 2)}'

add_exponential_moving_average(stonks, display_days)
add_exponential_moving_average(stonks, int(display_days/2))

# Plot both EMAs against the frame's index.
p = (
    ggplot(stonks.data_frame, aes(x=stonks.data_frame.index))
    + geom_line(aes(y=EMA_string), color='chartreuse')
    + geom_line(aes(y=EMA_string_2), color='blue')
)
p1 = ggplotly(p.draw())

p1.show()


# MACD is the fast EMA minus the slow EMA; plot it on its own.
MACD_string = f'MACD_{int(display_days / 2)}_{display_days}'

add_MACD(stonks, display_days, int(display_days/2))

p = (
    ggplot(stonks.data_frame, aes(x=stonks.data_frame.index))
    + geom_line(aes(y=MACD_string), color='cyan')
)

p2 = ggplotly(p.draw())
p2.show()



