# Analysing stock market prices from Yahoo Finance data

https://medium.com/analytics-vidhya/trading-dashboard-with-yfinance-python-56fa471f881d

https://pypi.org/project/yfinance/

https://hvplot.holoviz.org/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from matplotlib.ticker import MultipleLocator, FormatStrFormatter, AutoMinorLocator
import datetime
import yfinance as yf
import hvplot.pandas

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: stock, time frame, sampling interval and the
# moving-average windows derived from them.
# ---------------------------------------------------------------------------

# folder with one "<TICKER>.txt" news file per stock
INPUT_NEWS_FOLDER_NAME = "/Users/abuzatu/Work/data/finance/stocks"

# choose the company, e.g. ticker SPCE for the company Virgin Galactic
STOCK_TICKER = "SPCE"
STOCK_NAME = "Virgin Galactic"

LOCALIZE_US_STOCK_MARKET = "America/New_York"
# midnight of today's calendar date (as seen by this machine), stamped with the market time zone
DATETIME_TODAY = pd.to_datetime(str(datetime.date.today())).tz_localize(LOCALIZE_US_STOCK_MARKET)
# 09:00 on the next day, used as the default end of the data range
DATETIME_TOMORROW = DATETIME_TODAY + pd.Timedelta(9, "h") + pd.Timedelta(1, "d")

# set the time frame of interest and the interval
# True  -> the range is derived below from NB_DAYS_LOOK_BACK, ending at DATETIME_TOMORROW
# False -> the explicit range written in the else-branch is used
USE_DEFAULT_TIME_FRAME = True
if USE_DEFAULT_TIME_FRAME:
    DATE_INITIAL_DATA = None
    DATE_FINAL_DATA = None
    DATE_INITIAL_PLOT = None
    DATE_FINAL_PLOT = None
else:
    STRING_DATE_INITIAL_DATA = "2020-12-22 09:20:00"
    STRING_DATE_FINAL_DATA = "2020-12-22 23:59:59"
    STRING_DATE_INITIAL_PLOT = STRING_DATE_INITIAL_DATA
    STRING_DATE_FINAL_PLOT = STRING_DATE_FINAL_DATA
    DATE_INITIAL_DATA = pd.to_datetime(STRING_DATE_INITIAL_DATA).tz_localize(LOCALIZE_US_STOCK_MARKET)
    DATE_FINAL_DATA = pd.to_datetime(STRING_DATE_FINAL_DATA).tz_localize(LOCALIZE_US_STOCK_MARKET)
    DATE_INITIAL_PLOT = pd.to_datetime(STRING_DATE_INITIAL_PLOT).tz_localize(LOCALIZE_US_STOCK_MARKET)
    DATE_FINAL_PLOT = pd.to_datetime(STRING_DATE_FINAL_PLOT).tz_localize(LOCALIZE_US_STOCK_MARKET)

# 1d - for all historical data
# 1h - for the last 730 days (~ 2 years)
# 30m, 15m, 5m, 2m - for the last 60 days (~ 2 months)
# 1m - for the last 7 days
INTERVAL = "1h"  # 1m (minute) works only for the previous week from the current moment

# One row per supported interval:
#   INTERVAL -> (DATETIME, INTERVAL_NUMBER_SHORT, MULTIPLY_WINDOW, NB_DAYS_LOOK_BACK, DO_PLOT_NEWS)
# DATETIME is the name of the timestamp column after reset_index().
# MULTIPLY_WINDOW is the number of intervals in one trading day: a day has
# 7 trading-hour intervals, but the last hour is only half an hour long.
INTERVAL_SETTINGS = {
    "1d": ("Date", 0, 1, 730 + 365, True),
    "1h": ("Date", 0.5, 7, 730, True),
    "30m": ("Datetime", 0.0, 7 * 2 - 1, 60, True),
    "15m": ("Datetime", 0.0, 7 * 4 - 2, 60, True),
    "5m": ("Datetime", 0.0, 7 * 12 - 6, 60, True),
    "2m": ("Datetime", 0.0, 7 * 30 - 15, 60, True),
    "1m": ("Datetime", 0.0, 7 * 60 - 30, 1, False),
}
if INTERVAL not in INTERVAL_SETTINGS:
    raise RuntimeError(f"INTERVAL={INTERVAL} is not well defined!")
(
    DATETIME,
    INTERVAL_NUMBER_SHORT,
    MULTIPLY_WINDOW,
    NB_DAYS_LOOK_BACK,
    DO_PLOT_NEWS,
) = INTERVAL_SETTINGS[INTERVAL]

# no explicit range given: look back NB_DAYS_LOOK_BACK days from tomorrow morning
if DATE_FINAL_DATA is None:
    DATE_FINAL_DATA = DATETIME_TOMORROW
    DATE_INITIAL_DATA = DATE_FINAL_DATA - pd.Timedelta(NB_DAYS_LOOK_BACK, "d")
    DATE_FINAL_PLOT = DATE_FINAL_DATA
    DATE_INITIAL_PLOT = DATE_INITIAL_DATA

# split e.g. "30m" into the number 30 and the unit "m"
INTERVAL_NUMBER = int(INTERVAL[0:-1])
INTERVAL_UNIT = INTERVAL[-1:]

# number of days in simple moving averages (SMA), converted to a number of intervals
SHORT_WINDOW = 21 * MULTIPLY_WINDOW  # 50
MEDIUM_WINDOW = 55 * MULTIPLY_WINDOW  # 100
LONG_WINDOW = 200 * MULTIPLY_WINDOW

In [None]:
# inspect: midnight of today's date, localized to the US stock-market time zone
DATETIME_TODAY

In [None]:
# inspect: start of the data range (derived from NB_DAYS_LOOK_BACK when no explicit range is set)
DATE_INITIAL_DATA

In [None]:
# inspect: end of the data range (DATETIME_TOMORROW when no explicit range is set)
DATE_FINAL_DATA

In [None]:
# inspect: numeric part of INTERVAL, e.g. 1 for "1h"
INTERVAL_NUMBER

In [None]:
# inspect: unit letter of INTERVAL, e.g. "h" for "1h"
INTERVAL_UNIT

In [None]:
# create the yfinance Ticker object for the company selected by STOCK_TICKER;
# it is used below to request the price history
ticker = yf.Ticker(STOCK_TICKER)
ticker

In [None]:
# Request the price history for the configured range and sampling interval.
# (alternative: ticker.history(period = "max") for everything available)
df_original = ticker.history(
    start=DATE_INITIAL_DATA,
    end=DATE_FINAL_DATA,
    interval=INTERVAL,
)
# preview the first 15 rows
df_original.head(15)

In [None]:
# preview the last 50 rows
df_original.tail(50)

In [None]:
# For the 1h interval, there is a bug as it returns a date without the time, so we need to add it by hand.
# There are 7 intervals for every day, sometimes fewer if there is a short day.
# The stock market usually starts at 9:30 am and ends at 4 pm (6.5 trading hours),
# so the last interval has only 30 minutes.
# reset_index() moves the index into a regular column so it can be read by name below.
df = df_original.reset_index()

In [None]:
def update_df_1d(dt):
    """Add ``datetime_start`` / ``datetime_end`` columns for the one-day interval.

    Every row is one trading day: the start is the row's date at 09:30 in the
    US stock-market time zone and the end is 6.5 trading hours later.

    Args:
        dt: the price DataFrame (despite the parameter name). It must contain
            the column named by ``DATETIME`` holding timezone-naive dates.
            The frame is modified in place.

    Returns:
        None.
    """
    # use the frame that was passed in instead of silently reading the global df
    frame = dt
    session_open = (
        frame[DATETIME].dt.tz_localize(LOCALIZE_US_STOCK_MARKET)
        + pd.Timedelta(9.5, unit="h")
    )
    frame["datetime_start"] = session_open
    frame["datetime_end"] = session_open + pd.Timedelta(6.5, unit="h")  # there are 6.5 trading hours

In [None]:
def update_df_1h(dt):
    """Add ``datetime_start`` / ``datetime_end`` columns for the one-hour interval.

    The ``DATETIME`` column is treated as carrying only the date, so the time
    of day is rebuilt by counting rows within each date: the first row of a
    date starts at 09:30 and every following row starts one interval later.
    The 7th row of a date gets the shortened length ``INTERVAL_NUMBER_SHORT``.

    Args:
        dt: the price DataFrame (despite the parameter name). It must contain
            the column named by ``DATETIME`` holding timezone-naive dates.
            The frame is modified in place.

    Returns:
        None.
    """
    # use the frame that was passed in instead of silently reading the global df
    frame = dt
    step = pd.Timedelta(INTERVAL_NUMBER, unit=INTERVAL_UNIT)
    current_day = None
    rows_in_day = 0
    datetime_start = None
    list_datetime_start = []
    list_datetime_end = []
    for stamp in frame[DATETIME]:
        # 08:30 anchor: adding one interval below puts the first row at 09:30
        day_anchor = stamp.tz_localize(LOCALIZE_US_STOCK_MARKET) + pd.Timedelta(8.5, unit="h")
        if day_anchor != current_day:
            # first row of a new date: restart the count from the anchor
            current_day = day_anchor
            rows_in_day = 0
            datetime_start = current_day
        rows_in_day += 1
        # the 7th interval of the day is the short one
        length = INTERVAL_NUMBER_SHORT if rows_in_day == 7 else INTERVAL_NUMBER
        datetime_start = datetime_start + step
        datetime_end = datetime_start + pd.Timedelta(length, unit=INTERVAL_UNIT)
        list_datetime_start.append(datetime_start)
        list_datetime_end.append(datetime_end)
    frame["datetime_start"] = list_datetime_start
    frame["datetime_end"] = list_datetime_end

In [None]:
def update_df_min(df):
    """Add ``datetime_start`` / ``datetime_end`` columns for N-minute intervals.

    The start is copied from the ``DATETIME`` column as-is and the end is one
    interval (``INTERVAL_NUMBER`` ``INTERVAL_UNIT``) later.

    Args:
        df: the price DataFrame containing the column named by ``DATETIME``.
            The frame is modified in place.

    Returns:
        None.
    """
    # column arithmetic instead of dt[i] lookups, which are label-based and
    # only line up with row positions on a default RangeIndex
    starts = df[DATETIME]
    df["datetime_start"] = starts
    df["datetime_end"] = starts + pd.Timedelta(INTERVAL_NUMBER, unit=INTERVAL_UNIT)

In [None]:
# choose the helper matching the sampling interval, then apply it to df
if INTERVAL == "1d":
    add_interval_bounds = update_df_1d
elif INTERVAL == "1h":
    add_interval_bounds = update_df_1h
elif INTERVAL.endswith("m"):
    add_interval_bounds = update_df_min
else:
    raise RuntimeError(f"INTERVAL={INTERVAL} not known!")
add_interval_bounds(df)
df.head()

In [None]:
# inspect the last rows, including the new datetime_start / datetime_end columns
df.tail()

In [None]:
# inspect: end timestamp of the first interval
df.datetime_end[0]

In [None]:
# inspect: time zone attached to the end timestamp of the first interval
df.datetime_end[0].tz

In [None]:
# inspect the full data frame
df

In [None]:
# keep only the close datetime and price
# index the frame by the interval end time and keep the open/close prices and the volume
MY_DATETIME = "datetime_end"
columns_to_keep = [MY_DATETIME, "Open", "Close", "Volume"]
df = df.loc[:, columns_to_keep].set_index(MY_DATETIME)
df

In [None]:
# simple moving averages of the close price, one column per window length
for sma_column, sma_window in (
    ("SMA_S", SHORT_WINDOW),
    ("SMA_M", MEDIUM_WINDOW),
    ("SMA_L", LONG_WINDOW),
):
    df[sma_column] = df["Close"].rolling(window=sma_window).mean()
df

In [None]:
# let's create the signal columns, initially with all values at 0
# these columns inform us when to buy or to sell
# the rule of thumb is that when the short term SMA goes above
# the long term SMA the stock will continue to grow further (bullish sign)
# so we should buy; the opposite if it goes below
# Signal_S_L compares short vs long, Signal_S_M compares short vs medium
df["Signal_S_L"] = 0.0
df["Signal_S_M"] = 0.0
df

In [None]:
# mark with 1.0 the rows where the short SMA is above the long (resp. medium) SMA;
# all other rows keep the initial 0.0
short_above_long = df["SMA_S"] > df["SMA_L"]
short_above_medium = df["SMA_S"] > df["SMA_M"]
df.loc[short_above_long, "Signal_S_L"] = 1.0
df.loc[short_above_medium, "Signal_S_M"] = 1.0
df

In [None]:
# number of rows per signal value (1.0 where SMA_S > SMA_L, 0.0 elsewhere)
df.Signal_S_L.value_counts()

In [None]:
# number of rows per signal value (1.0 where SMA_S > SMA_M, 0.0 elsewhere)
df.Signal_S_M.value_counts()

In [None]:
# another syntax possible: build the same short-vs-medium signal in one step
# with np.where (1.0 where SMA_S > SMA_M, 0.0 elsewhere), stored in a separate
# column so it can be compared with Signal_S_M
df["Signal_S_M_2"] = np.where(df.SMA_S > df.SMA_M, 1.0, 0.0)
df

In [None]:
# counts for the np.where variant; they should match the Signal_S_M counts
df.Signal_S_M_2.value_counts()

In [None]:
# a position should be taken whenever a signal changes value:
# the row-to-row difference is +1.0 when the signal switches on and -1.0 when it switches off
for sma_pair in ("S_L", "S_M"):
    df[f"Action_{sma_pair}"] = df[f"Signal_{sma_pair}"].diff()
df

In [None]:
# Static matplotlib chart with:
#   - the price at close
#   - the three moving averages
#   - markers at the points where the rule says to buy or to sell
# my note: the predictor is not very good, as it lags the big moves,
# but at least the predictor Action_S_M is better than Action_S_L
# marker format strings:
# https://matplotlib.org/3.2.1/api/_as_gen/matplotlib.pyplot.plot.html
# https://www.w3schools.com/python/matplotlib_markers.asp
fig, ax = plt.subplots(1, 1, figsize=(12, 9))

# each curve is labelled with the name of its column
for curve_column, curve_color in (
    ("Close", "lightgray"),
    ("SMA_S", "skyblue"),
    ("SMA_M", "dodgerblue"),
    ("SMA_L", "darkblue"),
):
    ax.plot(df[curve_column], color=curve_color, label=curve_column)

# choose which crossing is drawn; df_buy / df_sell keep the rows of the last one drawn
SHOW_S_L_ACTIONS = False
SHOW_S_M_ACTIONS = True
if SHOW_S_L_ACTIONS:
    df_buy = df[df["Action_S_L"] == 1.0]
    ax.plot(df_buy.index, df_buy.Close, "Dg", label="S_L Buy signal")
    df_sell = df[df["Action_S_L"] == -1.0]
    ax.plot(df_sell.index, df_sell.Close, "Dr", label="S_L Sell signal")
if SHOW_S_M_ACTIONS:
    df_buy = df[df["Action_S_M"] == 1.0]
    ax.plot(df_buy.index, df_buy.Close, "og", label="S_M Buy signal")
    df_sell = df[df["Action_S_M"] == -1.0]
    ax.plot(df_sell.index, df_sell.Close, "or", label="S_M Sell signal")

ax.legend()
ax.set_title(f"Stock price for {STOCK_TICKER} ({STOCK_NAME})", fontsize=18)
# other useful tick formats: "%H:%M:%S" (intraday) or "%m-%d"
date_form = DateFormatter("%Y-%m-%d")
ax.xaxis.set_major_formatter(date_form)
ax.tick_params(axis="x", labelsize=18, labelrotation=90)
ax.set_ylabel("Stock price [USD]", fontsize=18)
ax.set_xlim(DATE_INITIAL_PLOT, DATE_FINAL_PLOT)
# ax.set_ylim(23.78, 35.82)

In [None]:
# Create the data frame with news.
# The news file holds one record per three non-empty lines:
#   1. the date of the news (the time 09:30:00 is appended below)
#   2. a short text
#   3. a long text
# Blank lines are ignored and an incomplete trailing record is dropped.
# Each news item is placed at the close price of the row whose timestamp is
# closest to the news datetime.
input_news_file_name = f"{INPUT_NEWS_FOLDER_NAME}/{STOCK_TICKER}.txt"
NEWS_COLUMNS = ["datetime_end", "text_short", "text_long", "stock_price"]
list_dict_news = []
try:
    # the context manager closes the file and cannot fail on an unopened handle
    with open(input_news_file_name) as f:
        news_lines = [line.rstrip() for line in f]
except IOError:
    # best effort: keep going with an empty news table
    print(f"File {input_news_file_name} not accessible.")
else:
    news_lines = [line for line in news_lines if line != ""]
    # walk the lines three at a time: date, short text, long text
    for string_datetime, text_short, text_long in zip(
        news_lines[0::3], news_lines[1::3], news_lines[2::3]
    ):
        datetime_end = pd.to_datetime(string_datetime + " 09:30:00").tz_localize(LOCALIZE_US_STOCK_MARKET)
        # find the numerical index in the df of the row whose datetime index is the closest to a given datetime
        i = np.argmin(np.abs(df.index - datetime_end))
        # .iloc because i is a row position, not an index label
        value_news = df.Close.iloc[i]
        list_dict_news.append(
            {
                "datetime_end": datetime_end,
                "text_short": text_short,
                "text_long": text_long,
                "stock_price": value_news,
            }
        )
# explicit columns keep set_index working when there is no news at all
df_news = pd.DataFrame(list_dict_news, columns=NEWS_COLUMNS).set_index("datetime_end")
df_news

In [None]:
# df_news.datetime_end[0].tz

In [None]:
# inspect: lower x-axis limit used for the static plot
DATE_INITIAL_PLOT

In [None]:
# inspect: first timestamp of the price index
df.index[0]

In [None]:
# inspect: lowest close price
np.min(df.Close.values)

In [None]:
# inspect: largest traded volume
np.max(df.Volume.values)

In [None]:
# inspect: ratio of 1.1 x the largest volume to the lowest close price
1.1 * np.max(df.Volume.values) / np.min(df.Close.values)

In [None]:
# rescaled copy of the volume: with this factor the largest volume maps to
# the lowest close price divided by 1.1
volume_scale = 1.1 * np.max(df.Volume.values) / np.min(df.Close.values)
df["Volume2"] = df["Volume"] / volume_scale

In [None]:
# Interactive price chart built with hvplot.
# https://hvplot.holoviz.org/user_guide/Customization.html
# https://coderzcolumn.com/tutorials/data-science/how-to-convert-static-pandas-plot-matplotlib-to-interactive-hvplot#2

# options shared by the price layers
# (optional extras: xlim = (DATE_INITIAL_PLOT, DATE_FINAL_PLOT), ylim = (low, high),
#  hover_cols = ["datetime_end"])
price_axes = dict(ylabel="Stock price [USD]", xlabel="Date", grid=True)

# close price as a line
security_close = df["Close"].hvplot(
    line_color="lightgray",
    width=800,
    height=350,
    **price_axes,
)

# close and open prices as dots (available for the overlay, not used by default)
security_close_dots = df["Close"].hvplot.scatter(
    line_color="darkgray",
    fill_color="darkgray",
    width=800,
    height=400,
    **price_axes,
)
security_open_dots = df["Open"].hvplot.scatter(
    line_color="orange",
    fill_color="orange",
    width=800,
    height=400,
    **price_axes,
)

# moving averages, without hover information
SMA_S, SMA_M, SMA_L = (
    df[sma_name].hvplot(line_color=sma_color, hover=False)
    for sma_name, sma_color in (
        ("SMA_S", "skyblue"),
        ("SMA_M", "dodgerblue"),
        ("SMA_L", "darkblue"),
    )
)

# sell / buy actions (rows selected in the static-plot cell)
sell = df_sell["Close"].hvplot.scatter(color="red", legend=False)
buy = df_buy["Close"].hvplot.scatter(color="green", legend=False)

# news items; the short text is shown on hover
news = df_news.hvplot.scatter(
    x="datetime_end",
    y="stock_price",
    color="violet",
    hover_cols=["text_short"],
    legend=True,
)

# overlay the layers; other layers can be multiplied in the same way, e.g.
# security_close * security_close_dots * security_open_dots * SMA_S * SMA_M * SMA_L * sell * buy * news
final_plot = security_close * SMA_S * SMA_M * SMA_L
if DO_PLOT_NEWS:
    final_plot = final_plot * news

In [None]:
# volume as orange bars; both variants share the same colours
orange_bars = dict(line_color="orange", fill_color="orange")

# variant 1: columns named explicitly, default size
volume = df.hvplot.bar(x="datetime_end", y="Volume", **orange_bars)

# variant 2: plotted from the Volume column, with a fixed size
volume2 = df["Volume"].hvplot.bar(width=850, height=200, **orange_bars)

# the second variant is the one shown
final_plot_2 = volume2

In [None]:
# show the price overlay: x axis at the bottom, custom title, legend hidden
final_plot.opts(xaxis = "bottom", title = f"Stock price of {STOCK_TICKER} ({STOCK_NAME})", show_legend = False) 

In [None]:
# show the volume bars with both axes hidden and a custom title
final_plot_2.opts(xaxis = None, yaxis = None, title = f"Stock volume of {STOCK_TICKER} ({STOCK_NAME})") 