# Reading and analysing data from Yahoo Finance

The helper functions are imported from utils.py to keep this notebook modular.

In [None]:
from utils import *

%load_ext autoreload
%autoreload 2

In [None]:
# logging level: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL
# getLogger() without a name returns the root logger; its threshold is set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
# Text file that is read line by line further down; every line is taken as one
# ticker, except lines starting with "#".
INPUT_FILE_NAME = "/Users/abuzatu/Work/data/finance/stocks/tickers11.txt"
# Folder handed to get_output_file_name() when building the pickle file names.
OUTPUT_FOLDER_NAME = "/Users/abuzatu/Work/data/finance/stocks/processed_data"

In [None]:
# Options forwarded positionally to every read_data() call in this notebook.
# When ADD_OUTSIDE_TRADING_HOURS is True, get_df() also shifts the start and
# end of each segment by 5 hours.
ADD_OUTSIDE_TRADING_HOURS = True
ADD_DIVIDENDS_AND_STOCK_SPLITS = True
AUTO_ADJUST = True
# Second argument given to read_data() inside get_df(); the per-ticker loop
# passes "max" in that position instead.
PERIOD = None

In [None]:
# Default download schedule: (start, end, interval) with dates as "YY-MM-DD".
# Each segment starts where the previous one ends, and the interval gets finer
# towards the most recent dates.
# An older daily segment ("10-01-01", "19-01-01", "1d") can be prepended, or the
# list cut down to its first entry, for a quick test.
LIST_DATE = [
    ("19-01-01", "19-01-04", "1d"),
    ("19-01-04", "20-11-04", "1h"),
    ("20-11-04", "20-11-18", "5m"),
    ("20-11-18", "20-12-04", "2m"),
    ("20-12-04", "20-12-11", "1m"),
    ("20-12-11", "20-12-18", "1m"),
    ("20-12-18", "20-12-25", "1m"),
    ("20-12-25", "21-01-01", "1m"),
]

# Overall range covered by the schedule: start of the first segment and end of
# the last one (index 1 of a (start, end, interval) tuple).
date_start_all = LIST_DATE[0][0]
date_end_all = LIST_DATE[-1][1]

In [None]:
# show the start date of the first LIST_DATE segment
date_start_all

In [None]:
# show the end date of the last LIST_DATE segment
date_end_all

In [None]:
# show which ticker file is about to be read
INPUT_FILE_NAME

In [None]:
# Read the tickers to process: one per line, ignoring blank lines and lines
# that start with "#". If the file cannot be opened the list stays empty.
list_stock_ticker = []
try:
    # The context manager closes the file on every path and, unlike a
    # try/finally around a bare open(), never calls close() on a handle that
    # was not created because open() itself failed.
    with open(INPUT_FILE_NAME) as f:
        for line in f:
            ticker = line.strip()
            # skip empty lines and commented-out tickers
            if not ticker or ticker.startswith("#"):
                continue
            list_stock_ticker.append(ticker)
except OSError:
    print(f"File {INPUT_FILE_NAME} not accessible.")
list_stock_ticker

In [None]:
def get_df(stock_ticker, list_date, output_folder_name):
    """Download one ticker segment by segment and return the combined data.

    For every (start, end, interval) entry of list_date the data is fetched
    with read_data(). Each non-empty segment is pickled to the path returned
    by get_output_file_name() and collected; the collected segments are then
    concatenated row-wise.

    Args:
        stock_ticker: ticker symbol passed to read_data().
        list_date: iterable of (start, end, interval) tuples, with the dates
            written as "YY-MM-DD" (the century prefix "20" is added here).
        output_folder_name: folder passed to get_output_file_name().

    Returns:
        The row-wise concatenation of all non-empty segments, or an empty
        DataFrame when list_date is empty or no segment returned any rows.
    """
    list_df = []
    for s, e, interval in list_date:
        date_start = pd.to_datetime(f"20{s} 00:00:00").tz_localize(LOCALIZE_US_STOCK_MARKET)
        date_end = pd.to_datetime(f"20{e} 00:00:00").tz_localize(LOCALIZE_US_STOCK_MARKET)
        logger.info(f"{stock_ticker} from {date_start} to {date_end} with interval {interval}")
        # fix a bug in yfinance of not applying the localization when this option is on
        if ADD_OUTSIDE_TRADING_HOURS:
            date_start += pd.Timedelta(5, "h")
            date_end += pd.Timedelta(5, "h")

        # read the data
        df = read_data(
            stock_ticker,
            PERIOD,
            date_start,
            date_end,
            interval,
            ADD_OUTSIDE_TRADING_HOURS,
            ADD_DIVIDENDS_AND_STOCK_SPLITS,
            AUTO_ADJUST,
        )

        logger.info(f"len = {len(df)}")
        if len(df) == 0:
            # nothing to adjust, collect or save for this segment
            continue

        if stock_ticker == "AMRH" and interval.endswith(("h", "m")):
            # adjust by the stock split of 4 stocks -> 1 stock
            # (the return value is ignored, so apply_split presumably
            # modifies df in place -- confirm in utils.py)
            apply_split(df, 4, 1)
        # add to list
        list_df.append(df)
        # save for future
        output_file_name = get_output_file_name(output_folder_name, s, e, interval, stock_ticker)
        df.to_pickle(output_file_name)

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # instead when there is nothing to combine
    if not list_df:
        return pd.DataFrame()
    return pd.concat(list_df, axis=0)

In [None]:
# Download every ticker of list_stock_ticker and pickle its combined data.

# Schedule used when the ticker already has data before 2019-01-01.
_LIST_DATE_BEFORE_2019 = [
    ("19-01-01", "19-01-05", "1d"),
    ("19-01-05", "20-11-05", "1h"),
    ("20-11-05", "20-11-18", "5m"),
    ("20-11-18", "20-12-05", "2m"),
    ("20-12-05", "20-12-11", "1m"),
    ("20-12-11", "20-12-18", "1m"),
    ("20-12-18", "20-12-25", "1m"),
    ("20-12-25", "21-01-01", "1m"),
]

# (segment end, interval) pairs, oldest first, for tickers that start later.
# A ticker whose first bar is before a given end starts its first segment at
# its own first date with that pair, then follows the remaining pairs, each
# segment starting where the previous one ended.
_SEGMENT_ENDS = [
    ("19-01-04", "1d"),
    ("20-11-04", "1h"),
    ("20-11-18", "5m"),
    ("20-12-04", "2m"),
    ("20-12-11", "1m"),
    ("20-12-18", "1m"),
    ("20-12-25", "1m"),
    ("21-01-01", "1m"),
]

# Hand-written schedules that replace the automatic one for specific tickers.
_LIST_DATE_OVERRIDES = {
    "LAZR": [
        ("19-03-25", "20-12-04", "1d"),
        ("20-12-04", "20-12-11", "1m"),
        ("20-12-11", "20-12-18", "1m"),
        ("20-12-18", "20-12-25", "1m"),
        ("20-12-25", "21-01-01", "1m"),
    ],
    "QS": [
        ("20-08-17", "20-12-04", "1d"),
        ("20-12-04", "20-12-11", "1m"),
        ("20-12-11", "20-12-18", "1m"),
        ("20-12-18", "20-12-25", "1m"),
        ("20-12-25", "21-01-01", "1m"),
    ],
    "GOEV": [
        ("19-04-16", "20-12-18", "1d"),
        ("20-12-18", "20-12-25", "1m"),
        ("20-12-25", "21-01-01", "1m"),
    ],
    "XL": [
        ("19-04-16", "20-12-18", "1d"),
        ("20-12-18", "20-12-25", "1m"),
        ("20-12-25", "21-01-01", "1m"),
    ],
}


def _build_list_date(stock_ticker, datetime_first, date_short):
    """Return the (start, end, interval) segments to download for one ticker.

    Args:
        stock_ticker: ticker symbol, used to look up hand-written schedules.
        datetime_first: timezone-naive timestamp of the ticker's first bar.
        date_short: the same date formatted as "YY-MM-DD".

    Returns:
        A new list of segments; empty when datetime_first is on or after
        2021-01-01 and the ticker has no hand-written schedule.
    """
    # hand-written schedules win over the automatic one
    if stock_ticker in _LIST_DATE_OVERRIDES:
        return list(_LIST_DATE_OVERRIDES[stock_ticker])
    if stock_ticker == "EOSE":
        return [
            (date_short, "20-11-18", "1d"),
            ("20-11-18", "20-12-04", "2m"),
            ("20-12-04", "20-12-11", "1m"),
            ("20-12-11", "20-12-18", "1m"),
            ("20-12-18", "20-12-25", "1m"),
            ("20-12-25", "21-01-01", "1m"),
        ]

    if datetime_first < pd.to_datetime("2019-01-01"):
        return list(_LIST_DATE_BEFORE_2019)

    for i, (end, interval) in enumerate(_SEGMENT_ENDS):
        if datetime_first < pd.to_datetime(f"20{end}"):
            # first segment starts at the ticker's own first date
            segments = [(date_short, end, interval)]
            # the remaining segments chain from one boundary to the next
            for j in range(i + 1, len(_SEGMENT_ENDS)):
                next_end, next_interval = _SEGMENT_ENDS[j]
                segments.append((_SEGMENT_ENDS[j - 1][0], next_end, next_interval))
            return segments
    return []


# read the file and save to a file
for stock_ticker in list_stock_ticker:
    if False:
        # do only for one ticker
        if stock_ticker != "EOSE":
            continue
    print(f"stock_ticker={stock_ticker}")
    # find automatically the range that we want:
    # we collect the daily data using the period "max", take its first date,
    # then depending on that date build list_date
    df = read_data(
        stock_ticker,
        "max",
        None,
        None,
        "1d",
        ADD_OUTSIDE_TRADING_HOURS,
        ADD_DIVIDENDS_AND_STOCK_SPLITS,
        AUTO_ADJUST,
    )
    if len(df) == 0:
        # df.index[0] below would raise IndexError on an empty result
        logging.warning(f"{stock_ticker}: no daily data returned, skipping")
        continue

    # NOTE: despite its name, datetime_end is the FIRST timestamp of the
    # ticker's history, made timezone-naive so that it can be compared with
    # the naive boundaries of the schedule.
    datetime_end = df.index[0].tz_localize(None)
    date_short = str(datetime_end.date())[2:]
    list_date = _build_list_date(stock_ticker, datetime_end, date_short)

    if False:
        for date in list_date:
            print(date)

    if not list_date:
        # first bar is on or after 2021-01-01: nothing in the covered range
        logging.warning(f"{stock_ticker}: first date {date_short} is outside the covered range, skipping")
        continue

    logging.info(f"{stock_ticker}")
    df = get_df(stock_ticker, list_date, OUTPUT_FOLDER_NAME)
    output_file_name = get_output_file_name(OUTPUT_FOLDER_NAME, date_start_all, date_end_all, "al", stock_ticker)
    df.to_pickle(output_file_name)

In [None]:
# done one for some dates
# Disabled example (change False to True to run it): hourly LAZR data between
# two explicit, localized dates.
if False:
    df = read_data("LAZR",
                   None,
                   pd.to_datetime("2019-03-25").tz_localize(LOCALIZE_US_STOCK_MARKET),
                   pd.to_datetime("2020-11-04").tz_localize(LOCALIZE_US_STOCK_MARKET),
                   "1h",
                   ADD_OUTSIDE_TRADING_HOURS,
                   ADD_DIVIDENDS_AND_STOCK_SPLITS,
                   AUTO_ADJUST)
# while the block above is disabled this shows whatever df was assigned last
df

In [None]:
# done one for the entire period
# Disabled example (change False to True to run it): daily GOEV data with the
# period "max" and no explicit start/end dates.
if False:
    df = read_data("GOEV",
                   "max",
                   None,
                   None,
                   "1d",
                   ADD_OUTSIDE_TRADING_HOURS,
                   ADD_DIVIDENDS_AND_STOCK_SPLITS,
                   AUTO_ADJUST)
# while the block above is disabled this shows whatever df was assigned last
df

In [None]:
# quick static plot of the Close column of the current df
plt.plot(df.Close)

In [None]:
# example of concatenating two files for a stock that changed its ticker name
# Disabled step 1: load the pickled AMRH data into df1.
if False:
    # NOTE(review): interval is assigned but not used; "al" is passed literally below
    interval = "al"
    output_file_name = get_output_file_name(OUTPUT_FOLDER_NAME, "19-01-01", "20-12-31", "al", "AMRH")
    df1 = pd.read_pickle(output_file_name)
    df1

In [None]:
# Disabled step 2: load the pickled ENVB data into df2.
if False:
    # NOTE(review): interval is assigned but not used; "al" is passed literally below
    interval = "al"
    output_file_name = get_output_file_name(OUTPUT_FOLDER_NAME, "20-12-31", "21-01-01", "al", "ENVB")
    df2 = pd.read_pickle(output_file_name)
    df2

In [None]:
# Disabled step 3: stack df1 and df2 row-wise into df.
if False:
    df = pd.concat([df1, df2], axis = 0)
    df

In [None]:
# Disabled step 4: pickle the combined df under the ENVB name.
if False:
    output_file_name = get_output_file_name(OUTPUT_FOLDER_NAME, "19-01-01", "21-01-01", "al", "ENVB")
    df.to_pickle(output_file_name )

In [None]:
# sometimes buggy data with very large value in after-market, remove it
# keeps only rows whose Close is below 100000; rows with a NaN Close are
# dropped as well because the comparison is False for them
df = df[df.Close < 100000]

In [None]:
# get only the pre-market data
# the result is only displayed, not stored
get_df_pre_market(df)

In [None]:
# get only the after-market data
# the result is only displayed, not stored
get_df_after_market(df)

In [None]:
# get only the during-market data
# stored in df2 because add_interval_with_open() is applied to it afterwards
df2 = get_df_during_market(df)
df2

In [None]:
# for the in-market data, add a fictitious interval one minute before that ends on the open value
# so that we can plot the open value as well
df3 = add_interval_with_open(df2)
df3

In [None]:
# interactive price plot of the current df; stock_ticker in the title is
# whatever value the download loop variable held last
final_plot = plot_interactive(df)
final_plot.opts(xaxis = "bottom", title = f"Stock price of {stock_ticker}", show_legend = False)

In [None]:
# interactive volume plot of the current df; stock_ticker in the title is
# whatever value the download loop variable held last
final_plot_volume = plot_interactive_volume(df)
# None is passed as the x axis option; "bottom" is the alternative used for the price plot
xaxis = None # "bottom"
final_plot_volume.opts(xaxis = xaxis, yaxis = None, title = f"Stock volume of {stock_ticker}")