In [1]:
import json
from io import StringIO
from time import sleep

import pandas as pd
import polars as pl
from bs4 import BeautifulSoup
from numpy import float16

In [2]:
# Ticker symbol of the sample page; the notebook opens "<TEST_TICKER>.html"
# relative to the working directory.
TEST_TICKER = "NOK"
# NOTE(review): not referenced by any of the visible cells — confirm it is
# still needed.
DEBUG_STATE = True

In [3]:
# Read the saved HTML page for the test ticker into memory as one string.
with open(f"{TEST_TICKER}.html") as page_file:
    page_text = page_file.read()

In [64]:
def parse_html_to_df(page_text: str) -> pd.DataFrame:
    """Extract the page's data tables into a single two-column DataFrame.

    The tables are looked up inside the ``<div>`` whose ``ng-bind-html``
    attribute equals ``$ctrl.dataTableHtml``; every ``<table class="table">``
    found there is parsed and the results are stacked in document order.

    Args:
        page_text: Full HTML source of the page.

    Returns:
        A DataFrame with the columns ``date`` and ``value`` and a fresh
        ``0..n-1`` index. Cell values are left exactly as ``pd.read_html``
        produced them; no date or numeric conversion happens here.

    Raises:
        ValueError: If the wrapper div is missing, if it contains no matching
            table, or if the parsed tables do not have exactly two columns.
    """
    soup = BeautifulSoup(page_text, "lxml")
    container = soup.find("div", {"ng-bind-html": "$ctrl.dataTableHtml"})
    if container is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'no <div ng-bind-html="$ctrl.dataTableHtml"> element found in page'
        )

    tables = container.find_all("table", {"class": "table"})  # type: ignore
    if not tables:
        raise ValueError('no <table class="table"> found inside the data div')

    # Join the tables' markup instead of relying on the list repr of the
    # result set, and wrap it in StringIO: passing a literal HTML string to
    # read_html is deprecated in recent pandas releases.
    tables_html = "".join(str(table) for table in tables)
    parsed_tables = pd.read_html(StringIO(tables_html), flavor="lxml")

    # ignore_index avoids duplicate row labels when several tables are stacked.
    df = pd.concat(parsed_tables, ignore_index=True)
    df.columns = ("date", "value")
    return df


def parse_html_to_pd(page_text: str) -> pd.DataFrame:
    """Parse the page's data tables into a typed pandas DataFrame.

    Args:
        page_text: Full HTML source of the page.

    Returns:
        The frame produced by ``parse_html_to_df`` with ``date`` parsed from
        the ``"%B %d, %Y"`` format and snapped to the first day of its month,
        and ``value`` converted to ``float64`` after stripping ``%`` from
        both ends of the text.
    """
    df = parse_html_to_df(page_text)
    parsed_dates = pd.to_datetime(df["date"], format="%B %d, %Y")
    # Period("M") -> timestamp normalises every date to the start of its month.
    df["date"] = parsed_dates.dt.to_period("M").dt.to_timestamp()
    # float64 instead of float16: half precision cannot hold these values
    # (e.g. -39.49 would be stored as -39.5), and float64 matches the Float64
    # column produced by parse_html_to_pl.
    df["value"] = df["value"].str.strip("%").astype("float64")
    return df


def parse_html_to_pl(page_text: str) -> pl.DataFrame:
    """Parse the page's data tables into a typed polars DataFrame.

    Args:
        page_text: Full HTML source of the page.

    Returns:
        A polars frame whose ``date`` column holds the parsed dates snapped to
        the first day of their month and whose ``value`` column is ``Float64``
        with the first ``%`` removed from each entry.
    """
    frame = parse_html_to_df(page_text)
    # Date handling is done in pandas before handing the frame to polars.
    month_start = (
        pd.to_datetime(frame["date"], format="%B %d, %Y")
        .dt.to_period("M")
        .dt.to_timestamp()
    )
    frame["date"] = month_start

    value_as_float = pl.col("value").str.replace("%", "").cast(pl.Float64)
    return pl.from_pandas(frame).with_columns(value_as_float.alias("value"))


In [68]:
# Untyped table exactly as scraped from the page.
df = parse_html_to_df(page_text)
# Typed polars frame in ascending date order.
pldf = parse_html_to_pl(page_text).sort("date")

In [85]:
# Average "value" over consecutive 3-month windows of the "date" column.
# NOTE(review): newer polars releases name this method `group_by_dynamic` —
# confirm against the installed version before upgrading.
pldf.groupby_dynamic(index_column="date", every="3mo").agg(
    pl.col("value").mean()
)


date,value
datetime[ns],f64
2010-07-01 00:00:00,5.59
2010-10-01 00:00:00,4.92
2011-01-01 00:00:00,4.8
2011-04-01 00:00:00,4.6
2011-07-01 00:00:00,3.9
2011-10-01 00:00:00,-0.05
2012-01-01 00:00:00,-12.75
2012-04-01 00:00:00,-24.92
2012-07-01 00:00:00,-39.49
2012-10-01 00:00:00,2.9


In [70]:
# Display the date-sorted polars frame.
pldf

date,value
datetime[ns],f64
2010-09-01 00:00:00,5.59
2010-12-01 00:00:00,4.92
2011-03-01 00:00:00,4.8
2011-06-01 00:00:00,4.6
2011-09-01 00:00:00,3.9
2011-12-01 00:00:00,-0.05
2012-03-01 00:00:00,-12.75
2012-06-01 00:00:00,-24.92
2012-09-01 00:00:00,-39.49
2012-12-01 00:00:00,2.9


In [6]:
from json import load
import os

# Load the ticker symbols stored as JSON.
with open("./ch_data/ticker_list.json") as ticker_file:
    ticker_list = load(ticker_file)

# Base names (extension removed) of every entry in the opm_parqs directory.
filenames = [os.path.splitext(entry)[0] for entry in os.listdir("opm_parqs")]


In [8]:
# Position within ticker_list of the last name collected from opm_parqs.
# NOTE(review): os.listdir returns entries in arbitrary order, so
# filenames[-1] is not guaranteed to be the most recently written file —
# confirm this is the intended "last" entry. Raises IndexError if the
# directory is empty and ValueError if the name is not in ticker_list.
ticker_list.index(filenames[-1])

16770

In [6]:
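# Disabled one-off snippet, kept for reference: it extracted the ticker
# symbols from the saved symbols table and wrote ./ch_data/ticker_list.json.
# with open("./ch_data/symbols_table.html") as fd: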
#     symbols_html = BeautifulSoup(fd)

# with open("./ch_data/names_table.html") as fd:
#     names_html = BeautifulSoup(fd)
# syms = symbols_html.find_all("a", {"target": "_blank"})
# ych_ticker_list = [s.get_text() for s in syms]
# len(ych_ticker_list)
# with open("./ch_data/ticker_list.json", "w") as fj:
#     json.dump(ych_ticker_list, fj)
