In [2]:
import os
import re
import warnings
import unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

warnings.simplefilter("ignore")


def write_content(content, output_path):
    with open(output_path, "w", encoding="utf-8") as fout:
        fout.write(content)


def normalize(soup):
    ix = soup.find("ix:header")
    if ix != None:
        ix.decompose()
    for s in soup.find_all("table"):
        if not s.find(
            text=re.compile(
                "item\s*7|Management[’']s\s*Discussion\s*And\s*Analysis\s*(of\s*(Financial\s*Condition\s*and\s*Results\s*of\s*Operations|Results\s*of\s*Operations?\s*and\s*Financial\s*Condition))",
                re.I,
            )
        ):
            s.decompose()
    text = soup.get_text("\n")
    text = unicodedata.normalize("NFKD", text)
    text = "\n".join(text.splitlines())
    text = text.replace("\xa0", " ")
    text = text.replace("&nbsp;", " ")
    return text


def clear(text):
    text = re.sub(re.compile("\s*\n", re.I | re.M), "\n", text)
    clear_pattern = [
        "^\s*Table\s*of\s*Contents?\s*$",
        "^\s*\d+\s*$",
        "^\s*[-]+\s*$",
        "^\s*[^\n]*\|?[^\S\n]*\d{4}[^\S\n]*Form[^\S\n]*10-K[^\S\n]*\|?[^\S\n]*\d*\s*$", "^\s*Bank\s*of\s*America\s*[^\S\n]*\d*\s*$"
    ]
    for x in clear_pattern:
        text = re.sub(re.compile(x, re.I | re.M), "", text)
    text = re.sub("\n+|\s+", " ", text, flags=re.M)
    text = text.lower()
    return text


# 公司股票代碼_年份
def find_mda(text, pattern, pattern2, minlen):
    start = list(re.finditer(pattern, text, re.I | re.M))[-1].end()
    end = list(re.finditer(pattern2, text, re.I | re.M))[-1].start()
    if end - start <= minlen and 0 < end - start < 50:
        start = list(re.finditer(pattern, text, re.I | re.M))[0].end()
        end = list(re.finditer(pattern2, text, re.I | re.M))[0].start()
    else:
        for i in range(len(list(re.finditer(pattern, text, re.I | re.M)))):
            j = i
            start = list(re.finditer(pattern, text, re.I | re.M))[i].end()
            if j >= len(list(re.finditer(pattern2, text, re.I | re.M))):
                j = len(list(re.finditer(pattern2, text, re.I | re.M)))-1
            end = list(re.finditer(pattern2, text, re.I | re.M))[j].start()
            if end - start > minlen:
                break
    if end - start > minlen:
        text = clear(text[start:end])
        return text
    else:
        return None


htm_folders = os.listdir("htm/")
if not os.path.isdir("pa/"):
    os.makedirs("pa/")
if not os.path.isdir("mda/"):
    os.makedirs("mda/")

In [43]:
error = ""
error_list = []
htm_files = os.listdir("htm/")
loop = tqdm(range(len(htm_files)))
# loop = tqdm(range(10))
for i in loop:
    try:
        file = htm_files[i]
        htm_path = "htm/" + file
        pa_path = "pa/" + file[:-3] + "txt"
        mda_path = "mda/" + file[:-3] + "txt"
        loop.set_description(file)
        with open(htm_path, "r") as fin:
            content = fin.read()
        soup = BeautifulSoup(content, "html.parser")
        text = normalize(soup)
        write_content(text, pa_path)
        pattern = r"^[^\S\n]*item\s*7[\.:—]\s*Management[’']\s*s\s*Discussion\s*And\s*Analysis\s*(of\s*(Financial\s*Condition\s*and\s*Results\s*of\s*Operations|Results\s*of\s*Operations?\s*and\s*Financial\s*Condition))?.?(\s*\(MD&A\))?[^\S\n]*$"
        pattern2 = r"^[^\S\n]*item\s*7A[\.:—]\s*(Disclosures?\s*About\s*Market\s*Risk|(Quantitative\s*And\s*Qualitative|Qualitative\s*And\s*Quantitative)\s*Disclosures?\s*About\s*(Market\s*)?Risk).?[^\S\n]*$"
        if re.finditer(pattern, text, re.I | re.M) and re.finditer(pattern2, text, re.I | re.M):
            mda_text = find_mda(text, pattern, pattern2, 500)
        if not mda_text:
            pattern = r"^[^\S\n]*Management[’']\s*s\s*Discussion\s*And\s*Analysis\s*(of\s*(Financial\s*Condition\s*and\s*Results\s*of\s*Operations|Results\s*of\s*Operations?\s*and\s*Financial\s*Condition))?.?(\s*\(MD&A\))?[^\S\n]*$"
            pattern2 = r"^[^\S\n]*(Disclosures?\s*About\s*Market\s*Risk|(Quantitative\s*And\s*Qualitative|Qualitative\s*And\s*Quantitative)\s*Disclosures?\s*About\s*(Market\s*)?Risk).?[^\S\n]*$"
            if re.finditer(pattern, text, re.I | re.M) and re.finditer(pattern2, text, re.I | re.M):
                mda_text = find_mda(text, pattern, pattern2, 500)
        if mda_text:
            write_content(mda_text, mda_path)
    except Exception as e:
        error_list.append(file)
        error += f"❌{file}\n{e}\n"
        continue

print(error, error_list)

AAPL_2022.htm: 100%|██████████| 847/847 [15:14<00:00,  1.08s/it] 

❌TXN_2013.htm
list index out of range
❌IBM_2018.htm
list index out of range
❌IBM_2020.htm
list index out of range
❌IBM_2015.htm
list index out of range
❌IBM_2011.htm
list index out of range
❌MCD_2023.htm
list index out of range
❌MA_2018.htm
list index out of range
❌WFC_2014.htm
list index out of range
❌GS_2012.htm
list index out of range
❌WFC_2015.htm
list index out of range
❌CRM_2023.htm
list index out of range
❌CRM_2021.htm
list index out of range
❌CRM_2020.htm
list index out of range
❌CRM_2024.htm
list index out of range
❌MS_2018.htm
list index out of range
❌WFC_2017.htm
list index out of range
❌CRM_2022.htm
list index out of range
❌MA_2017.htm
list index out of range
❌IBM_2014.htm
list index out of range
❌IBM_2021.htm
list index out of range
❌WFC_2016.htm
list index out of range
❌MCD_2022.htm
list index out of range
❌MS_2021.htm
list index out of range
❌IBM_2023.htm
list index out of range
❌MS_2023.htm
list index out of range
❌IBM_2019.htm
list index out of range
❌JNJ_2011.htm
list




In [34]:
# 單一檔案 股票代碼_年份
for file in error_list:
    htm_path = "htm/" + file
    pa_path = "test/pa/" + file[:-3] + "txt"
    mda_path = "test/mda/" + file[:-3] + "txt"
    print(file)
    with open(htm_path, "r") as fin:
        content = fin.read()
    soup = BeautifulSoup(content, "html.parser")
    text = normalize(soup)
    write_content(text, pa_path)
    pattern = r"^[^\S\n]*item\s*7[\.:—]\s*Management[’']\s*s\s*Discussion\s*And\s*Analysis\s*(of\s*(Financial\s*Condition\s*and\s*Results\s*of\s*Operations|Results\s*of\s*Operations?\s*and\s*Financial\s*Condition))?.?(\s*\(MD&A\))?[^\S\n]*$"
    pattern2 = r"^[^\S\n]*item\s*7A[\.:—]\s*(Disclosures?\s*About\s*Market\s*Risk|(Quantitative\s*And\s*Qualitative|Qualitative\s*And\s*Quantitative)\s*Disclosures?\s*About\s*(Market\s*)?Risk).?[^\S\n]*$"
    if re.finditer(pattern, text, re.I | re.M) and re.finditer(pattern2, text, re.I | re.M):
        mda_text = find_mda(text, pattern, pattern2, 500)
    if not mda_text:
        pattern = r"^[^\S\n]*Management[’']\s*s\s*Discussion\s*And\s*Analysis\s*(of\s*(Financial\s*Condition\s*and\s*Results\s*of\s*Operations|Results\s*of\s*Operations?\s*and\s*Financial\s*Condition))?.?(\s*\(MD&A\))?[^\S\n]*$"
        pattern2 = r"^[^\S\n]*(Disclosures?\s*About\s*Market\s*Risk|(Quantitative\s*And\s*Qualitative|Qualitative\s*And\s*Quantitative)\s*Disclosures?\s*About\s*(Market\s*)?Risk).?[^\S\n]*$"
        if re.finditer(pattern, text, re.I | re.M) and re.finditer(pattern2, text, re.I | re.M):
            mda_text = find_mda(text, pattern, pattern2, 500)
    if mda_text:
        write_content(mda_text, mda_path)
    else:
        print("error")

BRK.B_2022.htm
error
HD_2016.htm
NOW_2015.htm
WMT_2012.htm
error
NOW_2019.htm
PG_2017.htm
VZ_2016.htm
error
QCOM_2014.htm
error
PG_2011.htm


In [None]:
# 依年份
error = ""
error_list = []
htm_folders = os.listdir("htm/")
for year in htm_folders:
    htm_files = os.listdir("htm/" + year + "/")
    loop = tqdm(range(len(htm_files)))
    for i in loop:
        try:
            file = htm_files[i]
            htm_path = "htm/" + year + "/" + file
            pa_path = "pa/" + year + "/" + file[:-3] + "txt"
            mda_path = "mda/" + year + "/" + file[:-3] + "txt"
            loop.set_description(year + "/" + file)
            with open(htm_path, "r") as fin:
                content = fin.read()
            soup = BeautifulSoup(content, "html.parser")
            text = normalize(soup)
            write_content(text, pa_path)
            pattern = r"^[^\S\n]*item\s*7[\.:—]\s*Management[’']\s*s\s*Discussion\s*And\s*Analysis\s*(of\s*(Financial\s*Condition\s*and\s*Results\s*of\s*Operations|Results\s*of\s*Operations?\s*and\s*Financial\s*Condition))?.?(\s*\(MD&A\))?[^\S\n]*$"
            pattern2 = r"^[^\S\n]*item\s*7A[\.:—]\s*(Disclosures?\s*About\s*Market\s*Risk|(Quantitative\s*And\s*Qualitative|Qualitative\s*And\s*Quantitative)\s*Disclosures?\s*About\s*(Market\s*)?Risk).?[^\S\n]*$"
            if re.finditer(pattern, text, re.I | re.M) and re.finditer(pattern2, text, re.I | re.M):
                mda_text = find_mda(text, pattern, pattern2, 500)
            if not mda_text:
                pattern = r"^[^\S\n]*Management[’']\s*s\s*Discussion\s*And\s*Analysis\s*(of\s*(Financial\s*Condition\s*and\s*Results\s*of\s*Operations|Results\s*of\s*Operations?\s*and\s*Financial\s*Condition))?.?(\s*\(MD&A\))?[^\S\n]*$"
                pattern2 = r"^[^\S\n]*(Disclosures?\s*About\s*Market\s*Risk|(Quantitative\s*And\s*Qualitative|Qualitative\s*And\s*Quantitative)\s*Disclosures?\s*About\s*(Market\s*)?Risk).?[^\S\n]*$"
                if re.finditer(pattern, text, re.I | re.M) and re.finditer(pattern2, text, re.I | re.M):
                    mda_text = find_mda(text, pattern, pattern2, 500)
            if mda_text:
                write_content(mda_text, mda_path)
        except Exception as e:
            error_list.append(file)
            error += f"❌{file}\n{e}\n"
            continue

print(error, error_list)

2017/C.htm: 100%|██████████| 469/469 [11:16<00:00,  1.44s/it]    
2014/C.htm: 100%|██████████| 445/445 [10:21<00:00,  1.40s/it]    
2015/C.htm: 100%|██████████| 457/457 [10:33<00:00,  1.39s/it]    
2016/C.htm: 100%|██████████| 463/463 [10:58<00:00,  1.42s/it]    
2020/C.htm: 100%|██████████| 482/482 [06:00<00:00,  1.34it/s]    
2024/TPR.htm: 100%|██████████| 101/101 [00:48<00:00,  2.10it/s]
2022/C.htm: 100%|██████████| 495/495 [05:37<00:00,  1.46it/s]    
2019/C.htm: 100%|██████████| 484/484 [08:25<00:00,  1.04s/it]    
2023/C.htm: 100%|██████████| 498/498 [05:38<00:00,  1.47it/s]    
2018/C.htm: 100%|██████████| 476/476 [12:07<00:00,  1.53s/it]    
2021/C.htm: 100%|██████████| 494/494 [05:45<00:00,  1.43it/s]    

❌HUBB.htm
list index out of range
❌JBHT.htm
list index out of range
❌NRG.htm
list index out of range
❌COF.htm
list index out of range
❌TSCO.htm
list index out of range
❌MA.htm
list index out of range
❌GPC.htm
list index out of range
❌DAL.htm
list index out of range
❌URI.htm
list index out of range
❌EXPD.htm
list index out of range
❌NTAP.htm
list index out of range
❌EXPE.htm
list index out of range
❌CLX.htm
list index out of range
❌MS.htm
list index out of range
❌CAH.htm
list index out of range
❌FCX.htm
list index out of range
❌WFC.htm
list index out of range
❌SJM.htm
list index out of range
❌TFC.htm
list index out of range
❌TRGP.htm
list index out of range
❌HES.htm
list index out of range
❌SPGI.htm
list index out of range
❌HAL.htm
list index out of range
❌EMR.htm
list index out of range
❌XEL.htm
list index out of range
❌WDC.htm
list index out of range
❌GIS.htm
list index out of range
❌TXN.htm
list index out of range
❌NTRS.htm
list index out of range
❌PNC.htm
list index out of range
❌EQ




In [36]:
file_count = len(os.listdir("mda/"))
print("file count：", file_count)

file count： 789


In [37]:
import pandas as pd
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/all-mpnet-base-v2")

res = []
mda_files = os.listdir("mda/")
loop = tqdm(range(len(mda_files)))
for i in loop:
    com = {"name": mda_files[i].split("_")[0]}
    file = mda_files[i].split("_")[1]
    loop.set_description(mda_files[i])
    file_path = "mda/" + mda_files[i]
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = tokenizer.encode(text)
    com["file"] = file[:-3] + "txt"
    com["tokens"] = len(tokens)
    res.append(com)
df = pd.DataFrame(res)

NVDA_2022.txt:   0%|          | 0/789 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (7729 > 512). Running this sequence through the model will result in indexing errors
MA_2023.txt: 100%|██████████| 789/789 [00:13<00:00, 58.84it/s]   


In [38]:
df.sort_values("tokens", ascending=False).head(10)

Unnamed: 0,name,file,tokens
712,BAC,2010.txt,118396
101,BAC,2011.txt,118307
310,BAC,2012.txt,115058
56,BAC,2013.txt,107074
360,BAC,2014.txt,95180
638,BAC,2015.txt,88348
541,BAC,2020.txt,81439
224,GE,2020.txt,81087
81,GE,2017.txt,79605
71,GE,2019.txt,79150
