# Download the text of 10-K sections and perform simple natural language processing on them

In [2]:
# Install the Calcbench API client (with its Pandas, Backoff and BeautifulSoup
# extras) together with the progress-bar and NLP libraries used in this notebook.
%pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" tqdm nltk py-readability-metrics sentence-transformers
# Also install PyTorch by following the instructions at https://pytorch.org/

Collecting py-readability-metrics
  Downloading py_readability_metrics-1.4.5-py3-none-any.whl.metadata (8.8 kB)
Downloading py_readability_metrics-1.4.5-py3-none-any.whl (26 kB)
Installing collected packages: py-readability-metrics
Successfully installed py-readability-metrics-1.4.5
Note: you may need to restart the kernel to use updated packages.


In [3]:
import calcbench as cb
import nltk
import torch
from nltk import word_tokenize
from readability import (
    Readability,
)  # https://github.com/cdimascio/py-readability-metrics
from readability.exceptions import ReadabilityException
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook


def _is_not_found(error):
    """Return True when ``error`` carries an HTTP 404 response.

    Used as the ``giveup`` predicate for Calcbench's backoff: a 404 will not
    succeed on retry, so retrying is abandoned. Exceptions that have no
    response attached (``error.response`` missing or ``None``) return False
    instead of raising ``AttributeError`` from inside the predicate.
    """
    response = getattr(error, "response", None)
    return getattr(response, "status_code", None) == 404


# Download the "punkt" tokenizer data for NLTK.
nltk.download("punkt")

# Retry failed Calcbench requests, except when the server answers 404.
cb.enable_backoff(giveup=_is_not_found)

tqdm.pandas(desc="progress")  # adds the progress_applymap function

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Ask Calcbench for the tickers belonging to the "DJIA" index.
tickers = cb.tickers(index="DJIA")

In [5]:
# Fetch the full annual history of the MD&A and Risk Factors disclosures,
# keyed by fiscal period. Only the first two tickers are requested
# (``tickers[:2]``).
with tqdm_notebook() as progress_bar:
    data = cb.disclosure_dataframe(
        # what to fetch
        company_identifiers=tickers[:2],
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        # which periods
        period_type=cb.api_query_params.PeriodType.Annual,
        all_history=True,
        use_fiscal_period=True,
        # progress reporting
        progress_bar=progress_bar,
    )

0it [00:00, ?it/s]

  pandas_period = pd.Period(year=period_year, freq="A")  # type: ignore


In [6]:
def get_contents(d):
    """Return the text of disclosure ``d``, or ``None`` if it cannot be fetched.

    Calls ``d.get_contents_text()``. Any exception raised by that call is
    reported on stdout and swallowed, so that one failing disclosure does not
    abort processing of the remaining ones.
    """
    try:
        text = d.get_contents_text()
    except Exception as error:
        # Best effort: report the failure and fall through with no text.
        print(f"Exception getting {d} \n{error}")
        text = None
    return text


# Replace every disclosure object in ``data`` with its text, showing a progress
# bar; NA cells are skipped (na_action="ignore"). Cells whose download failed
# end up as None because get_contents returns nothing in that case.
d = data.progress_applymap(get_contents, na_action="ignore")

  return getattr(df, df_function)(wrapper, **kwargs)
progress:  94%|███████████████████████████████████████████████████████████████████▊    | 64/68 [00:16<00:01,  4.00it/s]


In [7]:
def flesch_kincaid(text):
    """Return the Flesch-Kincaid score of ``text``, or ``None`` if unavailable.

    The score is computed with py-readability-metrics. When the library
    raises ``ReadabilityException`` the text is treated as unscorable and
    ``None`` is returned instead of propagating the error.
    """
    try:
        score = Readability(text).flesch_kincaid().score
    except ReadabilityException:
        score = None
    return score


# Score every disclosure text with flesch_kincaid; NA cells are skipped
# (na_action="ignore").
readability = d.progress_applymap(flesch_kincaid, na_action="ignore")

progress:  94%|███████████████████████████████████████████████████████████████████▊    | 64/68 [00:18<00:01,  3.52it/s]


In [None]:
# Save the readability scores to an Excel workbook (path is relative to the
# current working directory).
readability.to_excel(r"readability.xlsx")

In [8]:
def count_words(text):
    """Return the number of tokens NLTK's ``word_tokenize`` finds in ``text``."""
    return len(word_tokenize(text))


# Token count of every disclosure text; NA cells are skipped.
word_counts = d.progress_applymap(count_words, na_action="ignore")

progress:  94%|███████████████████████████████████████████████████████████████████▊    | 64/68 [00:02<00:00, 26.51it/s]


In [None]:
# Save the word counts to an Excel workbook (path is relative to the current
# working directory).
word_counts.to_excel(r"word_counts.xlsx")

In [9]:
# Sentence-embedding model used to turn each document into a vector; the
# weights are downloaded on first use.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
# NOTE: despite the variable name, this module computes cosine *similarity*,
# not a distance.
tensor_cosine_distance = torch.nn.CosineSimilarity()


def cosine_difference(documents):
    """Cosine similarity between each document and the one in the row before it.

    Parameters
    ----------
    documents : pandas.Series
        Document texts in row order. Missing entries (NaN/None) are allowed.

    Returns
    -------
    torch.Tensor
        One value per row of ``documents``. Entry ``i`` is the cosine
        similarity between the embeddings of rows ``i`` and ``i - 1``. The
        value is NaN for the first row (it has no preceding document) and for
        every pair in which either document is missing.
    """
    missing = documents.isna()

    # Hand the model a plain list rather than a Series: the encoder indexes its
    # input by integer position, which on a label-indexed Series relies on
    # pandas' deprecated positional fallback.
    texts = documents.fillna("").tolist()
    embeddings = model.encode(texts, convert_to_tensor=True)

    # Row i of the rolled tensor holds the embedding of row i - 1.
    similarities = tensor_cosine_distance(embeddings, embeddings.roll(1, 0))

    # Mask comparisons that carry no meaning:
    #   * a missing document was encoded as "" only as a placeholder;
    #   * roll() wraps around, so row 0 would be compared with the last row.
    invalid = torch.as_tensor(missing.to_numpy(), device=similarities.device)
    invalid = invalid | invalid.roll(1, 0)
    if invalid.numel() > 0:
        invalid[0] = True
    return similarities.masked_fill(invalid, float("nan"))


# Run cosine_difference over ``d`` with a progress bar. With pandas' default
# axis this is presumably once per column, i.e. per company/disclosure series.
diffs = d.progress_apply(cosine_difference)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
progress: 100%|██████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.64s/it]


In [None]:
# Save the period-over-period cosine similarities to an Excel workbook (path is
# relative to the current working directory).
diffs.to_excel(r"cosine_diffs.xlsx")