# Download the text of 10-K sections and perform simple natural language processing on them

In [None]:
%pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" tqdm nltk py-readability-metrics sentence-transformers
# Also install PyTorch by following the instructions at https://pytorch.org/

In [None]:
import calcbench as cb
import nltk
import torch
from nltk import word_tokenize
from readability import (
    Readability,
)  # https://github.com/cdimascio/py-readability-metrics
from readability.exceptions import ReadabilityException
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases load
# it from the "punkt" package, newer ones from "punkt_tab", so request both;
# with its default arguments nltk.download reports a package it cannot find
# instead of raising.
nltk.download("punkt")
nltk.download("punkt_tab")

# Retry failed Calcbench requests, but give up immediately on a 404.
# An error raised before any response arrived (e.g. a dropped connection) has
# no usable response object, so look the status code up defensively rather
# than letting the predicate itself raise AttributeError.
cb.enable_backoff(
    giveup=lambda e: getattr(getattr(e, "response", None), "status_code", None) == 404
)

# Registers the progress_* pandas methods (progress_apply, progress_applymap, ...).
tqdm.pandas(desc="progress")

In [None]:
# Ticker symbols that Calcbench returns for the "DJIA" index.
tickers = cb.tickers(index="DJIA")

In [None]:
# Download the requested disclosures into a DataFrame, reporting progress
# through a notebook progress bar that is closed when the block exits.
with tqdm_notebook() as progress_bar:
    data = cb.disclosure_dataframe(
        company_identifiers=tickers[:2],  # only the first two tickers are requested
        all_history=True,
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        use_fiscal_period=True,
        progress_bar=progress_bar,
        period_type=cb.api_query_params.PeriodType.Annual
    )

In [None]:
def get_contents(d):
    """Fetch the plain text of a disclosure, tolerating failures.

    Args:
        d: Object exposing a ``get_contents_text()`` method.

    Returns:
        Whatever ``d.get_contents_text()`` returns, or ``None`` when that call
        raises; in that case the object and the error are printed.
    """
    text = None
    try:
        text = d.get_contents_text()
    except Exception as e:
        # Best effort: one failing disclosure must not abort the whole run.
        print(f"Exception getting {d} \n{e}")
    return text


# Replace each cell of `data` with the result of get_contents, showing a
# progress bar; missing cells are skipped (na_action="ignore").
d = data.progress_applymap(get_contents, na_action="ignore")

In [None]:
def flesch_kincaid(text):
    """Return the Flesch-Kincaid score of *text*.

    Args:
        text: The document text to score.

    Returns:
        The ``score`` attribute of the Flesch-Kincaid result, or ``None`` when
        the readability library raises ``ReadabilityException`` (presumably
        because the text cannot be scored, e.g. it is too short).
    """
    try:
        grade = Readability(text).flesch_kincaid().score
    except ReadabilityException:
        grade = None
    return grade


# Score every non-missing document text in `d`; missing cells are skipped.
readability = d.progress_applymap(flesch_kincaid, na_action="ignore")

In [None]:
# Save the readability scores as an Excel workbook (path is relative to the
# current working directory).
readability.to_excel(r"readability.xlsx")

In [None]:
# Count the NLTK word tokens in every non-missing document text in `d`;
# missing cells are skipped.
word_counts = d.progress_applymap(
    lambda text: len(word_tokenize(text)), na_action="ignore"
)

In [None]:
# Save the word counts as an Excel workbook (path is relative to the current
# working directory).
word_counts.to_excel(r"word_counts.xlsx")

In [None]:
# Sentence-embedding model, loaded by its pretrained-model name.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
# NOTE: despite the variable name, this module computes cosine *similarity*,
# not a distance.
tensor_cosine_distance = torch.nn.CosineSimilarity()


def cosine_difference(documents):
    """Compare each document with the one immediately before it.

    Args:
        documents: pandas Series of document texts, in the order in which they
            should be compared. Missing entries are embedded as empty strings.

    Returns:
        Torch tensor with one entry per document, where entry ``i`` is the
        cosine *similarity* between the embeddings of documents ``i`` and
        ``i - 1``. Entry 0 has no predecessor and is NaN.
    """
    # Hand the model a plain list of strings so that it never indexes into the
    # Series by label.
    texts = documents.fillna("").tolist()
    embeddings = model.encode(texts, convert_to_tensor=True)
    # roll(1, 0) shifts the rows down by one, lining row i - 1 up with row i.
    # It also wraps the last row round to position 0, so the first pairing
    # (first document vs. last document) is not a consecutive comparison and
    # is blanked out below.
    similarities = tensor_cosine_distance(embeddings, embeddings.roll(1, 0))
    if similarities.numel() > 0:
        similarities[0] = float("nan")
    return similarities


# Run cosine_difference through pandas' apply with a progress bar; with the
# default axis the function presumably receives one column of `d` at a time.
diffs = d.progress_apply(cosine_difference)

In [None]:
# Save the consecutive-document comparison results as an Excel workbook (path
# is relative to the current working directory).
diffs.to_excel(r"cosine_diffs.xlsx")