# Topic and word list from a working paper by Sophie Stone of the Dartmouth College Economics Department



## Semantic textual similarity with Sentence-Transformers: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
# Dependencies
# !pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" sentence-transformers tqdm qgrid ipywidgets 

In [None]:
import warnings

import pandas as pd
import itertools
from tqdm.notebook import tqdm
import qgrid

# https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer, util
import calcbench as cb

# Turn on the calcbench client's back-off support before any API calls are made.
cb.enable_backoff()

# Register tqdm's pandas integration (the `progress_*` methods used below).
tqdm.pandas(desc="progress")

# Sentence-embedding model shared by every cell that encodes text.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

In [None]:
# (category name, comma-separated word list) pairs; each word list is kept as
# one string because it is later encoded as a single text.
categories = [
    ("environmental", "biodiversity, carbon, carbon negative, carbon neutral, carbon zero, clean-up, clean, clean tech, climate change, climate positive, contamination, discharge, emission, energy-efficient, environmental, environmental risk, fuel, fuel efficiency, green, greenhouse gas, hazardous, low carbon, natural resource, net-zero emission, pollution, remediation, sustainability, sustainable, toxic, waste, water, zero carbon, zero net carbon"),
    ("social", "accident, antiracism, consumer protection, customer privacy, employee relation, equal, equal pay, equity, gender equality, health, human right, justice, labor relation, labor standard, racial equity, racial awareness, racial justice, working condition"),
    ("governance", "advocacy, antitrust, board independence, code of ethic, compensation, corporate culture, corporate governance, corruption, governance risk, justice, political lobbying, scandal, shareholder right, stability, stewardship, transparency"),
]

In [None]:
# Fetch the annual MD&A and risk-factor disclosures, with full history, for the
# selected companies.
with tqdm() as progress_bar:
    disclosures = cb.document_dataframe(
        period_type="annual",
        all_history=True,
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        # Alternatives noted by the author: "DJIA" as an identifier, or
        # entire_universe=True.
        company_identifiers=["msft", "orcl"],
        progress_bar=progress_bar,
    )

In [None]:
def get_contents(d):
    """Return the text of disclosure ``d``, or ``None`` if it cannot be read.

    Any exception raised by ``d.get_contents_text()`` is reported with
    ``print`` and swallowed, so one bad document does not abort a bulk run.
    """
    try:
        text = d.get_contents_text()
    except Exception as error:
        print(f"Exception getting {d} \n{error}")
        return None
    return text


# Pull the text out of every disclosure cell (missing cells are skipped), then
# replace whatever is still missing with an empty string.
disclosure_contents = (
    disclosures
    .progress_applymap(get_contents, na_action="ignore")
    .fillna("")
)

In [None]:
def distance(contents: pd.Series, embeddings):
    """Score each text in ``contents`` against a category embedding.

    Despite the name, the values are cosine similarities (computed with
    ``util.pytorch_cos_sim``), so a larger value means a closer match.

    Args:
        contents: Texts to encode, one per element.
        embeddings: Tensor produced by ``model.encode(..., convert_to_tensor=True)``
            for the category word list.

    Returns:
        NumPy array with one similarity per element of ``contents``, taken
        against the first row of ``embeddings``.
    """
    content_embeddings = model.encode(
        # encode() is documented for a str or a list of str; handing it a plain
        # list avoids depending on how a Series responds to integer indexing.
        list(contents),
        convert_to_tensor=True,
    )
    similarities = util.pytorch_cos_sim(content_embeddings, embeddings)
    # Tensor.numpy() only works on CPU tensors; .cpu() is a no-op when the
    # tensor is already there and prevents a failure when the model is on a GPU.
    return similarities.cpu().numpy().T[0]

In [None]:
# Score every disclosure against each category's word list; the resulting
# frames are collected in the same order as `categories`.
distances = []
for _label, category_words in tqdm(categories):
    # The whole comma-separated word list is embedded as a single text.
    embeddings = model.encode(category_words, convert_to_tensor=True)
    category_distances = disclosure_contents.progress_apply(
        distance,
        embeddings=embeddings,
    )
    distances.append(category_distances)

In [None]:
# Place the per-category frames side by side, keyed by category name.
pd.concat(
    distances,
    axis=1,
    keys=[entry[0] for entry in categories],
)