# Topic and word list from a working paper by Sophie Stone of the Dartmouth College Economics Department



## https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
# Dependencies
# !pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" sentence-transformers tqdm qgrid ipywidgets 

In [248]:
import warnings

import pandas as pd
import itertools
from tqdm.notebook import tqdm
import qgrid

# https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer, util
import calcbench as cb

# Pretrained sentence-embedding model; the same instance is used for every
# encode() call in the cells below.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

# NOTE(review): presumably turns on retry-with-backoff for Calcbench API
# requests — confirm against the calcbench client documentation.
cb.enable_backoff()
# Registers the progress_apply / progress_applymap methods on pandas objects,
# which the cells below rely on; "progress" is the label shown on each bar.
tqdm.pandas(desc="progress")

In [249]:
# ESG categories as (category name, comma-separated keyword list) pairs.
# Each keyword list is kept as one string rather than split into words.
# NOTE(review): "justice" appears in both the social and governance lists —
# confirm that the overlap is intentional.
categories = [
    (
        "environmental",
        """biodiversity, carbon, carbon negative, carbon neutral, carbon zero, clean-up, clean, clean tech, climate change, climate positive, contamination, discharge, emission, energy-efficient, environmental, environmental risk, fuel, fuel efficiency, green, greenhouse gas, hazardous, low carbon, natural resource, net-zero emission, pollution, remediation, sustainability, sustainable, toxic, waste, water, zero carbon, zero net carbon""",
    ),
    (
        "social",
        """accident, antiracism, consumer protection, customer privacy, employee relation, equal, equal pay, equity, gender equality, health, human right, justice, labor relation, labor standard, racial equity, racial awareness, racial justice, working condition""",
    ),
    (
        "governance",
        """advocacy, antitrust, board independence, code of ethic, compensation, corporate culture, corporate governance, corruption, governance risk, justice, political lobbying, scandal, shareholder right, stability, stewardship, transparency""",
    ),
]

In [250]:
# Download the annual MD&A and Risk Factors disclosures for the listed
# companies, across all available history, into a DataFrame of disclosure
# objects. The tqdm bar is handed to the client so it can report progress.
# NOTE(review): judging by the rendered output further down, the frame has one
# row per period and (disclosure type, ticker) columns — confirm.
with tqdm() as progress_bar:
    disclosures = cb.document_dataframe(
        company_identifiers=["msft", "orcl"],  # "DJIA"
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        all_history=True,
        period_type="annual",
        progress_bar=progress_bar,
        # entire_universe=True,
    )

0it [00:00, ?it/s]

In [251]:
def get_contents(d):
    """Return the text of disclosure ``d``, or ``None`` when it cannot be read.

    This is a best-effort helper: any exception raised while fetching the text
    is reported on stdout and swallowed, so one bad disclosure does not abort
    a whole batch.

    Args:
        d: An object exposing ``get_contents_text()``.

    Returns:
        Whatever ``d.get_contents_text()`` returns, or ``None`` on failure.
    """
    text = None
    try:
        text = d.get_contents_text()
    except Exception as e:
        # Deliberately broad: report the failure and fall through to None.
        print(f"Exception getting {d} \n{e}")
    return text


# Fetch the text of every disclosure, cell by cell, with a progress bar.
# Missing cells are skipped (na_action="ignore"); those, and any download that
# failed (get_contents returned None), end up as empty strings via fillna.
disclosure_contents = disclosures.progress_applymap(
    get_contents, na_action="ignore"
).fillna("")

progress:   0%|          | 0/44 [00:00<?, ?it/s]

In [282]:
# Exploratory check: encode each column of disclosure texts and display the
# embeddings as lists of tensors. The result is only shown, not stored.
disclosure_contents.progress_apply(lambda c : list(model.encode(c, convert_to_tensor=True)))

progress:   0%|          | 0/4 [00:00<?, ?it/s]

disclosure_type_name,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,RiskFactors,RiskFactors
ticker,MSFT,ORCL,MSFT,ORCL
period,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2010,"[tensor(-0.0815), tensor(0.5086), tensor(0.409...","[tensor(-0.0409), tensor(0.5338), tensor(0.253...","[tensor(-0.5752), tensor(0.5337), tensor(0.045...","[tensor(-0.1091), tensor(0.3647), tensor(-0.11..."
2011,"[tensor(0.0267), tensor(0.3679), tensor(0.4261...","[tensor(-0.0161), tensor(0.6351), tensor(0.233...","[tensor(-0.5466), tensor(0.4814), tensor(0.053...","[tensor(-0.1091), tensor(0.3647), tensor(-0.11..."
2012,"[tensor(0.0777), tensor(0.3125), tensor(0.5341...","[tensor(-0.0807), tensor(0.6055), tensor(0.407...","[tensor(-0.1781), tensor(0.4499), tensor(0.052...","[tensor(-0.1020), tensor(0.3097), tensor(-0.14..."
2013,"[tensor(-0.0837), tensor(0.2474), tensor(0.492...","[tensor(-0.2248), tensor(0.4081), tensor(0.361...","[tensor(-0.1801), tensor(0.4780), tensor(0.076...","[tensor(-0.1767), tensor(0.4423), tensor(-0.20..."
2014,"[tensor(-0.3330), tensor(0.4071), tensor(0.466...","[tensor(-0.1335), tensor(0.4905), tensor(0.385...","[tensor(-0.2326), tensor(0.4099), tensor(-0.06...","[tensor(-0.1020), tensor(0.3097), tensor(-0.14..."
2015,"[tensor(-0.2203), tensor(0.4431), tensor(0.481...","[tensor(0.0890), tensor(0.7324), tensor(0.6550...","[tensor(-0.2551), tensor(0.4232), tensor(0.115...","[tensor(-0.1020), tensor(0.3097), tensor(-0.14..."
2016,"[tensor(-0.2690), tensor(0.6047), tensor(0.464...","[tensor(0.0659), tensor(0.6676), tensor(0.5572...","[tensor(-0.1250), tensor(0.4783), tensor(0.085...","[tensor(-0.1491), tensor(0.3031), tensor(0.152..."
2017,"[tensor(-0.4079), tensor(0.5036), tensor(0.413...","[tensor(0.3988), tensor(0.6977), tensor(0.5984...","[tensor(-0.2551), tensor(0.4232), tensor(0.115...","[tensor(-0.1006), tensor(0.2886), tensor(0.194..."
2018,"[tensor(-0.4079), tensor(0.5036), tensor(0.413...","[tensor(0.3516), tensor(0.6838), tensor(0.5437...","[tensor(-0.2551), tensor(0.4232), tensor(0.115...","[tensor(-0.1491), tensor(0.3031), tensor(0.152..."
2019,"[tensor(-0.3389), tensor(0.5636), tensor(0.495...","[tensor(0.2449), tensor(0.5134), tensor(0.5144...","[tensor(-0.2551), tensor(0.4232), tensor(0.115...","[tensor(-0.0239), tensor(0.1828), tensor(0.176..."


In [None]:
def distance(contents: pd.Series, embeddings):
    """Score each text in ``contents`` against a reference embedding.

    Despite the name, the values returned are cosine *similarities* (computed
    with ``util.pytorch_cos_sim``), so a larger score means a closer match.

    NOTE(review): sentence-transformers models truncate each input to the
    model's maximum sequence length, so long disclosures are probably only
    partially represented in their embedding — confirm for this model.

    Args:
        contents: The texts to encode, one per element.
        embeddings: Reference embedding(s) to compare against. Only the first
            reference is used: row 0 of the transposed similarity matrix is
            returned.

    Returns:
        A NumPy array holding one similarity score per element of
        ``contents``, in the same order.
    """
    # Hand the encoder a plain list: it indexes its input positionally, and
    # positional indexing into a Series depends on the Series' index labels.
    content_embeddings = model.encode(
        list(contents),
        convert_to_tensor=True,
    )
    similarities = util.pytorch_cos_sim(content_embeddings, embeddings)
    # Tensor.numpy() only works for tensors in host memory; .cpu() is a no-op
    # when the tensor is already there and a copy when the model runs on a GPU.
    return similarities.cpu().numpy().T[0]

In [None]:
# For each ESG category, embed its keyword string once and score every column
# of disclosure texts against it. Results are collected in category order.
distances = []
for _name, word_list in tqdm(categories):
    category_embedding = model.encode(word_list, convert_to_tensor=True)
    distances.append(
        disclosure_contents.progress_apply(distance, embeddings=category_embedding)
    )

In [None]:
# Put the per-category score tables side by side, adding the category name as
# the outermost column level.
category_names = [name for name, _words in categories]
pd.concat(distances, keys=category_names, axis=1)