# Topic and word list from a working paper by Sophie Stone of the Dartmouth College Economics Department



## https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
# Dependencies
# !pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" sentence-transformers tqdm qgrid ipywidgets

In [21]:
import warnings

import pandas as pd
import itertools
from tqdm.notebook import tqdm
import qgrid

import torch

# https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer, util
import calcbench as cb

# Turn on the Calcbench client's backoff support before any API call is made
# (presumably retries throttled/failed requests -- see the client docs).
cb.enable_backoff()

# Register tqdm's ``progress_*`` methods on pandas objects; the cells below
# rely on ``progress_applymap`` / ``progress_apply``.
tqdm.pandas(desc="progress")

# Pretrained sentence-embedding model used to encode both the disclosure
# texts and the category keyword lists.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

In [2]:
# ESG topics to score disclosures against.
# Each entry is a ``(category_name, keywords)`` tuple where ``keywords`` is a
# single comma-separated string. The scoring loop further down passes the
# whole string to ``model.encode`` as one text, so each category is
# represented by one embedding rather than one embedding per keyword.
# The category names become the top-level column keys of the final table.
categories = [
    (
        "environmental",
        """biodiversity, carbon, carbon negative, carbon neutral, carbon zero, clean-up, clean, clean tech, climate change, climate positive, contamination, discharge, emission, energy-efficient, environmental, environmental risk, fuel, fuel efficiency, green, greenhouse gas, hazardous, low carbon, natural resource, net-zero emission, pollution, remediation, sustainability, sustainable, toxic, waste, water, zero carbon, zero net carbon""",
    ),
    (
        "social",
        """accident, antiracism, consumer protection, customer privacy, employee relation, equal, equal pay, equity, gender equality, health, human right, justice, labor relation, labor standard, racial equity, racial awareness, racial justice, working condition""",
    ),
    (
        "governance",
        """advocacy, antitrust, board independence, code of ethic, compensation, corporate culture, corporate governance, corruption, governance risk, justice, political lobbying, scandal, shareholder right, stability, stewardship, transparency""",
    ),
]

In [3]:
# Fetch the annual MD&A and Risk Factors disclosures for the selected
# companies from Calcbench, across all available history. The tqdm bar is
# handed to the client so it can report download progress.
with tqdm() as progress_bar:
    disclosures = cb.document_dataframe(
        progress_bar=progress_bar,
        period_type="annual",
        all_history=True,
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        company_identifiers=["msft", "orcl"],  # "DJIA"
        # entire_universe=True,
    )

0it [00:00, ?it/s]

In [4]:
def get_contents(d):
    """Return the text of disclosure ``d``, or ``None`` if it cannot be read.

    Best-effort helper: any exception raised by ``d.get_contents_text()`` is
    reported with ``print`` and swallowed so that one bad document does not
    abort processing of the whole frame.

    Args:
        d: object exposing a ``get_contents_text()`` method.

    Returns:
        The value returned by ``d.get_contents_text()``, or ``None`` when
        that call raised.
    """
    try:
        text = d.get_contents_text()
    except Exception as e:
        print(f"Exception getting {d} \n{e}")
        return None
    return text


# Replace every disclosure object in the frame with its text.
# ``na_action="ignore"`` leaves missing cells alone; those cells, and any
# cell where ``get_contents`` returned None after a failure, are then filled
# with the empty string so every cell holds a ``str``.
disclosure_contents = (
    disclosures
    .progress_applymap(get_contents, na_action="ignore")
    .fillna("")
)

progress:   0%|          | 0/44 [00:00<?, ?it/s]

In [9]:
# Encode each disclosure text into an embedding tensor, one per cell.
# Cells that were filled with "" upstream are encoded as well.
# NOTE(review): sentence-transformers models truncate input longer than the
# model's max_seq_length, so a long filing may be represented only by its
# opening tokens -- confirm this is the intended behaviour.
disclosure_embeddings = disclosure_contents.progress_applymap(
    lambda text: model.encode(text, convert_to_tensor=True)
)

progress:   0%|          | 0/44 [00:00<?, ?it/s]

In [53]:
def distance(column_embeddings: pd.Series, category_embeddings):
    """Score every embedding in a column against a category embedding.

    Despite the name, the values come from ``util.pytorch_cos_sim`` and are
    therefore cosine similarities (larger means more similar), not distances.

    Args:
        column_embeddings: Series whose values are embedding tensors; they
            are stacked into a single tensor, so they must share a shape.
        category_embeddings: embedding of the category keywords, passed
            straight through to ``util.pytorch_cos_sim``.

    Returns:
        The first row of the transposed similarity matrix: one score per
        entry of ``column_embeddings``, measured against the first category
        embedding.
    """
    stacked = torch.stack(list(column_embeddings))
    similarity = util.pytorch_cos_sim(stacked, category_embeddings)
    # Rows index the column's entries, columns index category embeddings;
    # transpose and take row 0 to get the scores for the first category.
    return similarity.T[0]

In [54]:
# Score every disclosure against each ESG category.
# For each category the keyword string is encoded as a single text, then
# ``distance`` is applied column by column to the frame of disclosure
# embeddings, producing one frame of scores per category. The frames are
# collected in ``distances`` in the same order as ``categories``.
distances = []
for _, category_words in tqdm(categories):
    category_embeddings = model.encode(category_words, convert_to_tensor=True)
    # The embeddings frame is named ``disclosure_embeddings``; the name
    # ``disclosure_tensors`` used here before is never defined in this
    # notebook and raised NameError on a clean top-to-bottom run.
    category_distances = disclosure_embeddings.progress_apply(
        distance, category_embeddings=category_embeddings
    )
    distances.append(category_distances)

  0%|          | 0/3 [00:00<?, ?it/s]

progress:   0%|          | 0/4 [00:00<?, ?it/s]

progress:   0%|          | 0/4 [00:00<?, ?it/s]

progress:   0%|          | 0/4 [00:00<?, ?it/s]

In [51]:
# Lay the per-category score frames side by side, adding the category name
# as the outermost column level. Left as the cell's final expression so the
# notebook renders the resulting table.
pd.concat(
    distances,
    axis=1,
    keys=[name for name, _ in categories],
)

Unnamed: 0_level_0,environmental,environmental,environmental,environmental,social,social,social,social,governance,governance,governance,governance
disclosure_type_name,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,RiskFactors,RiskFactors,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,RiskFactors,RiskFactors,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,RiskFactors,RiskFactors
ticker,MSFT,ORCL,MSFT,ORCL,MSFT,ORCL,MSFT,ORCL,MSFT,ORCL,MSFT,ORCL
period,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
2010,0.067704,0.046842,0.041494,0.069147,0.287402,0.28285,0.390782,0.374338,0.434765,0.399483,0.302868,0.367925
2011,0.056875,0.021732,0.031076,0.069147,0.317599,0.289613,0.388511,0.374338,0.406364,0.403239,0.310657,0.367925
2012,0.036274,-0.012102,0.125279,0.076744,0.287051,0.285193,0.397964,0.37268,0.386091,0.401582,0.376065,0.382831
2013,0.056632,0.041432,0.126557,0.085984,0.286282,0.321872,0.405035,0.385839,0.39031,0.405992,0.363333,0.375781
2014,0.096285,0.02941,0.172148,0.076744,0.026628,0.321093,0.400975,0.37268,0.035563,0.412648,0.396679,0.382831
2015,0.097422,0.058823,0.138491,0.076744,0.283516,0.322281,0.408795,0.37268,0.416288,0.468417,0.369809,0.382831
2016,0.10832,0.083348,0.125647,0.12213,0.285077,0.354271,0.397462,0.372968,0.401584,0.478349,0.370774,0.416945
2017,0.135766,0.035972,0.138491,0.115558,0.288576,0.322182,0.408795,0.36556,0.428353,0.441306,0.369809,0.407424
2018,0.135766,0.040085,0.138491,0.12213,0.288576,0.325882,0.408795,0.372968,0.428353,0.445339,0.369809,0.416945
2019,0.109887,0.063685,0.138491,0.057861,0.280293,0.351092,0.408795,0.368606,0.415492,0.465294,0.369809,0.343619
