# Topic and word list from a working paper by Sophie Stone of the Dartmouth College Economics Department



## Method: semantic textual similarity with Sentence-Transformers (https://www.sbert.net/docs/usage/semantic_textual_similarity.html)

In [None]:
# Dependencies
# !pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" sentence-transformers tqdm qgrid ipywidgets

In [None]:
import warnings

import pandas as pd
import itertools
from tqdm.notebook import tqdm
import qgrid

import torch

# https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer, util
import calcbench as cb

# Register tqdm's `progress_apply` / `progress_applymap` on pandas objects; the
# cells below call them on DataFrames.
tqdm.pandas(desc="progress")

# Turn on the Calcbench client's back-off support.
# NOTE(review): presumably this retries failed API calls -- confirm in the client docs.
cb.enable_backoff()

# Sentence-embedding model shared by every `model.encode` call in this notebook.
SENTENCE_MODEL_NAME = "distilbert-base-nli-stsb-mean-tokens"
model = SentenceTransformer(SENTENCE_MODEL_NAME)

In [None]:
# Scratch cell: displays a one-element list holding a single comma-separated string.
# NOTE(review): looks like a leftover experiment with the word-list format used
# in `categories`; confirm and delete if it is no longer needed.
["cat, dog, bird"]

In [98]:
# ESG topic -> comma-separated word list. Each value is ONE string (not a list
# of words); adjacent literals below are joined by Python's implicit string
# concatenation, so every fragment except the last ends with ", ".
categories = {
    "environmental": (
        "biodiversity, carbon, carbon negative, carbon neutral, carbon zero, "
        "clean-up, clean, clean tech, climate change, climate positive, "
        "contamination, discharge, emission, energy-efficient, environmental, "
        "environmental risk, fuel, fuel efficiency, green, greenhouse gas, "
        "hazardous, low carbon, natural resource, net-zero emission, pollution, "
        "remediation, sustainability, sustainable, toxic, waste, water, "
        "zero carbon, zero net carbon"
    ),
    "social": (
        "accident, antiracism, consumer protection, customer privacy, "
        "employee relation, equal, equal pay, equity, gender equality, health, "
        "human right, justice, labor relation, labor standard, racial equity, "
        "racial awareness, racial justice, working condition"
    ),
    "governance": (
        "advocacy, antitrust, board independence, code of ethic, compensation, "
        "corporate culture, corporate governance, corruption, governance risk, "
        "justice, political lobbying, scandal, shareholder right, stability, "
        "stewardship, transparency"
    ),
}

# Category name -> embedding tensor. Each category's whole comma-separated word
# list is passed to the model as a single string.
category_embeddings = dict(
    (name, model.encode(word_list, convert_to_tensor=True))
    for name, word_list in categories.items()
)

In [56]:
# Fetch the annual MD&A and Risk Factors disclosures, over all available
# history, for the tickers Calcbench returns for the "DJIA" index.
djia_tickers = cb.tickers(index="DJIA")
with tqdm() as download_bar:
    disclosures = cb.document_dataframe(
        company_identifiers=djia_tickers,
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        period_type="annual",
        all_history=True,
        progress_bar=download_bar,
        # entire_universe=True,
    )

0it [00:00, ?it/s]

In [58]:
def get_contents(d):
    """Return the text of disclosure ``d``, or ``None`` if fetching it fails.

    Best effort by design: any exception raised by ``d.get_contents_text()`` is
    printed together with the disclosure and swallowed, so a single bad
    document does not abort a whole-frame ``applymap``.
    """
    try:
        text = d.get_contents_text()
    except Exception as e:
        print(f"Exception getting {d} \n{e}")
        return None
    return text


# Download the text of every disclosure cell. Missing cells are skipped
# (na_action="ignore"); they and any failed fetches (None) become "" so every
# cell holds a string.
fetched_contents = disclosures.progress_applymap(get_contents, na_action="ignore")
disclosure_contents = fetched_contents.fillna("")

progress:   0%|          | 0/900 [00:00<?, ?it/s]

In [60]:
def embed_document(text, words_per_chunk=64):
    """Embed a long document as the mean of the embeddings of its chunks.

    The encoder truncates every input string to its maximum sequence length
    (see ``model.max_seq_length``), so passing a whole multi-thousand-word
    filing in one call embeds only its opening lines. Filings that share the
    same boilerplate opening then get identical vectors. To use the full text,
    the document is cut into consecutive windows of ``words_per_chunk``
    whitespace-separated words, each window is encoded, and the window
    embeddings are averaged.

    Parameters
    ----------
    text : str
        Document text; may be empty.
    words_per_chunk : int, default 64
        Window size in words. This only approximates the model's token limit
        (a word can map to several word pieces), so it is kept conservative.

    Returns
    -------
    torch.Tensor
        A 1-D embedding, the same shape ``model.encode`` returns for one string.
    """
    words = text.split()
    if not words:
        # Nothing to chunk: keep the model's embedding of the empty string so
        # empty cells still get a tensor of the right shape.
        return model.encode(text, convert_to_tensor=True)
    chunks = [
        " ".join(words[start : start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]
    # Encoding a list returns one row per chunk; average over the chunk axis.
    return model.encode(chunks, convert_to_tensor=True).mean(dim=0)


disclosure_embeddings = disclosure_contents.progress_applymap(embed_document)

progress:   0%|          | 0/900 [00:00<?, ?it/s]

In [103]:
def distance(column_embeddings: pd.Series, category_embeddings):
    """Cosine similarity of every embedding in a column to a category embedding.

    Despite the name, this returns similarities from
    ``util.pytorch_cos_sim``, not distances.

    Parameters
    ----------
    column_embeddings : pd.Series
        Series whose values are embedding tensors; they are stacked into one
        matrix, so they must all share a shape.
    category_embeddings :
        Embedding(s) to compare against. Only the first one is used: the
        result is row 0 of the transposed similarity matrix.

    Returns
    -------
    torch.Tensor
        One similarity per element of ``column_embeddings``, in order.
    """
    embedding_matrix = torch.stack([embedding for embedding in column_embeddings])
    similarity = util.pytorch_cos_sim(embedding_matrix, category_embeddings)
    return similarity.T[0]

In [70]:
# Score every disclosure against each ESG category and combine the per-category
# frames side by side, with the category name as the outermost column level.
#
# `categories` is a dict, so iterating it yields the category names only;
# unpacking each name into two variables (as if it were a pair) raises
# ValueError. Iterate the names and look up the embedding already computed in
# `category_embeddings`, which also avoids rebinding that dict to a tensor.
distances = []
for category in tqdm(categories):
    category_distances = disclosure_embeddings.progress_apply(
        distance, category_embeddings=category_embeddings[category]
    )
    distances.append(category_distances)
esg_distances = pd.concat(distances, axis=1, keys=list(categories))

  0%|          | 0/3 [00:00<?, ?it/s]

progress:   0%|          | 0/60 [00:00<?, ?it/s]

progress:   0%|          | 0/60 [00:00<?, ?it/s]

progress:   0%|          | 0/60 [00:00<?, ?it/s]

In [118]:
# Diagnostic: whitespace-delimited word count of each Coca-Cola (KO) MD&A text;
# a count of 0 means that cell's text is empty.
ko_mdna_texts = disclosure_contents["ManagementsDiscussionAndAnalysis", "KO"]
[len(text.split()) for text in ko_mdna_texts]

[0,
 0,
 24538,
 34050,
 33391,
 33826,
 28465,
 27580,
 27407,
 27784,
 27781,
 26944,
 23429,
 20833,
 0]

In [121]:
# Diagnostic: embed each KO MD&A text with a single `model.encode` call and
# compare the vectors across years.
ko_mdna = disclosure_contents["ManagementsDiscussionAndAnalysis", "KO"]
ko_mdna.apply(lambda text: model.encode(text, convert_to_tensor=True))

period
2007    [tensor(-0.3330), tensor(0.4071), tensor(0.466...
2008    [tensor(-0.3330), tensor(0.4071), tensor(0.466...
2009    [tensor(0.0904), tensor(0.6818), tensor(0.3599...
2010    [tensor(0.0904), tensor(0.6818), tensor(0.3599...
2011    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2012    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2013    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2014    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2015    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2016    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2017    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2018    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2019    [tensor(0.1042), tensor(0.7120), tensor(0.3881...
2020    [tensor(0.1163), tensor(0.7281), tensor(0.2686...
2021    [tensor(-0.3330), tensor(0.4071), tensor(0.466...
Freq: A-DEC, Name: (ManagementsDiscussionAndAnalysis, KO), dtype: object

In [126]:
# Diagnostic: embedding of KO's 2011 MD&A on its own, for comparison with 2012.
ko_mdna_2011 = disclosure_contents["ManagementsDiscussionAndAnalysis", "KO"]["2011"]
model.encode(ko_mdna_2011, convert_to_tensor=True)

tensor([ 1.0419e-01,  7.1195e-01,  3.8808e-01, -1.2603e-01,  4.1063e-01,
         1.2477e-01,  1.8526e-01, -1.1357e-01, -2.3724e-01, -3.8541e-02,
         5.9021e-02,  4.3346e-01, -2.4523e-01, -3.9408e-01,  1.0578e-01,
        -2.2742e-01,  8.3409e-01,  2.2292e-01,  3.8170e-01, -2.1000e-01,
         3.6251e-01,  6.3048e-01, -1.7260e-02,  3.4541e-01,  7.1464e-01,
        -5.7913e-03,  5.0781e-01,  5.4918e-01,  3.0199e-01, -7.0723e-01,
         3.5121e-01, -5.9891e-01, -4.2147e-01, -7.2128e-01,  8.3866e-02,
         2.1282e-01, -1.3906e-01,  8.6219e-02, -1.1572e+00, -2.7023e-01,
        -1.0882e-01, -8.0493e-02, -1.5484e-01,  7.2565e-01, -3.6709e-01,
        -3.2039e-03, -1.2916e-01,  1.3924e-01, -1.0436e+00,  3.5744e-01,
        -6.1307e-01, -3.9542e-01, -2.9205e-01, -6.9406e-01, -1.7607e-01,
         2.2078e-01, -1.7893e-01, -2.9947e-01, -5.1040e-01, -8.2245e-02,
         6.6126e-02, -1.9324e-01, -2.3568e-01, -3.7153e-01,  1.2056e-01,
        -1.0321e+00, -1.1255e-01,  2.5034e-01, -4.6

In [127]:
# Diagnostic: embedding of KO's 2012 MD&A on its own, for comparison with 2011.
ko_mdna_2012 = disclosure_contents["ManagementsDiscussionAndAnalysis", "KO"]["2012"]
model.encode(ko_mdna_2012, convert_to_tensor=True)

tensor([ 1.0419e-01,  7.1195e-01,  3.8808e-01, -1.2603e-01,  4.1063e-01,
         1.2477e-01,  1.8526e-01, -1.1357e-01, -2.3724e-01, -3.8541e-02,
         5.9021e-02,  4.3346e-01, -2.4523e-01, -3.9408e-01,  1.0578e-01,
        -2.2742e-01,  8.3409e-01,  2.2292e-01,  3.8170e-01, -2.1000e-01,
         3.6251e-01,  6.3048e-01, -1.7260e-02,  3.4541e-01,  7.1464e-01,
        -5.7913e-03,  5.0781e-01,  5.4918e-01,  3.0199e-01, -7.0723e-01,
         3.5121e-01, -5.9891e-01, -4.2147e-01, -7.2128e-01,  8.3866e-02,
         2.1282e-01, -1.3906e-01,  8.6219e-02, -1.1572e+00, -2.7023e-01,
        -1.0882e-01, -8.0493e-02, -1.5484e-01,  7.2565e-01, -3.6709e-01,
        -3.2039e-03, -1.2916e-01,  1.3924e-01, -1.0436e+00,  3.5744e-01,
        -6.1307e-01, -3.9542e-01, -2.9205e-01, -6.9406e-01, -1.7607e-01,
         2.2078e-01, -1.7893e-01, -2.9947e-01, -5.1040e-01, -8.2245e-02,
         6.6126e-02, -1.9324e-01, -2.3568e-01, -3.7153e-01,  1.2056e-01,
        -1.0321e+00, -1.1255e-01,  2.5034e-01, -4.6

In [129]:
# Diagnostic: show the 2012 KO MD&A text to see how the document begins.
disclosure_contents[("ManagementsDiscussionAndAnalysis", "KO")]["2012"]

'ITEM 7.\xa0\xa0MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONSOverviewThe following Management\'s Discussion and Analysis of Financial Condition and Results of Operations ("MD&A") is intended to help the reader understand The Coca-Cola Company, our operations and our present business environment. MD&A is provided as a supplement to\xa0— and should be read in conjunction with\xa0— our consolidated financial statements and the accompanying notes thereto contained in "Item\xa08. Financial Statements and Supplementary Data" of this report. This overview summarizes the MD&A, which includes the following sections:•Our Business\xa0— a general description of our business and the nonalcoholic beverage segment of the commercial beverage industry, our objective, our strategic priorities, our core capabilities, and challenges and risks of our business.•Critical Accounting Policies and Estimates\xa0— a discussion of accounting policies that require critical j

In [130]:
# Diagnostic: show the 2011 KO MD&A text to see how the document begins.
disclosure_contents[("ManagementsDiscussionAndAnalysis", "KO")]["2011"]

'ITEM 7.\xa0\xa0MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONSOverviewThe following Management\'s Discussion and Analysis of Financial Condition and Results of Operations ("MD&A") is intended to help the reader understand The Coca-Cola Company, our operations and our present business environment. MD&A is provided as a supplement to\xa0— and should be read in conjunction with\xa0— our consolidated financial statements and the accompanying notes thereto contained in "Item\xa08. Financial Statements and Supplementary Data" of this report. This overview summarizes the MD&A, which includes the following sections:•Our Business\xa0— a general description of our business and the nonalcoholic beverage segment of the commercial beverage industry, our objective, our strategic priorities, our core capabilities, and challenges and risks of our business.•Critical Accounting Policies and Estimates\xa0— a discussion of accounting policies that require critical j