# Topic and word list from a working paper by Sophie Stone of the Dartmouth College Economics Department



## https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
# Dependencies
# !pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" tqdm qgrid ipywidgets nltk

In [3]:
import warnings
from collections import Counter
from typing import Set

import pandas as pd
import itertools
from tqdm.notebook import tqdm
import qgrid
import nltk

import calcbench as cb


# NOTE(review): presumably makes the Calcbench client retry failed requests with
# backoff (the "Backoff" extra is listed in the pip install above) — confirm
# against the calcbench-api-client docs.
cb.enable_backoff()
# Register tqdm's pandas integration; the progress_applymap calls below depend on it.
tqdm.pandas(desc="progress")

In [None]:
# ESG keyword lists: maps each category name ("environmental", "social",
# "governance") to a set of lower-case keywords/phrases.
# "justice" appears in both the "social" and "governance" sets.
# NOTE(review): the counters built further down are keyed on individual
# nltk.word_tokenize tokens, so the multi-word phrases here (e.g. "carbon negative")
# presumably never match a counter key — confirm whether phrase matching is intended.
category_embeddings = {
    "environmental": {
        "biodiversity",
        "carbon",
        "carbon negative",
        "carbon neutral",
        "carbon zero",
        "clean",
        "clean tech",
        "clean-up",
        "climate change",
        "climate positive",
        "contamination",
        "discharge",
        "emission",
        "energy-efficient",
        "environmental",
        "environmental risk",
        "fuel",
        "fuel efficiency",
        "green",
        "greenhouse gas",
        "hazardous",
        "low carbon",
        "natural resource",
        "net-zero emission",
        "pollution",
        "remediation",
        "sustainability",
        "sustainable",
        "toxic",
        "waste",
        "water",
        "zero carbon",
        "zero net carbon",
    },
    "social": {
        "accident",
        "antiracism",
        "consumer protection",
        "customer privacy",
        "employee relation",
        "equal",
        "equal pay",
        "equity",
        "gender equality",
        "health",
        "human right",
        "justice",
        "labor relation",
        "labor standard",
        "racial awareness",
        "racial equity",
        "racial justice",
        "working condition",
    },
    "governance": {
        "advocacy",
        "antitrust",
        "board independence",
        "code of ethic",
        "compensation",
        "corporate culture",
        "corporate governance",
        "corruption",
        "governance risk",
        "justice",
        "political lobbying",
        "scandal",
        "shareholder right",
        "stability",
        "stewardship",
        "transparency",
    },
}

In [None]:
# Request the annual "ManagementsDiscussionAndAnalysis" and "RiskFactors"
# disclosures, across all history, for the tickers returned by
# cb.tickers(index="SP500"). The tqdm bar is handed to the client so it can
# report progress.
with tqdm() as progress_bar:
    disclosures = cb.document_dataframe(
        company_identifiers=cb.tickers(index="SP500"),
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        all_history=True,
        period_type="annual",
        progress_bar=progress_bar,
        # entire_universe=True,
    )

In [None]:
def get_contents(d):
    """Return the text of disclosure ``d``, or ``None`` when it cannot be fetched.

    Best-effort helper: any exception raised by ``d.get_contents_text()`` is
    reported on stdout and swallowed, so one bad document does not abort a
    whole-DataFrame ``applymap``.
    """
    try:
        text = d.get_contents_text()
    except Exception as error:
        print(f"Exception getting {d} \n{error}")
        return None
    return text


# Fetch the text of every disclosure cell, skipping missing cells
# (na_action="ignore"). get_contents returns None on failure, so fillna("")
# turns both missing and failed cells into empty strings.
disclosure_contents = disclosures.progress_applymap(
    get_contents, na_action="ignore"
).fillna("")

In [None]:
# Cache the downloaded text on disk so later cells can reload it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
disclosure_contents.to_pickle(
    r"C:\Users\andre\Dropbox (Calcbench)\andrew\sp_500_contents.pkl"
)

In [None]:
# Reload the cached disclosure text written by the to_pickle cell.
# Unpickling can execute arbitrary code — only load pickle files you created yourself.
disclosure_contents = pd.read_pickle(
    r"C:\Users\andre\Dropbox (Calcbench)\andrew\sp_500_contents.pkl"
)

In [None]:
# Build a bag of words per disclosure: tokenize the text with nltk, lower-case
# every token, and count occurrences in a Counter.
disclosure_embeddings = disclosure_contents.progress_applymap(
    lambda text: Counter(map(str.lower, nltk.word_tokenize(text)))
)

In [None]:
# Cache the per-disclosure token counters on disk so later cells can reload them.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
disclosure_embeddings.to_pickle(
    r"C:\Users\andre\Dropbox (Calcbench)\andrew\sp_500_tokenize_embeddings.pkl"
)

In [4]:
# Reload the cached token counters written by the to_pickle cell.
# Unpickling can execute arbitrary code — only load pickle files you created yourself.
disclosure_embeddings = pd.read_pickle(
    r"C:\Users\andre\Dropbox (Calcbench)\andrew\sp_500_tokenize_embeddings.pkl"
)

In [5]:
# Keep only the columns whose second column level is "A" or "AAL"
# (the rendered table in this notebook labels that level "ticker").
disclosure_embeddings = disclosure_embeddings.loc[
    :, disclosure_embeddings.columns.isin(["A", "AAL"], level=1)
]

In [None]:
def distance(word_counts: pd.Series, *, categories=None):
    """Sum, per category, the counts of category keywords in each token counter.

    Args:
        word_counts: Series whose values are token -> count mappings
            (e.g. ``collections.Counter``).
        categories: Optional mapping of category name -> set of keywords.
            Defaults to the notebook-level ``category_embeddings``.

    Returns:
        A dict mapping each category name to a Series (same index as
        ``word_counts``) holding, for every counter, the total count of keys
        that appear in that category's keyword set.

    Note:
        Keys are compared verbatim against the keyword set, so a multi-word
        phrase only contributes if the counter holds it as a single key.
    """
    if categories is None:
        categories = category_embeddings
    # ``results`` used to be assigned without ever being created, which raised
    # NameError unless a stale notebook global of that name existed.
    results = {}
    for category, phrases in categories.items():
        results[category] = word_counts.apply(
            # Bind the keyword set per iteration to avoid late-binding surprises.
            lambda counter, phrases=phrases: sum(
                count for word, count in counter.items() if word in phrases
            )
        )
    return results

In [36]:
# Work with the environmental keyword set for the per-word breakdown below.
words = category_embeddings['environmental']

In [37]:
# Replace every Counter cell with the list of its counts for each keyword (in the
# iteration order of `words`; a Counter yields 0 for absent keys), then explode each
# column so every original row expands into one row per keyword.
counts = disclosure_embeddings.applymap(
    lambda counter: [counter[keyword] for keyword in words]
).apply(lambda column: column.explode())

In [39]:
# Label each exploded row with its keyword: the keyword list repeats once per
# original row, in the same set iteration order used when the counts were built.
counts["words"] = list(words) * len(disclosure_embeddings.index)

In [40]:
# Show the counts indexed by (existing index, keyword). The result is not assigned,
# so `counts` itself is left unchanged; the notebook only renders the returned frame.
counts.set_index("words", append=True)

Unnamed: 0_level_0,disclosure_type_name,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,RiskFactors,RiskFactors
Unnamed: 0_level_1,ticker,A,AAL,A,AAL
period,words,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2007,carbon negative,0,0,0,0
2007,fuel efficiency,0,0,0,0
2007,greenhouse gas,0,0,0,0
2007,carbon zero,0,0,0,0
2007,climate change,0,0,0,0
2007,discharge,0,0,0,0
2007,fuel,0,0,0,0
2007,sustainability,0,0,0,0
2007,toxic,0,0,0,0
2007,green,0,0,0,0


In [None]:
# For each ESG category, total the occurrences of that category's keywords in
# every disclosure's token counter, then place the per-category frames side by
# side under a top-level column key named after the category.
#
# The counting is done inline: the previous call `distance(word_counts,
# category_words)` did not match `distance(word_counts: pd.Series)`, which takes
# a single Series argument, and so raised TypeError.
distances = []
for category_words in tqdm(category_embeddings.values()):
    category_distances = disclosure_embeddings.progress_applymap(
        lambda word_counts: sum(
            count for word, count in word_counts.items() if word in category_words
        )
    )
    distances.append(category_distances)
# Dicts preserve insertion order, so the keys line up with `distances`.
esg_distances = pd.concat(distances, axis=1, keys=list(category_embeddings))

In [None]:
# Write the per-category keyword counts to an Excel workbook in the working directory.
# NOTE(review): the file name says "djia", but the disclosures above were requested
# for index="SP500" — confirm the intended name.
esg_distances.to_excel("djia_esg_word_counts.xlsx")

In [None]:
def matching_sentences(disclosure: str, matching_phrases: Set[str]):
    """Print each sentence of ``disclosure`` containing a token in ``matching_phrases``.

    The text is split into sentences with ``nltk.sent_tokenize``; each sentence
    is tokenized with ``nltk.word_tokenize`` and its lower-cased tokens are
    intersected with ``matching_phrases``. For every sentence with at least one
    hit, the matched tokens and the original sentence are printed.

    Args:
        disclosure: Full text of one disclosure.
        matching_phrases: Lower-case keywords to look for.

    Returns:
        None. Results are printed, not returned.

    Note:
        Matching is per token, so an entry in ``matching_phrases`` only hits if
        the tokenizer emits it as a single token.
    """
    for sentence in nltk.sent_tokenize(disclosure):
        # Lower-case the tokens so matching agrees with the lower-cased token
        # counters; otherwise capitalised keywords (e.g. at the start of a
        # sentence) are missed.
        tokens = {token.lower() for token in nltk.word_tokenize(sentence)}
        matches = tokens & matching_phrases
        if matches:
            print(matches, sentence)

In [None]:
# Print, for every disclosure, the sentences that mention an environmental keyword.
# matching_sentences prints its findings and has no return statement, so the
# DataFrame produced by applymap holds only None values.
disclosure_contents.applymap(
    lambda c: matching_sentences(c, category_embeddings["environmental"])
)