# Topic and word list from a working paper by Sophie Stone of the Dartmouth College Economics Department



## https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
# Dependencies
# !pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" tqdm qgrid ipywidgets nltk spacy
# !python -m spacy download en_core_web_sm

In [None]:
import warnings
from collections import Counter
from typing import Set, Iterable
import numpy as np

import pandas as pd
import itertools
from tqdm.notebook import tqdm
import qgrid
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English


# English pipeline used for tokenizing phrases and disclosures; no extra
# pipeline components are added to it anywhere in this file.
nlp = English()

import calcbench as cb

# NOTE(review): presumably enables retry-with-backoff for Calcbench API calls —
# confirm against the calcbench-api-client documentation.
cb.enable_backoff()
# Registers the tqdm-backed `progress_*` methods on pandas objects;
# `progress_applymap` is used further down in this file.
tqdm.pandas(desc="progress")

# Disclosures are truncated to this many characters before being passed to `nlp`.
MAX_DOC_LENGTH = 1000000  # The maximum document length spacy can handle

In [None]:
# ESG phrase lists keyed by category name ("environmental", "social",
# "governance"). Despite the variable name, the values are plain sets of
# lower-case phrase strings, not vector embeddings.
# Multi-word phrases are separated by single spaces; some entries are hyphenated.
# Note that "justice" is listed under both "social" and "governance".
category_embeddings = {
    "environmental": {
        "biodiversity",
        "carbon",
        "carbon negative",
        "carbon neutral",
        "carbon zero",
        "clean",
        "clean tech",
        "clean-up",
        "climate change",
        "climate positive",
        "contamination",
        "discharge",
        "emission",
        "energy-efficient",
        "environmental",
        "environmental risk",
        "fuel",
        "fuel efficiency",
        "green",
        "greenhouse gas",
        "hazardous",
        "low carbon",
        "natural resource",
        "net-zero emission",
        "pollution",
        "remediation",
        "sustainability",
        "sustainable",
        "toxic",
        "waste",
        "water",
        "zero carbon",
        "zero net carbon",
    },
    "social": {
        "accident",
        "antiracism",
        "consumer protection",
        "customer privacy",
        "employee relation",
        "equal",
        "equal pay",
        "equity",
        "gender equality",
        "health",
        "human right",
        "justice",  # also listed under "governance"
        "labor relation",
        "labor standard",
        "racial awareness",
        "racial equity",
        "racial justice",
        "working condition",
    },
    "governance": {
        "advocacy",
        "antitrust",
        "board independence",
        "code of ethic",
        "compensation",
        "corporate culture",
        "corporate governance",
        "corruption",
        "governance risk",
        "justice",  # also listed under "social"
        "political lobbying",
        "scandal",
        "shareholder right",
        "stability",
        "stewardship",
        "transparency",
    },
}

In [None]:
# Location of the pickled disclosure contents that is loaded below.
# NOTE: this is a machine-specific absolute Windows path; point it at your own
# copy of the file before running.
contents_pkl_file = r"C:\Users\andre\Dropbox (Calcbench)\andrew\sp_500_contents.pkl"

In [None]:
# Load the pickled disclosure contents.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
# NOTE(review): later code calls `progress_applymap` on this object and treats
# each cell as a string, so it is presumably a DataFrame whose cells hold
# disclosure text — confirm against whatever produced the pickle.
disclosure_contents = pd.read_pickle(contents_pkl_file)

In [None]:
# Token matcher for the "environmental" phrase list only: one case-insensitive
# pattern per phrase, all registered under the key "environmental".
matcher = Matcher(nlp.vocab)
# Tokenize each phrase with the pipeline's own tokenizer instead of
# `str.split()`. The tokenizer breaks hyphenated words into several tokens, so
# a whitespace-split pattern such as {"LOWER": "clean-up"} could never line up
# with the tokens of a parsed document and would silently never match.
phrase_matches = [
    [{"LOWER": token.lower_} for token in nlp.make_doc(phrase)]
    for phrase in category_embeddings["environmental"]
]
matcher.add("environmental", phrase_matches)

In [None]:
def _build_phrase_matcher(name, phrases):
    """Return a ``Matcher`` with one case-insensitive pattern per phrase.

    Each phrase is split with the pipeline's own tokenizer rather than
    ``str.split`` so that the pattern has the same token boundaries as the
    documents it is matched against (this matters for hyphenated phrases such
    as "clean-up" or "net-zero emission").

    Args:
        name: Key under which all patterns are registered in the matcher.
        phrases: Iterable of phrase strings to match.
    """
    phrase_matcher = Matcher(nlp.vocab)
    patterns = [
        [{"LOWER": token.lower_} for token in nlp.make_doc(phrase)]
        for phrase in sorted(phrases)
    ]
    phrase_matcher.add(name, patterns)
    return phrase_matcher


# For every category, count how often each of its phrases occurs in every
# disclosure and write the result to "<category>_phrase_counts.xlsx". The
# output has one row per (original row, matched phrase) and the same columns
# as `disclosure_contents`.
for category, category_phrases in tqdm(category_embeddings.items()):
    # Build the matcher from *this* category's phrases. Reusing one matcher
    # built from a single category would write the same counts to every file.
    category_matcher = _build_phrase_matcher(category, category_phrases)
    # Every distinct phrase seen in any disclosure for this category.
    matched_phrases = set()

    def disclosure_matches(disclosure: str) -> Counter:
        """Count the current category's phrase occurrences in one disclosure.

        Returns a ``Counter`` keyed by the lower-cased matched text. Cells
        that do not hold text yield an empty ``Counter``.
        """
        # https://stackoverflow.com/questions/47638877/using-phrasematcher-in-spacy-to-find-multiple-match-types
        if not isinstance(disclosure, str):
            # Missing cells (e.g. NaN) have no text to search.
            return Counter()
        # Truncate so the document stays within the length limit.
        doc = nlp(disclosure[:MAX_DOC_LENGTH])
        match_words = [
            doc[start:end].text.lower() for _, start, end in category_matcher(doc)
        ]
        matched_phrases.update(match_words)
        return Counter(match_words)

    phrase_counts = disclosure_contents.progress_applymap(disclosure_matches)
    # Sorted so that the row order of the output is reproducible between runs.
    all_phrases = sorted(matched_phrases)
    if not all_phrases:
        # With no matches there are no phrase rows to write; the reshaping
        # below would fail on the length mismatch.
        warnings.warn(f"No phrases matched for category {category!r}; skipping.")
        continue
    # Turn every cell into a list of counts ordered like `all_phrases`, then
    # explode so each phrase gets its own row under the original index label.
    counts = phrase_counts.progress_applymap(
        lambda cell_counts: [cell_counts[p] for p in all_phrases]
    ).apply(pd.Series.explode)
    # After exploding, rows repeat in `all_phrases` order for each original row.
    counts["phrases"] = all_phrases * disclosure_contents.shape[0]
    counts.set_index("phrases", append=True, inplace=True)
    counts.to_excel(f"{category}_phrase_counts.xlsx")