# Topic and word list from a working paper by Sophie Stone of the Dartmouth College Economics Department



## https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
# Dependencies
# !pip install "calcbench-api-client[Pandas, Backoff, BeautifulSoup]" sentence-transformers tqdm qgrid ipywidgets

In [3]:
import warnings
from collections import Counter

import pandas as pd
import itertools
from tqdm.notebook import tqdm
import qgrid

import calcbench as cb


cb.enable_backoff()
tqdm.pandas(desc="progress")

In [66]:
categories = {
    "environmental": """biodiversity, carbon, carbon negative, carbon neutral, carbon zero, clean-up, clean, clean tech, climate change, climate positive, contamination, discharge, emission, energy-efficient, environmental, environmental risk, fuel, fuel efficiency, green, greenhouse gas, hazardous, low carbon, natural resource, net-zero emission, pollution, remediation, sustainability, sustainable, toxic, waste, water, zero carbon, zero net carbon""",
    "social": """accident, antiracism, consumer protection, customer privacy, employee relation, equal, equal pay, equity, gender equality, health, human right, justice, labor relation, labor standard, racial equity, racial awareness, racial justice, working condition""",
    "governance": """advocacy, antitrust, board independence, code of ethic, compensation, corporate culture, corporate governance, corruption, governance risk, justice, political lobbying, scandal, shareholder right, stability, stewardship, transparency""",
}

category_embeddings = {
    category: set(w.strip() for w in category_words.split(","))
    for (category, category_words) in categories.items()
}

In [67]:
category_embeddings

{'environmental': {'biodiversity',
  'carbon',
  'carbon negative',
  'carbon neutral',
  'carbon zero',
  'clean',
  'clean tech',
  'clean-up',
  'climate change',
  'climate positive',
  'contamination',
  'discharge',
  'emission',
  'energy-efficient',
  'environmental',
  'environmental risk',
  'fuel',
  'fuel efficiency',
  'green',
  'greenhouse gas',
  'hazardous',
  'low carbon',
  'natural resource',
  'net-zero emission',
  'pollution',
  'remediation',
  'sustainability',
  'sustainable',
  'toxic',
  'waste',
  'water',
  'zero carbon',
  'zero net carbon'},
 'social': {'accident',
  'antiracism',
  'consumer protection',
  'customer privacy',
  'employee relation',
  'equal',
  'equal pay',
  'equity',
  'gender equality',
  'health',
  'human right',
  'justice',
  'labor relation',
  'labor standard',
  'racial awareness',
  'racial equity',
  'racial justice',
  'working condition'},
 'governance': {'advocacy',
  'antitrust',
  'board independence',
  'code of ethic'

In [10]:
with tqdm() as progress_bar:
    disclosures = cb.document_dataframe(
        company_identifiers=cb.tickers(index="DJIA"),
        disclosure_names=["ManagementsDiscussionAndAnalysis", "RiskFactors"],
        all_history=True,
        period_type="annual",
        progress_bar=progress_bar,
        # entire_universe=True,
    )

0it [00:00, ?it/s]

In [11]:
def get_contents(d):
    try:
        return d.get_contents_text()
    except Exception as e:
        print(f"Exception getting {d} \n{e}")


disclosure_contents = disclosures.progress_applymap(
    get_contents, na_action="ignore"
).fillna("")

progress:   0%|          | 0/900 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
disclosure_contents = pd.read_pickle("disclosure_contents.pkl")

In [14]:
disclosure_embeddings = disclosure_contents.progress_applymap(
    lambda c: Counter(c.split())
)

progress:   0%|          | 0/900 [00:00<?, ?it/s]

In [44]:
def distance(column_embeddings: Counter, category_embeddings: set):
    print(column_embeddings)
    return sum(count for word, count in column_embeddings.items() if word in category_embeddings)

In [51]:
distances = []
for _, category_words in tqdm(categories.items()):
    category_distances = disclosure_embeddings.progress_applymap(
        lambda word_counts: sum(count for word, count in word_counts.items() if word in category_words)
    )
    distances.append(category_distances)
esg_distances = pd.concat(
    distances, axis=1, keys=[category for category, _ in categories.items()]
)

  0%|          | 0/3 [00:00<?, ?it/s]

progress:   0%|          | 0/900 [00:00<?, ?it/s]

progress:   0%|          | 0/900 [00:00<?, ?it/s]

progress:   0%|          | 0/900 [00:00<?, ?it/s]

In [70]:
esg_distances.environmental / disclosure_embeddings.progress_applymap(lambda d: sum(d.values()))

progress:   0%|          | 0/900 [00:00<?, ?it/s]

disclosure_type_name,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,ManagementsDiscussionAndAnalysis,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors,RiskFactors
ticker,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,GS,HD,HON,IBM,INTC,JNJ,JPM,KO,MCD,MMM,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,GS,HD,HON,IBM,INTC,JNJ,JPM,KO,MCD,MMM,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
period,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2
2007,,,,,,,,,,,,,,,,,,,0.10604,,,,,,,0.11406,,,,,,,,,,,,,,,,,,,,,,,0.145026,,,,,,,,,,,
2008,,,,,,,,,,,,,,,,,,,0.105115,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2009,0.095056,0.113332,0.100589,0.103651,0.095581,,,0.138889,0.100254,,0.11449,0.110978,0.113894,0.094293,0.118982,0.102784,0.082278,0.121404,0.100622,0.091929,0.108013,,,,0.105206,0.10343,0.113986,0.119145,,,0.088506,0.12829,0.136912,0.120979,0.131678,,,0.128904,0.128961,,0.150554,0.133781,0.125298,0.093739,0.154164,0.146341,0.133194,0.151304,0.144949,0.109043,0.102456,,,,0.136204,0.129007,0.133721,0.149844,,
2010,0.095013,0.112136,0.102545,0.104647,0.094241,0.100024,0.114542,0.138889,0.094739,,0.112083,0.114489,0.109577,0.091664,0.122988,0.097171,0.085054,0.1242,0.097486,0.092852,0.109546,0.088738,0.116309,0.095238,0.102667,0.102153,0.114877,0.120528,0.101427,0.119099,0.091631,0.130714,0.133557,0.124195,0.136187,0.140695,0.131522,0.134126,0.131804,,0.150682,0.13719,0.124971,0.089237,0.153292,0.153846,0.102693,0.147921,0.139124,0.108434,0.103929,0.140145,0.142438,0.164384,0.134728,0.125014,0.134212,0.148261,0.131177,0.155701
2011,0.096849,0.113288,0.098601,0.101509,0.069583,0.105519,0.114159,0.15625,0.100617,,0.106948,0.120953,0.107907,0.087742,0.125476,0.103945,0.093065,0.129646,0.098662,0.098442,0.110669,0.089301,0.11806,0.135532,0.103416,0.10444,0.116852,0.121738,0.10293,0.117697,0.095875,0.128314,0.136147,0.127265,0.135311,0.142974,0.132419,0.125077,0.136018,,0.145914,0.140512,0.124974,0.088442,0.142561,0.153846,0.101528,0.148686,0.133954,0.101472,0.100894,0.135294,0.140933,0.163831,0.133798,0.125719,0.136802,0.151729,0.130311,0.152296
2012,0.099883,0.120983,0.100678,0.106739,0.100709,0.109438,0.114235,0.098191,0.097699,,0.10627,0.125059,0.110492,0.090425,0.126077,0.100301,0.092179,0.128629,0.108868,0.100402,0.111602,0.094509,0.113494,0.129306,0.104312,0.108375,0.114912,0.118762,0.111111,0.121048,0.096618,0.129521,0.138393,0.125786,0.138989,0.143693,0.133167,0.125756,0.136913,,0.145933,0.144696,0.125539,0.08849,0.143326,0.162162,0.101463,0.15123,0.141319,0.103654,0.10121,0.132557,0.143161,0.153546,0.130993,0.121422,0.136506,0.148079,0.122586,0.152064
2013,0.097571,0.117795,0.098635,0.107302,,0.112211,0.121479,0.07377,0.102605,,0.110161,0.121982,0.109485,0.092779,0.128146,0.100541,0.103364,0.128052,0.111912,0.100177,0.113493,0.101378,0.116139,0.133257,0.104716,0.106536,0.112031,0.122158,0.12476,0.127637,0.098747,0.131914,0.137276,0.123242,0.141594,0.145261,0.132007,0.125227,0.140646,,0.144169,0.147501,0.12631,0.092113,0.144201,0.162162,0.099311,0.149885,0.138025,0.106114,0.101082,0.134542,0.144341,0.151873,0.1325,0.130192,0.132392,0.141043,0.123562,0.146763
2014,0.100646,0.112709,0.113783,0.107419,0.108975,0.113343,0.122921,0.073171,0.108294,,0.116484,0.127275,0.108926,0.092942,0.107341,0.103593,0.096208,0.127121,0.113106,0.097662,0.113601,,0.113946,0.12847,0.095491,0.10327,0.109486,0.12744,0.128205,0.129752,0.099548,0.13221,0.139863,0.12254,0.145551,0.145433,0.132978,0.124202,0.140255,,0.142312,0.135434,0.126405,0.092402,0.141892,0.082353,0.097299,0.146479,0.151342,0.106568,0.102571,0.127816,0.144845,0.148882,0.133742,0.129394,0.131661,0.14537,0.116724,0.142002
2015,0.099127,0.114365,0.10843,0.112121,0.110315,0.117067,0.121974,0.078261,0.108942,,0.114134,0.125208,0.105367,0.093814,0.122741,0.114234,0.095816,0.12588,0.106913,0.094916,0.114585,0.102793,0.116083,0.128141,0.094618,0.100408,0.108465,0.12682,0.119491,0.131562,0.099384,0.130553,0.144064,0.121858,0.145218,0.147003,0.133183,0.120447,0.141259,,0.138852,0.139283,0.12533,0.092632,0.140025,0.073171,0.098624,0.147675,0.148623,0.10894,0.101536,0.132465,0.145774,0.141917,0.133998,0.128286,0.130806,0.136149,0.122978,0.145122
2016,0.098814,0.111216,0.109207,0.108193,0.111057,0.114995,0.119294,0.078261,0.1135,,0.113174,0.123461,0.100978,0.093373,0.107823,0.117588,0.096262,0.127339,0.101791,0.09396,0.114157,0.108003,0.1101,0.127869,0.099607,0.097362,0.104866,0.126447,0.113822,0.127858,0.100769,0.12981,0.144507,0.122607,0.141356,0.145733,0.132801,0.123485,0.141509,,0.139991,0.138008,0.122716,0.093886,0.139149,0.095152,0.099741,0.148152,0.147013,0.107798,0.102736,0.132325,0.147137,0.136785,0.134482,0.12893,0.128662,0.143135,0.125564,0.142469
