## https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [99]:
import pandas as pd
import itertools
from tqdm.notebook import tqdm

# https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
import calcbench as cb

In [67]:
categories = [
    (
        "environmental",
        """biodiversity, carbon, carbon negative, carbon neutral, carbon zero, clean-up, clean, clean tech, climate change, climate positive, contamination, discharge, emission, energy-efficient, environmental, environmental risk, fuel, fuel efficiency, green, greenhouse gas, hazardous, low carbon, natural resource, net-zero emission, pollution, remediation, sustainability, sustainable, toxic, waste, water, zero carbon, zero net carbon""",
    ),
    (
        "social",
        """accident, antiracism, consumer protection, customer privacy, employee relation, equal, equal pay, equity, gender equality, health, human right, justice, labor relation, labor standard, racial equity, racial awareness, racial justice, working condition""",
    ),
    (
        "goverance",
        """advocacy, antitrust, board independence, code of ethic, compensation, corporate culture, corporate governance, corruption, governance risk, justice, political lobbying, scandal, shareholder right, stability, stewardship, transparency""",
    ),
]

In [57]:
esg_embeddings = model.encode(
    [words for _, words in categories], convert_to_tensor=True
)

In [95]:
search_results = list(
    cb.document_search(
        company_identifiers=cb.tickers(index="DJIA"),
        document_name="ManagementsDiscussionAndAnalysis",
        all_history=True,
        period_type="annual",
    )
)

search_results = [
    s for s in search_results if s.document_type == "10-K"
]  # Remove amended filings

In [100]:
document_contents = [(d, d.get_contents_text()) for d in tqdm(search_results)]

  0%|          | 0/333 [00:00<?, ?it/s]

In [102]:
md_a_embeddings = model.encode(
    [contents for _, contents in document_contents],
    convert_to_tensor=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [104]:
cosine_scores = util.pytorch_cos_sim(esg_embeddings, md_a_embeddings)

In [113]:
distances = pd.DataFrame(
    cosine_scores.numpy(),
    columns=pd.MultiIndex.from_tuples([(document.ticker, document.fiscal_year) for document, _ in document_contents], names=['ticker', 'year']),
    index=[c for c, _ in categories],
)

In [114]:
distances.T

Unnamed: 0_level_0,Unnamed: 1_level_0,environmental,social,goverance
ticker,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,2020,0.077228,0.253213,0.439496
MMM,2019,0.077228,0.253213,0.439496
MMM,2018,0.064323,0.296376,0.440758
MMM,2017,0.064323,0.296376,0.440758
MMM,2016,0.064323,0.296376,0.440758
MMM,2015,0.064323,0.296376,0.440758
MMM,2014,0.064323,0.296376,0.440758
MMM,2013,0.042174,0.304803,0.407609
MMM,2012,0.042174,0.304803,0.407609
MMM,2011,0.042174,0.304803,0.407609


In [82]:
distances.loc[:, ~distances.columns.duplicated()].style.background_gradient(
    cmap="viridis"
)

Unnamed: 0,MMM,AXP,AMGN,AAPL,BA,CAT,CVX,CSCO,KO,DOW,GS,HD,HON,INTC,IBM,JNJ,JPM,MCD,MRK,MSFT,NKE,PG,CRM,TRV,UNH,VZ,V,WBA,WMT,DIS
environmental,0.077228,0.072081,0.119456,0.188023,0.049635,0.088863,0.04627,0.208268,0.118117,0.137277,0.044041,0.218505,0.156125,0.112483,0.037996,0.038603,0.110461,0.276271,0.087996,0.109887,0.086627,0.166836,0.138848,0.086009,0.106012,0.094298,0.093081,0.079708,0.051817,0.198753
social,0.253213,0.351238,0.270231,0.270651,0.250201,0.21735,0.302959,0.322666,0.293957,0.246113,0.287698,0.311418,0.188648,0.268946,0.328172,0.294968,0.214728,0.301616,0.170367,0.280293,0.273858,0.306774,0.325204,0.161181,0.328216,0.302468,0.382732,0.33734,0.217662,0.347523
goverance,0.439496,0.479153,0.435111,0.411581,0.326565,0.313225,0.421872,0.447613,0.465742,0.387791,0.412481,0.485787,0.316277,0.302344,0.360157,0.395387,0.300616,0.391904,0.254094,0.415492,0.369767,0.530607,0.450492,0.251981,0.471686,0.348908,0.514012,0.495103,0.381842,0.425977


In [85]:
distances.columns[distances.columns.duplicated()]

Index(['PG'], dtype='object')

In [93]:
[s for s in search_results if s.ticker == "PG"]

[DocumentSearchResults(fact_id=None, entity_name='PROCTER & GAMBLE Co', accession_id=271810, footnote_type=2700, SEC_URL='https://www.sec.gov/Archives/edgar/data/80424/000008042420000059/0000080424-20-000059-index.htm', sec_filing_id=1601824, blob_id='1601824_section270', fiscal_year=2020, fiscal_period='Y', calendar_year=2020, calendar_period='2', filing_date='2020-08-10T00:00:00', received_date='2020-08-10T00:00:00', document_type='10-K/A', guide_link=None, page_url='https://www.sec.gov/Archives/edgar/data/80424/000008042420000059/000008042420000059index.htm/pg-20200630.htm', entity_id=5841, id_detail=False, local_name=None, CIK='0000080424', sec_accession_number='0000080424-20-000059', network_id=None, ticker='PG', filing_type=2, description='PROCTER & GAMBLE Co (PG)', disclosure_type_name='ManagementsDiscussionAndAnalysis', period_end_date='2020-06-30T00:00:00', footnote_type_title='Additional 10-K and 10-Q Sections', content=None),
 DocumentSearchResults(fact_id=None, entity_name=