In [27]:
import re
from collections import Counter
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [21]:
def converttotext(url):
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    r = requests.get(url, headers=headers)

    raw_10k = r.text

    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    document = {}
    
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]

    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    matches = regex.finditer(document['10-K'])

    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    pos_dat.set_index('item', inplace=True)

    item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
    item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
    item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]

    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
    item_7_content = BeautifulSoup(item_7_raw, 'lxml')
    item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')

    item_1a_text = item_1a_content.get_text()
    item_7_text = item_7_content.get_text()
    item_7a_text = item_7a_content.get_text()

    item_1a_text_cleaned = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', item_1a_text)
    item_7_text_cleaned = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', item_7_text)
    item_7a_text_cleaned = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', item_7a_text)

    item_1a_text_cleaned = re.sub(r'(?<=[.!?])(?=\S)', r' ', item_1a_text_cleaned)
    item_7_text_cleaned = re.sub(r'(?<=[.!?])(?=\S)', r' ', item_7_text_cleaned)
    item_7a_text_cleaned = re.sub(r'(?<=[.!?])(?=\S)', r' ', item_7a_text_cleaned)

    sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<!\.\.\.)(?<=\.|\?)\s')

    sentences_1a = sentence_splitter.split(item_1a_text_cleaned)
    sentences_7 = sentence_splitter.split(item_7_text_cleaned)
    sentences_7a = sentence_splitter.split(item_7a_text_cleaned)

    all_sentences = sentences_1a + sentences_7 + sentences_7a
    all_sentences = [sentence.strip() for sentence in all_sentences if sentence.strip()]

    return all_sentences

In [22]:
def load_esg_dictionaries(json_file_path="words.json"):
    with open(json_file_path, 'r') as file:
        esg_dictionaries = json.load(file)
    return esg_dictionaries

In [None]:
def preprocess_text(text):
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [30]:
def analyze_esg(text, esg_dictionaries):
    text = preprocess_text(text)
    word_counts = Counter(text.split())
    
    # Initialize scores for Environmental, Social, and Governance
    environmental_score = 0
    social_score = 0
    governance_score = 0
    
    # Calculate the scores based on the keyword matches
    for keyword in esg_dictionaries['Environmental']:
        environmental_score += word_counts.get(keyword, 0)
    for keyword in esg_dictionaries['Social']:
        social_score += word_counts.get(keyword, 0)
    for keyword in esg_dictionaries['Governance']:
        governance_score += word_counts.get(keyword, 0)
    
    # Normalize the scores by the total word count
    total_words = sum(word_counts.values())
    if total_words > 0:
        environmental_score /= total_words
        social_score /= total_words
        governance_score /= total_words
    
    # Return the individual ESG scores (normalized)
    return environmental_score, social_score, governance_score

In [33]:
esg_dictionaries = load_esg_dictionaries()
text = converttotext("https://www.sec.gov/Archives/edgar/data/1018724/000101872420000004/0001018724-20-000004.txt")
environmental, social, governance = analyze_esg(" ".join(text), esg_dictionaries)

print(f"Environmental: {environmental:.4f}")
print(f"Social: {social:.4f}")
print(f"Governance: {governance:.4f}")

Environmental: 0.0106
Social: 0.0144
Governance: 0.0120
