In [4]:
# Import requests to retrive Web Urls example HTML. TXT 
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification, pipeline

import json

In [33]:
def converttotext(url):
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    r = requests.get(url, headers=headers)

    raw_10k = r.text

    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    document = {}
    
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]

    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    matches = regex.finditer(document['10-K'])

    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    pos_dat.set_index('item', inplace=True)

    item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]

    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

    item_1a_text = item_1a_content.get_text()

    def clean_text(text):
        text = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', text)
        text = re.sub(r'(?<=[.!?])(?=\S)', r' ', text)
        return text.strip()

    item_1a_text_cleaned = clean_text(item_1a_text)

    item_1a_text_cleaned = re.sub(r'(?<=[.!?])(?=\S)', r' ', item_1a_text_cleaned)


    sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<!\.\.\.)(?<=\.|\?)\s')

    sentences = sentence_splitter.split(item_1a_text_cleaned)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    return sentences


In [34]:
sentences = converttotext('https://www.sec.gov/Archives/edgar/data/0000789019/000156459020034944/0001564590-20-034944.txt')
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(sentences)

environment_score = [entry['score'] for entry in results if entry['label'] == 'Environmental']
social_score = [entry['score'] for entry in results if entry['label'] == 'Social']
governance_score = [entry['score'] for entry in results if entry['label'] == 'Governance']

environment_score_avg = sum(environment_score) / len(environment_score) if environment_score else 0
social_score_avg = sum(social_score) / len(social_score) if social_score else 0
governance_score_avg = sum(governance_score) / len(governance_score) if governance_score else 0

print(f'Average Environmental Score: {environment_score_avg}')
print(f'Average Social Score: {social_score_avg}')
print(f'Average Governance Score: {governance_score_avg}')

Average Environmental Score: 0.9692671895027161
Average Social Score: 0.9106579795479774
Average Governance Score: 0.7698687016963959
