In [133]:
# Import requests to retrive Web Urls example HTML. TXT 
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification, pipeline

import json

import nltk

In [134]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ajaym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [135]:
def clean_text(text):
    # Ensure consistent spacing after titles like "Item 1A."
    text = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', text)
    # Insert missing spaces between concatenated words like "FactorsThe"
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    # Clean up spacing after punctuation
    text = re.sub(r'(?<=[.!?])(?=\S)', r' ', text)
    return text.strip()

In [136]:
def split_sentences(text):
    # Initial cleaning
    text = clean_text(text)

    # Refine the regex for splitting
    sentence_splitter = re.compile(
        r'(?<!\w\.\w\.)(?<![A-Z][a-z]\.)(?<!\.\.\.)(?<=\.|\?|!)\s'
    )
    sentences = sentence_splitter.split(text)
    
    # Post-process to handle cases like standalone titles and short fragments
    cleaned_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence and sentence != ".":
            cleaned_sentences.append(sentence)
    
    return cleaned_sentences

In [137]:
def split_sentences_with_nltk(text):
    # Initial cleaning
    text = clean_text(text)

    # Use nltk's sentence tokenizer
    sentences = nltk.sent_tokenize(text)

    # Post-process to remove redundant or empty sentences
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

In [138]:
item_1a_text = """Item 1A. Risk FactorsThe Company’s business, reputation, results of operations, 
and financial condition can be affected by various factors. For instance, economic downturns 
and market disruptions can significantly impact performance. Risks Related to COVID-19The pandemic 
has had adverse effects on global markets."""

sentences = split_sentences_with_nltk(item_1a_text)

# Print the results
for idx, sentence in enumerate(sentences, start=1):
    print(f"Sentence {idx}: {sentence}")

Sentence 1: Item 1A.
Sentence 2: Risk Factors The Company’s business, reputation, results of operations, 
and financial condition can be affected by various factors.
Sentence 3: For instance, economic downturns 
and market disruptions can significantly impact performance.
Sentence 4: Risks Related to COVID-19The pandemic 
has had adverse effects on global markets.


In [140]:
def converttotext(url):
    headers = {'User-Agent':'Sample Company Name AdminContact@<sample company domain>.com','Accept-Encoding':'gzip, deflate','Host':'www.sec.gov'}

    r = requests.get(url, headers=headers)

    raw_10k = r.text

    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    document = {}
    
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]

    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    matches = regex.finditer(document['10-K'])

    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)

    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    pos_dat.set_index('item', inplace=True)

    item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]

    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

    item_1a_text = item_1a_content.get_text()

    # def clean_text(text):
    #     text = re.sub(r'(Item\s*\d+[A-Za-z]*)\.', r'\1. ', text)
    #     text = re.sub(r'(?<=[.!?])(?=\S)', r' ', text)
    #     return text.strip()

    # item_1a_text_cleaned = clean_text(item_1a_text)

    # item_1a_text_cleaned = re.sub(r'(?<=[.!?])(?=\S)', r' ', item_1a_text_cleaned)


    # sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<!\.\.\.)(?<=\.|\?)\s')

    # sentences = sentence_splitter.split(item_1a_text_cleaned)
    # sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    sentences = split_sentences_with_nltk(item_1a_text)
    
    return sentences


In [76]:
# sentences = converttotext('https://www.sec.gov/Archives/edgar/data/0001018724/000101872424000008/0001018724-24-000008.txt')
# print(len(sentences))
# finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
# tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
# nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
# results = nlp(sentences)

# environment_score = [entry['score'] for entry in results if entry['label'] == 'Environmental']
# social_score = [entry['score'] for entry in results if entry['label'] == 'Social']
# governance_score = [entry['score'] for entry in results if entry['label'] == 'Governance']

# environment_score_avg = sum(environment_score) / len(environment_score) if environment_score else 0
# social_score_avg = sum(social_score) / len(social_score) if social_score else 0
# governance_score_avg = sum(governance_score) / len(governance_score) if governance_score else 0

# print(f'Average Environmental Score: {environment_score_avg}')
# print(f'Average Social Score: {social_score_avg}')
# print(f'Average Governance Score: {governance_score_avg}')

In [141]:
def calculate_average_esg(sentences, nlp):
    results = nlp(sentences)

    # Extract scores for each category
    environment_score = [entry['score'] for entry in results if entry['label'] == 'Environmental']
    social_score = [entry['score'] for entry in results if entry['label'] == 'Social']
    governance_score = [entry['score'] for entry in results if entry['label'] == 'Governance']

    # Calculate averages or set to 0 if no scores are found
    environment_score_avg = sum(environment_score) / len(environment_score) if environment_score else 0
    social_score_avg = sum(social_score) / len(social_score) if social_score else 0
    governance_score_avg = sum(governance_score) / len(governance_score) if governance_score else 0

    return {
        'environment_score_avg': environment_score_avg,
        'social_score_avg': social_score_avg,
        'governance_score_avg': governance_score_avg
    }

In [142]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

target_year = 2021

with open(f"../Edgar/10K-URL/All_10K_Filings_{target_year}.json", "r") as read_file:
    companies = json.load(read_file)

valid_companies = []

for company in companies:
    try:
        print(f"Analysing: {company['company']}")
        sentences = converttotext(company['url'])
        print(sentences)
        print(f"Number of sentences: {len(sentences)}")
        scores = calculate_average_esg(sentences, nlp)
        
        company['environment_score_avg'] = scores['environment_score_avg']
        company['social_score_avg'] = scores['social_score_avg']
        company['governance_score_avg'] = scores['governance_score_avg']
        
        valid_companies.append(company)

    except Exception as e:
        print(f"Error processing {company['company']}: {e}")

final_companies = [company['company'] for company in valid_companies]
print(f"Companies processed: {final_companies}")

if valid_companies:
    environment_avg = sum([company['environment_score_avg'] for company in valid_companies]) / len(valid_companies)
    social_avg = sum([company['social_score_avg'] for company in valid_companies]) / len(valid_companies)
    governance_avg = sum([company['governance_score_avg'] for company in valid_companies]) / len(valid_companies)

    print(f'Average Environmental Score: {environment_avg}')
    print(f'Average Social Score: {social_avg}')
    print(f'Average Governance Score: {governance_avg}')
else:
    print("No valid companies to calculate averages.")

Analysing: AAPL
['>Item 1A.', 'Risk Factors The Company’s business, reputation, results of operations and financial condition, as well as the price of the Company’s stock, can be affected by a number of factors, whether currently known or unknown, including those described below.', 'When any one or more of these risks materialize from time to time, the Company’s business, reputation, results of operations and financial condition, as well as the price of the Company’s stock, can be materially and adversely affected.', 'Because of the following factors, as well as other factors affecting the Company’s results of operations and financial condition, past financial performance should not be considered to be a reliable indicator of future performance, and investors should not use historical trends to anticipate results or trends in future periods.', 'This discussion of risk factors contains forward-looking statements.', 'This section should be read in conjunction with Part II, Item 7, “Manag