In [7]:
import re
import pandas as pd
from collections import Counter
import json

In [24]:
def load_esg_dictionaries(json_file_path="words.json"):
    """Load the ESG keyword dictionaries from a JSON file.

    Args:
        json_file_path: Path to the JSON file to parse. Defaults to
            ``"words.json"`` (resolved against the current working directory).

    Returns:
        The object decoded from the JSON file. ``analyze_esg`` iterates it
        with ``.items()`` and loops over each value, so it is expected to be
        a mapping of category name -> iterable of keywords.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 (the standard JSON encoding) rather than the
    # platform default, so non-ASCII keywords load identically on every OS.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        esg_dictionaries = json.load(file)

    return esg_dictionaries


In [25]:
def preprocess_text(text):
    """Return ``text`` lower-cased with punctuation and symbols removed.

    After lower-casing, every character that is not ``a``-``z``, a digit
    ``0``-``9`` or whitespace is deleted (not replaced by a space), so a
    hyphenated word such as ``eco-friendly`` collapses to ``ecofriendly``.
    Whitespace itself is left untouched.
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

In [32]:
def analyze_esg(text, esg_dictionaries):
    """Count ESG keyword occurrences in ``text``, per category.

    The text is normalised with ``preprocess_text`` and split on whitespace;
    each category's score is the number of tokens equal to one of that
    category's keywords. Keywords are normalised the same way as the text so
    that entries containing capitals or punctuation can still match, and are
    de-duplicated per category so a repeated entry is not counted twice.

    Limitation: matching is per single token, so a keyword that still
    contains whitespace after normalisation (a multi-word phrase) never
    matches.

    Side effect: prints the total number of tokens to stdout.

    Args:
        text: Raw document text.
        esg_dictionaries: Mapping of category name -> iterable of keyword
            strings.

    Returns:
        A tuple ``(esg_scores, esg_scores_normalized)``: two dicts keyed by
        category, holding the raw match count and that count divided by the
        total number of tokens. When the text has no tokens, every normalised
        score is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    tokens = preprocess_text(text).split()
    word_counts = Counter(tokens)

    esg_scores = {}
    for category, keywords in esg_dictionaries.items():
        # The text has been lower-cased and stripped of punctuation, so the
        # keywords must go through the same normalisation to be comparable.
        # The set removes duplicates that would otherwise be double counted.
        normalized_keywords = {
            preprocess_text(keyword).strip() for keyword in keywords
        }
        esg_scores[category] = sum(
            word_counts.get(keyword, 0) for keyword in normalized_keywords
        )

    total_words = len(tokens)
    print(f"total words: {total_words}")

    # An empty (or all-punctuation) document has no tokens; report zero
    # density rather than dividing by zero.
    esg_scores_normalized = {
        category: (count / total_words if total_words else 0.0)
        for category, count in esg_scores.items()
    }

    return esg_scores, esg_scores_normalized

In [33]:
def process_10k_file(file_path, json_file_path="words.json"):
    """Read a 10-K text file and score it against the ESG dictionaries.

    Args:
        file_path: Path to the plain-text filing to analyse.
        json_file_path: Path to the ESG keyword JSON file, forwarded to
            ``load_esg_dictionaries``. Defaults to ``"words.json"``, the same
            file that was used before this parameter existed.

    Returns:
        The ``(esg_scores, esg_scores_normalized)`` tuple produced by
        ``analyze_esg``.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Decode as UTF-8 regardless of the platform default, and substitute any
    # undecodable byte instead of aborting the whole analysis on it.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        text = file.read()

    # Load ESG dictionaries
    esg_dictionaries = load_esg_dictionaries(json_file_path)

    # Analyze the text
    esg_scores, esg_scores_normalized = analyze_esg(text, esg_dictionaries)

    return esg_scores, esg_scores_normalized

In [34]:
# Example usage: score a single 10-K filing and show both score variants.
# process_10k_file also prints the document's total word count as a side
# effect (via analyze_esg).
file_path = "./10K-TXT/10k.txt"  # Path to your 10-K file (relative to the working directory)
esg_scores, esg_scores_normalized = process_10k_file(file_path)

# Raw scores are keyword hit counts per category; normalized scores are those
# counts divided by the total number of words in the document.
print("Raw ESG Scores:", esg_scores)
print("Normalized ESG Scores:", esg_scores_normalized)

total words: 40499
Raw ESG Scores: {'environmental': 259, 'social': 505, 'governance': 491}
Normalized ESG Scores: {'environmental': 0.006395219635052718, 'social': 0.012469443689967654, 'governance': 0.012123756142126966}
