In [1]:
from google.colab import drive
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import os

In [None]:
# Mount Google Drive at /content/drive. The input and output directories
# defined later in this notebook live under /content/drive/MyDrive, so this
# cell has to run before the processing cell.
drive.mount('/content/drive')

In [None]:
# Identifier of the pretrained checkpoint passed to both from_pretrained calls.
model_name = "nbroad/ESG-BERT"
# Tokenizer and sequence-classification model are created once at module
# level; get_esg_score() reads both as globals on every call.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def load_sentences(file_path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Blank
        lines are kept and appear as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the handle directly rather than materialising readlines()
        # and then walking the list a second time.
        return [line.strip() for line in file]

def get_esg_score(sentence):
    """Return the model's softmax class probabilities for ``sentence``.

    Uses the module-level ``tokenizer`` and ``model``. The input is tokenized
    with truncation and padding enabled and ``max_length=512``, and the
    forward pass runs with gradient tracking disabled.

    Args:
        sentence: Text to score.

    Returns:
        A NumPy array holding the softmax of the model's logits, taken over
        the last axis.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    # NOTE(review): .numpy() presumably relies on the logits being on the CPU.
    return torch.nn.functional.softmax(logits, dim=-1).numpy()

def filter_sentences(sentences, threshold=0.80):
    """Keep the sentences whose highest class probability reaches ``threshold``.

    Each sentence is scored with ``get_esg_score``; the largest value in the
    returned probabilities is compared against ``threshold`` (inclusive).
    Any exception raised while handling a sentence is printed and that
    sentence is skipped, so one bad input does not stop the run.

    Args:
        sentences: Iterable of sentences to score.
        threshold: Minimum top probability required to keep a sentence.

    Returns:
        A list of the kept sentences, in their original order.
    """
    kept = []
    for sentence in sentences:
        try:
            # The comparison stays inside the try so that failures in it are
            # reported and skipped just like scoring failures.
            if np.max(get_esg_score(sentence)) >= threshold:
                kept.append(sentence)
        except Exception as e:
            print(f"Error processing sentence: {sentence}\nError: {e}")
    return kept

def process_directory(input_dir, output_dir, threshold=0.80):
    """Filter every ``.txt`` file in ``input_dir`` and write the results to ``output_dir``.

    For each regular file directly inside ``input_dir`` whose name ends in
    ``.txt``, the sentences are loaded with ``load_sentences``, filtered with
    ``filter_sentences`` and written, one per line, to a file of the same
    name in ``output_dir``. Subdirectories are not descended into.

    Args:
        input_dir: Directory containing the source ``.txt`` files.
        output_dir: Directory that receives the filtered files; created
            (including missing parents) if it does not exist.
        threshold: Passed through to ``filter_sentences``.

    Returns:
        None. A confirmation line is printed for each file written.

    Raises:
        OSError: If ``input_dir`` cannot be listed or a file cannot be
            read or written.
    """
    # exist_ok avoids the check-then-create race and makes re-runs safe.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so files are
    # processed and logged reproducibly.
    for filename in sorted(os.listdir(input_dir)):
        input_file_path = os.path.join(input_dir, filename)
        # Skip other file types, and directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(input_file_path):
            continue
        output_file_path = os.path.join(output_dir, filename)

        sentences = load_sentences(input_file_path)
        filtered_sentences = filter_sentences(sentences, threshold)

        # Write UTF-8 explicitly so non-ASCII text survives regardless of
        # the platform's default encoding.
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.writelines(sentence + '\n' for sentence in filtered_sentences)

        print(f"Filtered sentences saved to {output_file_path}")


In [None]:
# Define input and output directories.
# NOTE: output_dir is a subdirectory of input_dir. process_directory only
# lists the top level of input_dir and only handles names ending in ".txt",
# so the "BERT" folder is not itself picked up as an input.
input_dir = '/content/drive/MyDrive/SustainabilityReports/Firm_ID/text/81_90'  
output_dir = '/content/drive/MyDrive/SustainabilityReports/Firm_ID/text/81_90/BERT'  

# Process all text files in the directory, using process_directory's default
# threshold of 0.80.
process_directory(input_dir, output_dir)