In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the report paths used by the cells
# below all live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
!pip install pdfplumber
!pip install Pillow

In [3]:
import os

In [4]:
import pdfplumber
import re

from PIL import Image
import io
from google.colab import drive
from IPython.display import display

In [None]:
# NOTE(review): `directory` is not referenced anywhere else in the visible
# part of this notebook — confirm it is still needed.
directory = '/content/drive/MyDrive/SustainabilityReports/Firm_ID/1/test'

In [5]:
import os
import re
import pdfplumber
import logging


Code for removing tables and keeping only the text



In [None]:
pip install tabula-py


In [21]:
# Destination folder for the per-PDF .txt outputs; it is passed to
# process_pdfs() in the final cell, so this cell must run before that one.
output_directory = '/content/drive/MyDrive/SustainabilityReports/Firm_ID/text/81_90'

In [None]:
# NOTE(review): convert_pdfs_in_directory is not defined in the visible part
# of this notebook, and input_directory is only assigned in a later cell —
# presumably a leftover from an earlier version; confirm or remove.
convert_pdfs_in_directory(input_directory, output_directory)

In [10]:
import os
import logging
import re
import pandas as pd
import pdfplumber
import tabula

In [None]:
# Root logger: INFO and above, each record stamped with time and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Only let ERROR-and-above records through the "org.apache.pdfbox" logger.
# NOTE(review): presumably intended to quieten PDFBox messages emitted while
# tabula runs — confirm this has the intended effect.
logging.getLogger("org.apache.pdfbox").setLevel(logging.ERROR)

# Folder scanned for PDFs, and the CSV file handed to process_pdfs() for the
# per-PDF result rows.
input_directory = '/content/drive/MyDrive/SustainabilityReports/Firm_ID/81'
csv_file_path = os.path.join(
    '/content/drive/MyDrive/SustainabilityReports/Firm_ID/Results/DB',
    'esg_report.csv',
)

def remove_table_text(text, tables):
    """
    Remove every occurrence of table cell text from the extracted PDF text.

    String cells are collected from all tables, de-duplicated, and removed
    longest-first, so that a short cell which is a substring of a longer one
    (e.g. "Total" vs. "Total emissions") cannot break the longer match and
    leave fragments behind. Empty and whitespace-only cells are ignored:
    removing a cell such as " " would strip every space from the text and
    make later sentence splitting impossible.

    Only cell values are considered; column labels are left untouched, and
    non-string cells (numbers, NaN) are skipped.

    Args:
        text (str): The extracted text from the PDF.
        tables (list): A list of DataFrames containing table content.

    Returns:
        str: The cleaned text with table content removed.
    """
    cells = set()
    for table in tables:
        # name=None yields plain tuples, which is all we need here.
        for row in table.itertuples(index=False, name=None):
            cells.update(
                cell for cell in row
                if isinstance(cell, str) and cell.strip()
            )

    # Longest first; ties are broken alphabetically so the result does not
    # depend on set iteration order.
    for cell in sorted(cells, key=lambda value: (-len(value), value)):
        text = text.replace(cell, '')
    return text

# Splits on a whitespace character that follows '.', '?' or '!', except after
# abbreviation-like patterns such as "e.g." or "Dr.".
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')

# Keywords whose presence marks a sentence as ESG-related.
_ESG_KEYWORDS = (
    "climate", "mitigation", "decarbonisation", "carbon", "ghg", "emission",
    "energy", "fuels", "fossil", "nuclear", "renewable", "scope 1", "scope 2",
    "scope 3", "pollution", "air", "water", "soil", "hazard", "concern", "recycle",
    "marine", "waste", "wastage", "hazardous", "danger", "dangerous", "radioactive",
    "human rights", "policy", "employee", "employees", "worker", "workers", "staff",
    "workplace", "accident", "accidents", "eliminate", "discriminate", "discrimination",
    "grievance", "grievances", "complaint", "complaints",
    "mitigate", "workforce", "board members", "male", "female", "management",
    "percent", "percentage", "number", "fatal", "fatalities", "death", "injury", "ill",
    "illness", "health", "life", "fine", "penalty", "penalties", "fines", "customer",
    "customers", "end users", "consumer", "consumers", "public", "society",
    "whistleblowing", "whistleblower", "whistle", "animal", "welfare", "training",
    "workshops", "business ethics", "business conduct", "disclosure", "corruption",
    "bribery", "favor", "illegal", "violate", "violation", "law", "laws", "anti-corruption",
    "anti-bribery", "anticorruption", "antibribery", "politics", "political", "finance",
    "contribution", "contributions", "payment", "wages",
    "salary", "esg", "environment", "social", "governance", "mental-health", "holiday",
    "bonus",
)

# Whole-word, case-insensitive match for any ESG keyword; compiled once
# instead of on every call.
_ESG_KEYWORD_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in _ESG_KEYWORDS) + r')\b',
    re.IGNORECASE,
)


def _load_text_without_tables(pdf_path):
    """Return the PDF's text as one string with table cell text removed; raises on failure."""
    tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text.replace('\n', ' ') + " ")
            else:
                # Text-less pages are skipped rather than replaced by a
                # placeholder, which would be counted as a sentence.
                logging.debug(f"Page {page.page_number} of {pdf_path}: no text found.")

    return remove_table_text("".join(chunks), tables)


def _split_sentences(text):
    """Split text into stripped, non-empty sentences."""
    stripped = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [sentence for sentence in stripped if sentence]


def _sentences_from_pdf(pdf_path):
    """Return the table-free sentences of a PDF, or None if it cannot be processed."""
    try:
        text = _load_text_without_tables(pdf_path)
    except Exception as e:
        # Best-effort batch processing: log the failure and let the caller
        # fall back to an empty result instead of aborting the whole run.
        logging.error(f"An error occurred while processing {pdf_path}: {e}")
        return None
    return _split_sentences(text)


def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file, removes table content, and returns the
    sentences containing ESG-related keywords.

    Pages without extractable text contribute nothing to the result.

    Args:
        pdf_path (str): The file path to the PDF.

    Returns:
        list: A list of sentences containing ESG-related keywords. Empty if
            the PDF could not be processed (the error is logged).
    """
    sentences = _sentences_from_pdf(pdf_path)
    if sentences is None:
        return []
    return [sentence for sentence in sentences if _ESG_KEYWORD_PATTERN.search(sentence)]


def count_total_sentences(pdf_path):
    """
    Counts the total number of sentences in the PDF after removing table content.

    Uses the same loading and sentence-splitting steps as
    extract_text_from_pdf, so the two results are directly comparable.

    Args:
        pdf_path (str): The file path to the PDF.

    Returns:
        int: The total number of sentences in the PDF, or 0 if the PDF could
            not be processed (the error is logged).
    """
    sentences = _sentences_from_pdf(pdf_path)
    if sentences is None:
        return 0
    return len(sentences)

def process_pdfs(input_directory, output_directory, csv_file_path):
    """
    Process all PDF files in the input directory, extracting ESG-related sentences and counting total sentences.
    Save the results in a CSV file and write one text file per PDF.

    Files are handled in sorted name order. Only names of the form
    ``<company>_<4-digit year>.pdf`` (extension matched case-insensitively)
    are processed; other PDFs are skipped with a warning. The CSV is
    rewritten after every PDF so that partial results survive an interrupted
    run.

    Args:
        input_directory (str): The directory containing PDF files to process.
        output_directory (str): The directory to save the results.
        csv_file_path (str): The file path to the CSV file for storing the results.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Create the CSV (and its parent folder) with just the header on first use.
    if not os.path.exists(csv_file_path):
        csv_directory = os.path.dirname(csv_file_path)
        if csv_directory:
            os.makedirs(csv_directory, exist_ok=True)
        header = pd.DataFrame(columns=["Company", "Year", "Total Sentences", "ESG Sentences"])
        header.to_csv(csv_file_path, index=False)

    filename_pattern = re.compile(r'^(.*?)_(\d{4})\.pdf$', re.IGNORECASE)

    # Sorted so that the row order in the CSV is reproducible.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.lower().endswith('.pdf'):
            continue

        # Extract firm name and year from filename
        match = filename_pattern.match(filename)
        if not match:
            logging.warning(f"Filename {filename} does not match the expected pattern.")
            continue
        company_name, year = match.groups()

        pdf_path = os.path.join(input_directory, filename)
        esg_sentences = extract_text_from_pdf(pdf_path)
        total_sentences = count_total_sentences(pdf_path)

        # Append this PDF's row to the CSV on disk.
        df = pd.read_csv(csv_file_path)
        new_row = {
            "Company": company_name,
            "Year": year,
            "Total Sentences": total_sentences,
            "ESG Sentences": len(esg_sentences),
        }
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
        df.to_csv(csv_file_path, index=False)

        # Per-PDF text file; explicit UTF-8 because PDF text routinely
        # contains characters outside the platform default encoding.
        output_file_path = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}.txt")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(f"Total sentences in the PDF after removing tables: {total_sentences}\n")
            output_file.write("ESG-related sentences:\n")
            output_file.writelines(f"{sentence}\n" for sentence in esg_sentences)

        logging.info(f"Processed {filename}")


# Run the pipeline. output_directory is assigned in an earlier cell, so that
# cell must have been executed before this one.
process_pdfs(input_directory, output_directory, csv_file_path)