# In [1]:  (Jupyter notebook cell 1)
import glob
import re
import pdfplumber
import pandas as pd
import os
from path import Path

# Lookup table of disclosure codes to search for in the report PDFs.
#
# Key   -> the code that is matched (case-insensitively, as a whole word) in the
#          text of every PDF page, e.g. 'G4-EN15'.
# Value -> the human-readable description of that code; it is written verbatim
#          to the "Group" column of the output workbook.
#
# NOTE(review): the codes and descriptions appear to follow the GRI G4
# Sustainability Reporting Guidelines -- confirm against the official index
# before relying on the wording.
keywords_groups = {
    'G4-1': 'Statement from the most senior decision-maker of the organization',
    'G4-2': 'Description of key impacts, risks, and opportunities',
    'G4-3': 'Name of the organization',
    'G4-4': 'Primary brands, products, and services',
    'G4-5': 'Location of the organization’s headquarters',
    'G4-6': 'Number of countries where the organization operates',
    'G4-7': 'Nature of ownership and legal form',
    'G4-8': 'The markets served',
    'G4-9': 'The scale of the organization',
    'G4-10': 'Total number of employees by employment contract and gender',
    'G4-11': 'Percentage of total employees covered by collective bargaining agreements',
    'G4-12': 'The organization’s supply chain',
    'G4-13': 'Significant changes during the reporting period regarding the organization’s size, structure, ownership, or its supply chain',
    'G4-14': 'The precautionary approach or principle addressed by the organization',
    'G4-15': 'List externally developed economic, environmental and social charters, principles, or other initiatives',
    'G4-16': 'List memberships of associations',
    'G4-17': 'List all entities included in the organization’s consolidated financial statements',
    'G4-18': 'The process for defining the report content and the Aspect Boundaries',
    'G4-19': 'The material Aspects identified in the process for defining report content',
    'G4-20': 'Report the Aspect Boundary within the organization',
    'G4-21': 'Report the Aspect Boundary outside the organization',
    'G4-22': 'The effect of any restatements of information provided in previous reports, and the reasons for such restatements',
    'G4-23': 'Significant changes from previous reporting periods in the Scope and Aspect Boundaries',
    'G4-24': 'List of stakeholder groups engaged by the organization',
    'G4-25': 'The basis for identification and selection of stakeholders with whom to engage',
    'G4-26': 'The organization’s approach to stakeholder engagement',
    'G4-27': 'Key topics and concerns that have been raised through stakeholder engagement',
    'G4-28': 'Reporting period',
    'G4-29': 'Date of most recent previous report',
    'G4-30': 'Reporting cycle',
    'G4-31': 'The contact point for questions regarding the report or its contents',
    'G4-32': 'The GRI Content Index for the chosen option',
    # Typo fixed: 'eeking' -> 'seeking'.
    'G4-33': 'The organization’s policy and current practice with regard to seeking external assurance for the report',
    'G4-34': 'The governance structure of the organization',
    'G4-35': 'The process for delegating authority for economic, environmental and social topics',
    'G4-36': 'Report whether the organization has appointed an executive level position or positions with responsibility for economic, environmental and social topics',
    'G4-37': 'Processes for consultation between stakeholders and the highest governance body on economic, environmental and social topics',
    'G4-38': 'The composition of the highest governance body and its committees',
    'G4-39': 'Report whether the Chair of the highest governance body is also an executive officer',
    'G4-40': 'The nomination and selection processes for the highest governance body and its committees',
    'G4-41': 'Processes for the highest governance body to ensure conflicts of interest are avoided and managed',
    'G4-42': 'The highest governance body’s and senior executives’ roles in the development, approval, and updating of the organization’s purpose',
    'G4-43': 'The measures taken to develop and enhance the highest governance body’s collective knowledge of economic, environmental and social topics',
    'G4-44': 'The processes for evaluation of the highest governance body’s performance with respect to governance of economic, environmental and social topics',
    'G4-45': 'The highest governance body’s role in the identification and management of economic, environmental and social impacts, risks, and opportunities',
    'G4-46': 'The highest governance body’s role in reviewing the effectiveness of the organization’s risk management processes for economic, environmental and social topics',
    # Typo fixed: 'socia' -> 'social'.
    'G4-47': 'The frequency of the highest governance body’s review of economic, environmental and social impacts, risks, and opportunities',
    'G4-48': 'The highest committee or position that formally reviews and approves the organization’s sustainability report and ensures that all material Aspects are covered',
    'G4-49': 'The process for communicating critical concerns to the highest governance body',
    'G4-50': 'The nature and total number of critical concerns that were communicated to the highest governance body',
    'G4-51': 'The remuneration policies for the highest governance body and senior executives',
    'G4-52': 'The process for determining remuneration',
    'G4-53': 'Stakeholders’ views are sought and taken into account regarding remuneration',
    'G4-54': 'The ratio of the annual total compensation for the organization’s highest-paid individual',
    'G4-55': 'The ratio of percentage increase in annual total compensation for the organization’s highest-paid individual',
    'G4-56': 'The organization’s values, principles, standards and norms of behavior',
    'G4-57': 'The internal and external mechanisms for seeking advice on ethical and lawful behavior, and matters related to organizational integrity',
    'G4-58': 'The internal and external mechanisms for reporting concerns about unethical or unlawful behavior, and matters related to organizational integrity',
    'G4-DMA': 'Report how the organization manages the material Aspect or its impacts',
    'G4-EC1': 'Direct economic value generated and distributed',
    'G4-EC2': 'Financial implications and other risks and opportunities for the organization’s activities due to climate change',
    'G4-EC3': 'Coverage of the organization’s defined benefit plan obligations',
    'G4-EC4': 'Financial assistance received from government',
    'G4-EC5': 'Ratios of standard entry level wage by gender compared to local minimum wage at significant locations of operation',
    'G4-EC6': 'Proportion of senior management hired from the local community at significant locations of operation',
    'G4-EC7': 'Development and impact of infrastructure investments and services supported',
    'G4-EC8': 'Significant indirect economic impacts, including the extent of impacts',
    'G4-EC9': 'Proportion of spending on local suppliers at significant locations of operation',
    'G4-EN1': 'Materials used by weight or volume',
    'G4-EN2': 'Percentage of materials used that are recycled input materials',
    'G4-EN3': 'Energy consumption within the organization',
    'G4-EN4': 'Energy consumption outside of the organization',
    'G4-EN5': 'Energy intensity',
    'G4-EN6': 'Reduction of energy consumption',
    'G4-EN7': 'Reductions in energy requirements of products and services',
    'G4-EN8': 'Total water withdrawal by source',
    'G4-EN9': 'Water sources significantly affected by withdrawal of water',
    'G4-EN10': 'Percentage and total volume of water recycled and reused',
    'G4-EN11': 'Operational sites owned, leased, managed in, or adjacent to, protected areas and areas of high biodiversity value outside protected areas',
    'G4-EN12': 'Description of significant impacts of activities, products, and services on biodiversity in protected areas and areas of high biodiversity value outside protected areas',
    'G4-EN13': 'Habitats protected or restored',
    'G4-EN14': 'Total number of iucn red list species and national conservation list species with habitats in areas affected by operations, by level of extinction risk',
    'G4-EN15': 'Direct greenhouse gas (GHG) emissions (scope 1)',
    'G4-EN16': 'Energy indirect greenhouse gas (GHG) emissions (scope 2)',
    'G4-EN17': 'Other indirect greenhouse gas (GHG) emissions (scope 3)',
    'G4-EN18': 'Greenhouse gas (GHG) emissions intensity',
    'G4-EN19': 'Reduction of greenhouse gas (GHG) emissions',
    'G4-EN20': 'Emissions of ozone-depleting substances (ODS)',
    'G4-EN21': 'NOx, SOx, and other significant air emissions',
    'G4-EN22': 'Total water discharge by quality and destination',
    'G4-EN23': 'Total weight of waste by type and disposal method',
    'G4-EN24': 'Total number and volume of significant spills',
    # Stray footnote digit removed: 'convention2' -> 'convention'.
    'G4-EN25': 'Weight of transported, imported, exported, or treated waste deemed hazardous under the terms of the basel convention annex i, ii, iii, and viii, and percentage of transported waste shipped internationally',
    'G4-EN26': 'Identity, size, protected status, and biodiversity value of water bodies and related habitats significantly affected by the organization’s discharges of water and runoff',
    'G4-EN27': 'Extent of impact mitigation of environmental impacts of products and services',
    'G4-EN28': 'Percentage of products sold and their packaging materials that are reclaimed by category',
    'G4-EN29': 'Monetary value of significant fines and total number of non-monetary sanctions for non-compliance with environmental laws and regulations',
    'G4-EN30': 'Significant environmental impacts of transporting products and other goods and materials for the organization’s operations, and transporting members of the workforce',
    'G4-EN31': 'Total environmental protection expenditures and investments by type',
    'G4-EN32': 'Percentage of new suppliers that were screened using environmental criteria',
    'G4-EN33': 'Significant actual and potential negative environmental impacts in the supply chain and actions taken',
    'G4-EN34': 'Number of grievances about environmental impacts filed, addressed, and resolved through formal grievance mechanisms',
    'G4-LA1': 'Total number and rates of new employee hires and employee turnover by age group, gender and region',
    # Typo fixed: 'parttime' -> 'part-time'.
    'G4-LA2': 'Benefits provided to full-time employees that are not provided to temporary or part-time employees, by significant locations of operation',
    'G4-LA3': 'Return to work and retention rates after parental leave, by gender',
    'G4-LA4': 'Minimum notice periods regarding operational changes, including whether these are specified in collective agreements',
    'G4-LA5': 'Percentage of total workforce represented in formal joint management–worker health and safety committees that help monitor and advise on occupational health and safety programs',
    'G4-LA6': 'Type of injury and rates of injury, occupational diseases, lost days, and absenteeism, and total number of work-related fatalities, by region and by gender',
    'G4-LA7': 'Workers with high incidence or high risk of diseases related to their occupation',
    'G4-LA8': 'Health and safety topics covered in formal agreements with trade unions',
    'G4-LA9': 'Average hours of training per year per employee by gender, and by employee category',
    'G4-LA10': 'Programs for skills management and lifelong learning that support the continued employability of employees and assist them in managing career endings',
    'G4-LA11': 'Percentage of employees receiving regular performance and career development reviews, by gender and by employee category',
    'G4-LA12': 'Composition of governance bodies and breakdown of employees per employee category according to gender, age group, minority group membership, and other indicators of diversity',
    'G4-LA13': 'Ratio of basic salary and remuneration of women to men by employee category, by significant locations of operation',
    'G4-LA14': 'Percentage of new suppliers that were screened using labor practices criteria',
    'G4-LA15': 'Significant actual and potential negative impacts for labor practices in the supply chain and actions taken',
    'G4-LA16': 'Number of grievances about labor practices filed, addressed, and resolved through formal grievance mechanisms',
    'G4-HR1': 'Total number and percentage of significant investment agreements and contracts that include human rights clauses or that underwent human rights screening',
    'G4-HR2': 'Total hours of employee training on human rights policies or procedures concerning aspects of human rights that are relevant to operations, including the percentage of employees trained',
    'G4-HR3': 'Total number of incidents of discrimination and corrective actions taken',
    'G4-HR4': 'Operations and suppliers identified in which the right to exercise freedom of association and collective bargaining may be violated or at significant risk, and measures taken to support these rights',
    'G4-HR5': 'Operations and suppliers identified as having significant risk for incidents of child labor, and measures taken to contribute to the effective abolition of child labor',
    'G4-HR6': 'Operations and suppliers identified as having significant risk for incidents of forced or compulsory labor, and measures to contribute to the elimination of all forms of forced or compulsory labor',
    'G4-HR7': 'Percentage of security personnel trained in the organization’s human rights policies or procedures that are relevant to operations',
    'G4-HR8': 'Total number of incidents of violations involving rights of indigenous peoples and actions taken',
    'G4-HR9': 'Total number and percentage of operations that have been subject to human rights reviews or impact assessments',
    'G4-HR10': 'Percentage of new suppliers that were screened using human rights criteria',
    'G4-HR11': 'Significant actual and potential negative human rights impacts in the supply chain and actions taken',
    'G4-HR12': 'Number of grievances about human rights impacts filed, addressed, and resolved through formal grievance mechanisms',
    'G4-SO1': 'Percentage of operations with implemented local community engagement, impact assessments, and development programs',
    'G4-SO2': 'Operations with significant actual and potential negative impacts on local communities',
    'G4-SO3': 'Total number and percentage of operations assessed for risks related to corruption and the significant risks identified',
    'G4-SO4': 'Communication and training on anti-corruption policies and procedures',
    'G4-SO5': 'Confirmed incidents of corruption and actions taken',
    'G4-SO6': 'Total value of political contributions by country and recipient/beneficiary',
    'G4-SO7': 'Total number of legal actions for anti-competitive behavior, anti-trust, and monopoly practices and their outcomes',
    'G4-SO8': 'Monetary value of significant fines and total number of non-monetary sanctions for non-compliance with laws and regulations',
    'G4-SO9': 'Percentage of new suppliers that were screened using criteria for impacts on society',
    'G4-SO10': 'Significant actual and potential negative impacts on society in the supply chain and actions taken',
    'G4-SO11': 'Number of grievances about impacts on society filed, addressed, and resolved through formal grievance mechanisms',
    'G4-PR1': 'Percentage of significant product and service categories for which health and safety impacts are assessed for improvement',
    'G4-PR2': 'Total number of incidents of non-compliance with regulations and voluntary codes concerning the health and safety impacts of products and services during their life cycle, by type of outcomes',
    'G4-PR3': 'Type of product and service information required by the organization’s procedures for product and service information and labeling, and percentage of significant product and service categories subject to such information requirements',
    'G4-PR4': 'Total number of incidents of non-compliance with regulations and voluntary codes concerning product and service information and labeling, by type of outcomes',
    'G4-PR5': 'Results of surveys measuring customer satisfaction',
    'G4-PR6': 'Sale of banned or disputed products',
    'G4-PR7': 'Total number of incidents of non-compliance with regulations and voluntary codes concerning marketing communications, including advertising, promotion, and sponsorship, by type of outcomes',
    'G4-PR8': 'Total number of substantiated complaints regarding breaches of customer privacy and losses of customer data',
    'G4-PR9': 'Monetary value of significant fines for non-compliance with laws and regulations concerning the provision and use of products and services',
}

# Control characters to strip from extracted PDF text: the C0 range except tab,
# line feed and carriage return, plus DEL and the C1 range.  Compiled once
# because the function is called for every page of every PDF.
_ILLEGAL_CHARS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')


# Function to remove non-printable characters
def remove_illegal_chars(text):
    """Return *text* with non-printable control characters removed.

    Tab, line feed and carriage return are deliberately preserved.  Deleting
    them would glue the last word of one extracted line to the first word of
    the next, and a whole-word (word-boundary) search would then miss any
    code that starts a line.

    NOTE(review): the stripped set is presumably what the Excel writer
    rejects; tab/LF/CR are assumed to be accepted there -- confirm against
    the openpyxl illegal-character definition.

    Args:
        text: The string to clean (e.g. text extracted from one PDF page).

    Returns:
        A new string without the offending characters; an empty string is
        returned unchanged.
    """
    return _ILLEGAL_CHARS_RE.sub('', text)

# Scan a single PDF, page by page, for every code listed in keywords_groups
def search_keywords_in_pdf_plumber(file_path):
    """Search one PDF for the codes in ``keywords_groups``.

    The PDF is opened with pdfplumber and each page's extracted text is
    cleaned with ``remove_illegal_chars`` before being searched.  A code
    matches when it occurs as a whole word, case-insensitively.

    Args:
        file_path: Path of the PDF as a string.  The reported file name is
            whatever follows the last "/" in this string.

    Returns:
        A non-empty list of row dicts:

        * one row per (page, code) match, with "Nama PDF", "Keyword", "Page"
          (1-based page number), "Group" (the code's description) and
          "Content" (the full cleaned text of that page);
        * if processing raises, one extra row carrying an "Error" message and
          empty values elsewhere -- matches collected before the failure are
          kept;
        * if there is neither a match nor an error, a single row with only
          the file name filled in.
    """
    pdf_name = file_path.split("/")[-1]
    rows = []

    try:
        with pdfplumber.open(file_path) as document:
            for page_number, page in enumerate(document.pages, start=1):
                raw_text = page.extract_text()
                if not raw_text:
                    # Nothing extractable on this page (e.g. image only).
                    continue

                page_text = remove_illegal_chars(raw_text)
                for code, description in keywords_groups.items():
                    whole_word = r'\b' + re.escape(code) + r'\b'
                    if re.search(whole_word, page_text, re.IGNORECASE) is None:
                        continue
                    rows.append({
                        "Nama PDF": pdf_name,
                        "Keyword": code,
                        "Page": page_number,
                        "Group": description,
                        "Content": page_text
                    })
    except Exception as exc:
        # Best effort: record the failure as a row instead of aborting the
        # whole batch on one unreadable PDF.
        rows.append({
            "Nama PDF": pdf_name,
            "Error": "Error processing PDF: " + str(exc),
            "Keyword": "",
            "Page": "",
            "Group": "",
            "Content": ""
        })

    if not rows:
        # Placeholder so the caller still gets a row naming this file.
        rows.append({
            "Nama PDF": pdf_name,
            "Keyword": "",
            "Page": "",
            "Group": "",
            "Content": ""
        })
    return rows

# Batch driver: for every year folder and every report-type subfolder, scan the
# PDFs found there and accumulate the results in one Excel workbook per
# (year, report type).

# Year folders (directly under main_folder_path) to process.
folder_tahun = ['2021']
# Report-type subfolders looked up inside each year folder.
folder_isi = ['Annual Report','Integrated Report','Sustainability Report']

# Alternative data set (swap with the two lists above to use it):
# folder_tahun = ['Singapore-2021']
# folder_isi = ['Annual Report','Integrated Report','Sustainability Report']

# Root folder that contains the year subfolders, and root for the output.
main_folder_path = Path("D:/CESGS/Scrap")
output_folder_path = Path("D:/CESGS/Scrap")

for t in folder_tahun:
    # Input folder for this year.
    folder_path = main_folder_path/f"{t}"
    # Output folder "[O] <year>" for this year's workbooks.
    output_fp = output_folder_path / f"[O] {t}"
    # Create it (and any missing parents); a no-op when it already exists.
    os.makedirs(output_fp, exist_ok=True)
    for i in folder_isi:
        folder_pdf = folder_path/i
        excel_path = output_fp / f"GS 4_{i}.xlsx"
        # Resume support: load earlier results so PDFs that already have rows
        # in the workbook are not processed again.
        if os.path.exists(excel_path):
            df_existing = pd.read_excel(excel_path)
            processed_files = set(df_existing["Nama PDF"])
        else:
            df_existing = pd.DataFrame()
            processed_files = set()

        # All PDFs directly inside this report-type folder (non-recursive).
        pdf_files = glob.glob(f"{folder_pdf}/*.pdf")
        for file_path in pdf_files:
            # Must be derived exactly as search_keywords_in_pdf_plumber derives
            # "Nama PDF", otherwise the skip check below would never match.
            file_name = file_path.split("/")[-1]
            if file_name in processed_files:
                continue

            results = search_keywords_in_pdf_plumber(file_path)
            df_results = pd.DataFrame(results)
            df_existing = pd.concat([df_existing, df_results], ignore_index=True)

            # Save after every file so an interrupted run loses at most the
            # PDF that was in progress.
            df_existing.to_excel(excel_path, index=False)
            processed_files.add(file_name)
            print(f"{file_name} processed")
print("All files have been processed.")