In [13]:
import fitz 
import pandas as pd
import os

In [14]:
# Folder whose *.pdf files are scanned for metric keywords. The path is relative,
# so it resolves against the kernel's current working directory.
pdf_folder_path = "../report_new"

# weight

In [15]:
# Keyword dictionary, nested three levels deep:
#   ESG category -> issue -> metric -> list of keywords / GRI disclosure codes.
# A later cell lower-cases every keyword and tests it as a plain substring of the
# lower-cased page text, so letter case here does not matter.
# NOTE(review): because matching is a substring test, very short entries can hit
# unrelated text (e.g. "EY" -> "ey" inside "they"/"key", "WELL" -> "well",
# "GRI" -> "gri" inside "agriculture", "304"/"303" inside longer numbers).
# Confirm whether those metrics are over-counted.
metrics_keywords = {
    "Environment": {
        "Biodiversity & Land Use": {
            "Protected or restored habitats": ["304"]
        },
        "Carbon Emissions": {
            "Absolute emissions": [
                "Absolute emissions", "305-1", "305-2", "305-3", "GHG emission",
                "scope 1", "scope 2", "scope 3", "carbon emission"
            ],
            "Emission intensities": [
                "Emission intensity", "Emission intensities", "305-4", "Carbon intensity",
                "Carbon intensities", "GHG intensity", "GHG intensities",
                "Emissions intensity", "Emissions intensities"
            ]
        },
        "Energy": {
            "Total energy consumption": ["energy consumption", "302-1"],
            "Energy consumption intensity": [
                "Energy consumption intensity", "302-3", "energy consumption intensities",
                "energy intensity", "energy intensities"
            ]
        },
        "Waste": {
            "Waste generated": ["waste generated", "Waste generation", "306-1"]
        },
        "Financing Environmental Impact": {
            "Green financing projects": [
                "Sustainable Investment", "Climate-related Financing", "Green Bonds Issued",
                "Environmental Impact of Financed Projects", "201-1"
            ]
        },
        "Opportunities in Green Building": {
            # "LEED" is also listed under "List of relevant certifications" below.
            "Green certified buildings": ["LEED", "BREEAM", "WELL"]
        },
        "Water Stress": {
            "water intensity": ["water consumption intensity", "water intensity"],
            # "303" already matches every page that contains "303-5".
            "water consumption": [
                "Total water consumption", "water consumed", "303-5", "303", "water use"
            ]
        }
    },
    "Governance": {
        "Board": {
            "Women in the management team": [
                "Women in Leadership", "Gender Balance in Executive Roles",
                "Gender Diversity in Management", "Women in Management", "405-1"
            ],
            "Women on the board": [
                "Female Representation in Governance Bodies", "Women’s Representation on the Board",
                "Gender Diversity on the Board"
            ],
            "Board independence": ["Frequency of Independent Board", "Board independence"]
        },
        "Business Ethics": {
            "List of relevant certifications": [
                "ISO 14001", "ISO 45001", "ISO 37001", "ISO 9001", "LEED",
                "Energy Star", "FSC", "SA8000", "Fair Trade Certification", "B Corp Certification",
                "OHSAS 18001", "AA1000", "SMETA", "CarbonNeutral Certification", "CDP",
                "Green Seal Certification", "GRI"
            ],
            "Assurance of sustainability report": [
                "PwC", "Deloitte", "EY", "KPMG", "SGS", "Bureau Veritas", "DNV GL"
            ],
            "Anti-corruption disclosures": [
                "Anti-corruption Policies", "Anti-corruption Policy", "Anti-bribery Measure",
                "Corruption Risk Assessment"
            ],
            "Anti-corruption training": ["Anti-corruption training"]
        },
        "Accounting and Audit": {
            "External audit conducted": ["External Financial Audit", "102-56"]
        }
    },
    "Social": {
        "Access to Health Care": {
            "Percentage of employees covered by health insurance": [
                "Employee Health Insurance Coverage", "403-3"
            ],
            "Availability of Healthcare Resources": ["Healthcare Coverage for Employees", "403-6"],
            "Community Health Program": ["Community Health Initiatives", "413-1"]
        },
        "Human Capital Development": {
            "Average training hours per employee": ["Employee Training Hours", "404-1"]
        },
        "Labor Management": {
            "Current employees by gender": ["female"],
            "New hires by gender/by age": ["New Hires"],
            # The next two metrics share the single keyword "Turnover", so they
            # always receive identical counts.
            "Turnover by gender/by age": ["Turnover"],
            "Total turnover": ["Turnover"],
            "Current employees by age groups": ["over 50", "above 50", ">50", "50 years"],
            "Total number of employees": ["Total Workforce Count", "102-8"],
            "Employee satisfaction rate": ["Work-life Balance Satisfaction", "401-3"]
        },
        "Customer Financial Protection": {
            "Consumer rights protection": [
                "Customer Privacy and Data Protection", "Consumer rights", "Product Health",
                "Customer Feedback", "102-17"
            ]
        },
        "Opportunities in Nutrition & Health": {
            "Community nutrition programs": [
                "Nutrition and Health Initiatives", "Health and Wellness Programs",
                "Balanced Diets", "Nutrition Organizations", "Food Banks"
            ]
        },
        "Community Relations": {
            "Philanthropic initiatives": ["Philanthropy", "Employee Volunteering", "Disaster Relief"],
            "Company donated": ["Corporate Donations", "Charitable Contributions"]
        },
        "Controversial Sourcing": {
            "Controversial Sourcing": ["Controversial Sourcing", "308-2"]
        },
        "Occupational Health & Safety": {
            "Fatalities": ["Fatalities"],
            "High-consequence injuries": ["High consequence", "High-consequence"],
            "Work-related injuries": ["work-related", "workplace injuries", "403-9"]
        }
    }
}



In [16]:
# Flatten the nested keyword dictionary into one record per metric.
# Keywords are stored lower-cased so they can be compared with lower-cased text.
metrics_info = [
    {
        "ESG categories": esg_category,
        "Issue": issue_name,
        "Metric": metric_name,
        "Keywords": [term.lower() for term in metric_terms],
    }
    for esg_category, issue_map in metrics_keywords.items()
    for issue_name, metric_map in issue_map.items()
    for metric_name, metric_terms in metric_map.items()
]

In [17]:
# For every metric, count how many PDF reports mention at least one of its
# keywords. A report contributes at most 1 to a metric, however often it matches.
metrics_count = {info["Metric"]: 0 for info in metrics_info}

for filename in os.listdir(pdf_folder_path):
    # NOTE(review): the suffix test is case-sensitive, so "REPORT.PDF" is skipped.
    if not filename.endswith(".pdf"):
        continue
    pdf_path = os.path.join(pdf_folder_path, filename)

    # Extract and lower-case each page's text exactly once per document instead of
    # once per metric, and let the context manager close the file handle even if
    # text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        page_texts = [page.get_text().lower() for page in pdf_document]

    # A keyword must occur within a single page's text (plain substring match);
    # text spanning a page break is not matched. Collecting into a set keeps the
    # "once per report" rule even if two records ever shared a metric name.
    metrics_seen = {
        info["Metric"]
        for info in metrics_info
        if any(
            keyword in text
            for text in page_texts
            for keyword in info["Keywords"]
        )
    }

    for metric in metrics_seen:
        metrics_count[metric] += 1


In [21]:
# Build the summary table: one row per metric with the number of reports that
# mention it ("Frequency") and that count as a share of the sum of all metric
# counts ("Percentage (%)", rounded to 5 decimals).
data = []
total_frequency = sum(metrics_count.values())

for info in metrics_info:
    metric = info["Metric"]
    frequency = metrics_count[metric]
    # When no report matched any keyword (or the folder held no PDFs) the total is
    # 0; report 0.0 rather than raising ZeroDivisionError.
    percentage = (frequency / total_frequency * 100) if total_frequency else 0.0
    data.append({
        "ESG categories": info["ESG categories"],
        "Issue": info["Issue"],
        "Metric": metric,
        "Frequency": frequency,
        "Percentage (%)": round(percentage, 5)
    })

df = pd.DataFrame(data)

In [22]:
# Persist the summary table as CSV, leaving out the DataFrame's row index.
df.to_csv(path_or_buf="esg_metrics_frequency_with_proportions.csv", index=False)

In [23]:
# Tell the reader where the results went; the name matches the file written by
# df.to_csv (the previous message named a file that is never created).
print("The result is already saved as esg_metrics_frequency_with_proportions.csv")

The result is already saved as esg_metrics_frequency.csv
