In [1]:
import os
import json
from docling.document_converter import DocumentConverter
import google.genai as genai
from pydantic import BaseModel, Field
from typing import List, Optional


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import json
from pydantic import BaseModel, Field
from typing import List
import anthropic
from docling.document_converter import DocumentConverter


# 1. Define your ESG Schema
class ESGGovernance(BaseModel):
    metric_name: str = Field(description="The ESG metric, e.g., Scope 1 Emissions")
    value: float = Field(description="The numerical value extracted")
    unit: str = Field(description="The unit of measurement, e.g., tCO2e")
    year: int = Field(description="The reporting year for this data point")


class ESGReportData(BaseModel):
    company_name: str
    emissions_data: List[ESGGovernance]


def parse_esg_report(pdf_path: str) -> str:
    print(f"--- Converting {pdf_path} to Markdown ---")

    # Phase A: High-Fidelity Extraction
    converter = DocumentConverter()
    result = converter.convert(pdf_path)
    markdown_content = result.document.export_to_markdown()

    # Phase B: Targeted Extraction
    client = anthropic.Anthropic()

    prompt = f"""Extract the following EU Taxonomy-related metrics from the Markdown content below.

Return ONLY a valid JSON object matching this exact schema, no explanation:
{{
    "company_name": "string",
    "emissions_data": [
        {{
            "metric_name": "string",
            "value": number,
            "unit": "string",
            "year": number
        }}
    ]
}}

Focus on:
1. Proportion of turnover/CapEx/OpEx aligned with EU Taxonomy
2. KPIs related to climate change

Markdown Content:
{markdown_content[:30000]}"""

    response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=4096,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    return response.content[0].text


# Execute
pdf_file = r"C:\Users\arnew\Downloads\BMW_Policy_Paper_BMW_Group_Climate_Strategy_EN.pdf"
extracted_json = parse_esg_report(pdf_file)
print(extracted_json)


--- Converting C:\Users\arnew\Desktop\HE\ultimateclaudelovers\reports\250704-taxonomy-delegated-act-examples-template_en.pdf to Markdown ---


[32m[INFO] 2026-02-21 16:10:50,162 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-02-21 16:10:50,179 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-02-21 16:10:50,193 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\arnew\Desktop\HE\ultimateclaudelovers\esg_env\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-02-21 16:10:50,194 [RapidOCR] main.py:50: Using C:\Users\arnew\Desktop\HE\ultimateclaudelovers\esg_env\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-02-21 16:10:50,360 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-02-21 16:10:50,361 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-02-21 16:10:50,362 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\arnew\Desktop\HE\ultimateclaudelovers\esg_env\Lib\site-packages\rapidocr\models\ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-02-21 16:1

```json
{
    "company_name": "Multinational construction company",
    "emissions_data": [
        {
            "metric_name": "Proportion of turnover from Taxonomy-aligned activities",
            "value": 66.7,
            "unit": "%",
            "year": 2025
        },
        {
            "metric_name": "Turnover from Taxonomy-aligned activities",
            "value": 1000,
            "unit": "EURm",
            "year": 2025
        },
        {
            "metric_name": "Climate Change Mitigation - Proportion of Taxonomy aligned turnover",
            "value": 53.3,
            "unit": "%",
            "year": 2025
        },
        {
            "metric_name": "Climate Change Adaptation - Proportion of Taxonomy aligned turnover",
            "value": 0.0,
            "unit": "%",
            "year": 2025
        },
        {
            "metric_name": "Circular Economy - Proportion of Taxonomy aligned turnover",
            "value": 13.3,
            "unit": "%",
         

ValidationError: 1 validation error for ESGReportData
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='```json\n{\n    "company...       }\n    ]\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid

In [8]:
extracted_json = extracted_json.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()

data = ESGReportData.model_validate_json(extracted_json)
data

ESGReportData(company_name='Multinational construction company', emissions_data=[ESGGovernance(metric_name='Proportion of turnover from Taxonomy-aligned activities', value=66.7, unit='%', year=2025), ESGGovernance(metric_name='Turnover from Taxonomy-aligned activities', value=1000.0, unit='EURm', year=2025), ESGGovernance(metric_name='Climate Change Mitigation - Proportion of Taxonomy aligned turnover', value=53.3, unit='%', year=2025), ESGGovernance(metric_name='Climate Change Adaptation - Proportion of Taxonomy aligned turnover', value=0.0, unit='%', year=2025), ESGGovernance(metric_name='Circular Economy - Proportion of Taxonomy aligned turnover', value=13.3, unit='%', year=2025), ESGGovernance(metric_name='Proportion of Taxonomy-eligible activities', value=93.3, unit='%', year=2025)])