In [1]:
from openpyxl import load_workbook
from pathlib import Path
import os
import xml.etree.ElementTree as ET
from datetime import datetime

# ---------- XML pretty print ----------
def indent_xml(elem, level=0):
    """Pretty-print an ElementTree element in place using 2-space indentation.

    Whitespace-only (or missing) ``text`` and ``tail`` values are replaced
    with a newline plus indentation; text or tail that contains
    non-whitespace characters is left untouched.

    Args:
        elem: The element to indent. It and its descendants are modified.
        level: Nesting depth of ``elem``; 0 for the root. The root's own
            ``tail`` is never modified.
    """
    indent = "\n" + level * "  "
    if len(elem):
        # Place the first child on its own line, one level deeper.
        if not elem.text or not elem.text.strip():
            elem.text = indent + "  "
        for child in elem:
            indent_xml(child, level + 1)
        # ``child`` is now the last child: pull its tail back one level so
        # that this element's closing tag lines up with its opening tag.
        if not child.tail or not child.tail.strip():
            child.tail = indent
    # Applies to leaves *and* to elements with children; previously only
    # leaves got a tail, so a sibling following a nested element ended up on
    # the same line as that element's closing tag.
    if level and (not elem.tail or not elem.tail.strip()):
        elem.tail = indent

# ---------- Main ----------
def generate_dc_and_dcat(xlsx_path):
    """Write Dublin Core and DCAT metadata files for the EDGAR CO₂ workbook.

    The descriptive metadata is hard-coded in this function. The only value
    derived from ``xlsx_path`` is the file's modification time, which is
    written as ``dct:modified``. The workbook is opened with openpyxl and
    closed again without reading any cells, so any error openpyxl raises for
    the file surfaces before output is written.

    Two files are written to the current working directory:
    ``CO2_dublincore.xml`` and ``CO2_dcat.xml``. A confirmation message is
    printed when both have been written.

    Args:
        xlsx_path: Path to the source XLSX workbook.
    """
    # Read-only workbooks keep the underlying file open until close() is
    # called, so release it straight away: no sheet data is used below.
    wb = load_workbook(xlsx_path, read_only=True, data_only=True)
    wb.close()
    stat = os.stat(xlsx_path)

    # ---------- Enriched metadata (final, curated) ----------
    metadata = {
        "title": (
            "GHG emissions of all world countries – 2025 Report "
            "(fossil CO₂ emissions by country)"
        ),
        "creator": "European Commission – Joint Research Centre (JRC)",
        "contributor": "EDGAR Community; International Energy Agency (IEA)",
        "publisher": "European Commission – Joint Research Centre (JRC)",
        "description": (
            "Dataset accompanying the report “GHG emissions of all world countries, 2025”, "
            "providing country-level fossil CO₂ emissions data expressed in multiple metrics "
            "(total emissions, per sector, per capita and per GDP). Data cover emissions from "
            "energy production, industry, transport, buildings, agriculture, waste and other "
            "sectors, using metric units and harmonised international methodologies."
        ),
        "subject": [
            "CO2 emissions",
            "Greenhouse gases",
            "Climate change",
            "Energy sector",
            "Transport emissions",
            "Industrial emissions",
            "Environmental statistics",
        ],
        "source": (
            "EDGAR (Emissions Database for Global Atmospheric Research) Community GHG database, "
            "including IEA-EDGAR CO2 v4 data based on IEA Greenhouse Gas Emissions from Energy"
        ),
        "relation": (
            "GHG emissions of all world countries – JRC/IEA 2025 Report "
            "(DOI: 10.2760/9816914); "
            "EDGAR dataset website: https://edgar.jrc.ec.europa.eu/report_2025; "
            "Underlying data sources include IATA, IEA, FAOSTAT, USGS, World Bank, "
            "Energy Institute and World Steel Association, "
            "https://ansperformance.eu/csv/#aptflt-csv"
        ),
        "coverage": "Global; temporal coverage varies by country and metric",
        "date": "2025",
        "type": "Dataset",
        "format": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "identifier": "EDGAR-GHG-COUNTRIES-CO2-2025",
        "language": "en",
        "rights": (
            "© European Union 2025, European Commission, Joint Research Centre (JRC). "
            "The dataset is part of the EDGAR (Emissions Database for Global Atmospheric Research) "
            "Community GHG database. Reproduction is authorised provided the source is acknowledged. "
            "Portions of the data (IEA-EDGAR CO2 v4) are licensed under CC BY-NC-ND 4.0. "
            "No warranty is given as to accuracy or completeness."
        ),
    }

    # =====================
    # Dublin Core XML
    # =====================
    # Prefixes are written literally into tag names, so the matching xmlns
    # declaration is supplied as a plain attribute on the root element.
    dc_root = ET.Element(
        "metadata",
        attrib={"xmlns:dc": "http://purl.org/dc/elements/1.1/"},
    )

    # One dc:<key> element per metadata entry; list values (the subjects)
    # are flattened into a single "; "-separated string.
    for key, value in metadata.items():
        el = ET.SubElement(dc_root, f"dc:{key}")
        el.text = "; ".join(value) if isinstance(value, list) else value

    indent_xml(dc_root)

    ET.ElementTree(dc_root).write(
        "CO2_dublincore.xml",
        encoding="utf-8",
        xml_declaration=True,
    )

    # =====================
    # DCAT RDF/XML
    # =====================
    ns = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "dcat": "http://www.w3.org/ns/dcat#",
        "dct": "http://purl.org/dc/terms/",
        # The contact point below uses vcard:* elements; without this
        # declaration the output carries an unbound prefix and
        # namespace-aware XML parsers reject the file.
        "vcard": "http://www.w3.org/2006/vcard/ns#",
    }

    rdf = ET.Element(
        "rdf:RDF",
        attrib={f"xmlns:{k}": v for k, v in ns.items()},
    )

    dataset = ET.SubElement(
        rdf,
        "dcat:Dataset",
        attrib={"rdf:about": metadata["identifier"]},
    )

    ET.SubElement(dataset, "dct:title").text = metadata["title"]
    ET.SubElement(dataset, "dct:description").text = metadata["description"]
    ET.SubElement(dataset, "dct:publisher").text = metadata["publisher"]
    ET.SubElement(dataset, "dct:creator").text = metadata["creator"]
    ET.SubElement(dataset, "dct:issued").text = metadata["date"]
    # Modification date comes from the workbook file's mtime (local time).
    ET.SubElement(dataset, "dct:modified").text = (
        datetime.fromtimestamp(stat.st_mtime).date().isoformat()
    )
    ET.SubElement(dataset, "dct:format").text = metadata["format"]
    ET.SubElement(dataset, "dct:identifier").text = metadata["identifier"]
    ET.SubElement(dataset, "dct:language").text = metadata["language"]
    ET.SubElement(dataset, "dct:rights").text = metadata["rights"]
    ET.SubElement(dataset, "dct:source").text = metadata["source"]
    ET.SubElement(dataset, "dct:relation").text = metadata["relation"]

    # Contact point: dcat:contactPoint / vcard:Kind / vcard:hasEmail
    contact = ET.SubElement(dataset, "dcat:contactPoint")
    vcard_kind = ET.SubElement(contact, "vcard:Kind")
    ET.SubElement(vcard_kind, "vcard:hasEmail").text = (
        "mailto:JRC-EDGAR@ec.europa.eu"
    )

    # Single distribution entry nested under dcat:distribution
    distribution = ET.SubElement(dataset, "dcat:distribution")
    dist = ET.SubElement(distribution, "dcat:Distribution")

    ET.SubElement(dist, "dct:title").text = (
        "GHG emissions by country – CSV distribution"
    )
    ET.SubElement(dist, "dct:description").text = (
        "CSV distribution providing country-level fossil CO₂ emissions "
        "data from the EDGAR Community GHG database."
    )
    ET.SubElement(dist, "dcat:accessURL").text = (
        "https://ansperformance.eu/csv/#aptflt-csv"
    )
    ET.SubElement(dist, "dct:format").text = "text/csv"

    # One dcat:keyword per subject entry
    for s in metadata["subject"]:
        ET.SubElement(dataset, "dcat:keyword").text = s

    # Coverage statement, nested as dct:temporal / rdf:Description / dct:period
    temporal = ET.SubElement(dataset, "dct:temporal")
    period = ET.SubElement(temporal, "rdf:Description")
    ET.SubElement(period, "dct:period").text = metadata["coverage"]

    indent_xml(rdf)

    ET.ElementTree(rdf).write(
        "CO2_dcat.xml",
        encoding="utf-8",
        xml_declaration=True,
    )

    print("✔ Dublin Core y DCAT generados correctamente (CO₂ dataset)")

# ---------- Run ----------
if __name__ == "__main__":
    # Generate both metadata files for the fossil-CO2 booklet workbook,
    # looked up relative to the current working directory.
    generate_dc_and_dcat("EDGAR_2025_GHG_booklet_2025_fossilCO2only.xlsx")


✔ Dublin Core y DCAT generados correctamente (CO₂ dataset)
