In [2]:
from pathlib import Path
import os
import xml.etree.ElementTree as ET
from datetime import datetime

# ---------- XML pretty print ----------
def indent_xml(elem, level=0):
    """Insert pretty-printing whitespace into an ElementTree subtree, in place.

    Each nesting level is indented by two spaces. Existing ``text``/``tail``
    values that contain non-whitespace characters are left untouched.

    Args:
        elem: Element whose subtree is re-indented.
        level: Nesting depth of ``elem``; 0 for the root. The root's own
            ``tail`` is never modified.
    """
    indent = "\n" + level * "  "

    # Every non-root element needs a whitespace tail, whether or not it has
    # children; otherwise the sibling that follows a nested element would be
    # emitted on the same line as its closing tag.
    if level and (not elem.tail or not elem.tail.strip()):
        elem.tail = indent

    if not len(elem):
        return

    # Push the first child onto its own line, one level deeper.
    if not elem.text or not elem.text.strip():
        elem.text = indent + "  "

    for child in elem:
        indent_xml(child, level + 1)

    # The last child's tail precedes this element's closing tag, so it must
    # drop back to this element's indentation level.
    if not child.tail or not child.tail.strip():
        child.tail = indent

# ---------- Main ----------
def generate_dc_and_dcat(xls_path, output_dir="."):
    """Write Dublin Core and DCAT RDF/XML descriptions of the GDP dataset.

    Two files are produced: ``PIB_dublincore.xml`` and ``PIB_dcat.xml``.
    The descriptive metadata is fixed; the only thing read from
    ``xls_path`` is its modification time, which becomes ``dct:modified``.

    Args:
        xls_path: Path to the source workbook. It must exist, because it is
            stat-ed before anything is written.
        output_dir: Existing directory that receives the two XML files.
            Defaults to the current working directory.

    Raises:
        OSError: If ``xls_path`` cannot be stat-ed or an output file cannot
            be written.
    """
    # Stat first so a missing workbook fails before any output is produced.
    modified = datetime.fromtimestamp(os.stat(xls_path).st_mtime).date().isoformat()
    target_dir = Path(output_dir)
    metadata = _gdp_metadata()

    _write_xml(_build_dublin_core(metadata), target_dir / "PIB_dublincore.xml")
    _write_xml(_build_dcat(metadata, modified), target_dir / "PIB_dcat.xml")

    print("✔ Dublin Core y DCAT generados correctamente (GDP dataset)")


# ---------- Helpers ----------
def _gdp_metadata():
    """Return the fixed metadata record, keyed by Dublin Core element name."""
    return {
        "title": "GDP (current US$) by Country",
        "creator": "World Bank",
        "contributor": (
            "World Bank national accounts data; OECD National Accounts data"
        ),
        "publisher": "World Bank",
        "description": (
            "Dataset providing Gross Domestic Product (GDP) by country expressed in current "
            "US dollars, corresponding to the World Bank World Development Indicators "
            "indicator NY.GDP.MKTP.CD. GDP is measured at purchasers' prices and compiled "
            "according to national accounts methodologies. Country metadata include "
            "regional classification and income group information."
        ),
        "subject": [
            "Gross Domestic Product",
            "GDP",
            "Economic indicators",
            "Macroeconomics",
            "National accounts",
            "World Development Indicators",
        ],
        "source": (
            "World Bank national accounts data, compiled according to national accounting "
            "systems and supplemented with OECD National Accounts data."
        ),
        "relation": (
            "World Bank World Development Indicators, indicator "
            "NY.GDP.MKTP.CD (GDP, current US$); "
            "Indicator landing page: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD"
        ),
        "coverage": (
            "Global coverage by country; temporal coverage varies by country and availability"
        ),
        "date": "2024",
        "type": "Dataset",
        "format": "application/vnd.ms-excel",
        "identifier": "WORLD-BANK-GDP-CURRENT-USD",
        "language": "en",
        "rights": (
            "© World Bank. Data are available under the World Bank Open Data License. "
            "The World Bank does not guarantee the accuracy or completeness of the data "
            "and accepts no responsibility for any use made thereof."
        ),
    }


def _add_text(parent, tag, text):
    """Append a ``tag`` child with the given text to ``parent`` and return it."""
    child = ET.SubElement(parent, tag)
    child.text = text
    return child


def _build_dublin_core(metadata):
    """Build the Dublin Core tree: one ``dc:<key>`` element per metadata entry."""
    dc_root = ET.Element(
        "metadata",
        attrib={"xmlns:dc": "http://purl.org/dc/elements/1.1/"},
    )
    for key, value in metadata.items():
        # Multi-valued entries (the subject list) collapse into one element.
        text = "; ".join(value) if isinstance(value, list) else value
        _add_text(dc_root, f"dc:{key}", text)
    return dc_root


def _build_dcat(metadata, modified):
    """Build the DCAT RDF/XML tree; ``modified`` is the ISO date for ``dct:modified``."""
    namespaces = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "dcat": "http://www.w3.org/ns/dcat#",
        "dct": "http://purl.org/dc/terms/",
    }
    rdf = ET.Element(
        "rdf:RDF",
        attrib={f"xmlns:{prefix}": uri for prefix, uri in namespaces.items()},
    )
    dataset = ET.SubElement(
        rdf,
        "dcat:Dataset",
        attrib={"rdf:about": metadata["identifier"]},
    )

    # Element order below is the order in which they appear in the output.
    for tag, text in (
        ("dct:title", metadata["title"]),
        ("dct:description", metadata["description"]),
        ("dct:publisher", metadata["publisher"]),
        ("dct:creator", metadata["creator"]),
        ("dct:issued", metadata["date"]),
        ("dct:modified", modified),
        ("dct:format", metadata["format"]),
        ("dct:identifier", metadata["identifier"]),
        ("dct:language", metadata["language"]),
        ("dct:rights", metadata["rights"]),
        ("dct:source", metadata["source"]),
        ("dct:relation", metadata["relation"]),
    ):
        _add_text(dataset, tag, text)

    # --- DCAT specific ---
    _add_text(
        dataset,
        "dcat:landingPage",
        "https://data.worldbank.org/indicator/NY.GDP.MKTP.CD",
    )

    # --- DCAT Distribution (API access) ---
    distribution = ET.SubElement(dataset, "dcat:distribution")
    dist = ET.SubElement(distribution, "dcat:Distribution")
    _add_text(dist, "dct:title", "World Bank API – GDP (NY.GDP.MKTP.CD)")
    _add_text(
        dist,
        "dct:description",
        "REST API providing programmatic access to GDP (current US$) "
        "data by country from the World Bank.",
    )
    _add_text(
        dist,
        "dcat:accessURL",
        "https://api.worldbank.org/v2/country/all/indicator/NY.GDP.MKTP.CD",
    )
    _add_text(dist, "dct:format", "API (JSON/XML)")

    # One dcat:keyword per subject entry.
    for subject in metadata["subject"]:
        _add_text(dataset, "dcat:keyword", subject)

    temporal = ET.SubElement(dataset, "dct:temporal")
    period = ET.SubElement(temporal, "rdf:Description")
    _add_text(period, "dct:period", metadata["coverage"])

    return rdf


def _write_xml(root, destination):
    """Pretty-print ``root`` in place and write it to ``destination`` as UTF-8 XML."""
    indent_xml(root)
    ET.ElementTree(root).write(
        str(destination),
        encoding="utf-8",
        xml_declaration=True,
    )

# ---------- Run ----------
if __name__ == "__main__":
    # Script entry point: generate the metadata files for the GDP workbook.
    generate_dc_and_dcat("API_NY.GDP.MKTP.CD_DS2_en_excel_v2_174254.xls")


✔ Dublin Core y DCAT generados correctamente (GDP dataset)
