In [None]:
import requests
import time
from urllib.parse import urlparse, urlunparse
from bs4 import BeautifulSoup  


# --- Configuration ---------------------------------------------------------
# Bearer token placed in the Authorization header of the shared session.
# NOTE(review): the placeholder text below is sent verbatim until it is replaced.
API_TOKEN = "# Replace with your actual API token."
# Root of the resource API; endpoint names are appended directly to this string.
BASE_URL = "https://data.eco-platform.org/resource/"
# Number of process entries requested per listing page.
PAGE_SIZE = 100
# Pause, in seconds, after each detail request in the main loop.
DELAY = 1


# A single shared session, so the headers set here accompany every request
# issued through it.
session = requests.Session()
session.headers["Authorization"] = f"Bearer {API_TOKEN}"
session.headers["Accept"] = "application/xml"

def fetch_epd_page_xml(session, base_url, start_index=0, page_size=100,
                       timeout=30, max_retries=3):
    """
    Fetch a single page of EPD process entries in XML format from the ECO Portal.

    Args:
        session: Object with a requests-style ``get(url, params=..., timeout=...)``
            method (normally a ``requests.Session``).
        base_url: Root of the resource API, with or without a trailing slash.
        start_index: Value sent as the ``startIndex`` query parameter.
        page_size: Value sent as the ``pageSize`` query parameter.
        timeout: Seconds to wait on the server before the request is abandoned.
        max_retries: Number of additional attempts made after an HTTP 429.

    Returns:
        The response body as text, or ``None`` when the request raised, the
        server answered with a non-200 status, or the rate limit persisted
        through every retry. Failures are reported with ``print``.
    """
    params = {
        "search": "true",
        "distributed": "true",
        "virtual": "true",
        "metaDataOnly": "false",  # do not restrict the result to metadata
        "format": "XML",
        "pageSize": page_size,
        "startIndex": start_index
    }
    # Tolerate a base URL given without its trailing slash.
    url = f"{base_url.rstrip('/')}/processes"

    for attempt in range(max_retries + 1):
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            response = session.get(url, params=params, timeout=timeout)
        except Exception as e:
            # Deliberately broad: the caller treats None as "stop paginating"
            # and still writes out whatever was collected so far.
            print(f"Exception while fetching page starting at index {start_index}: {e}")
            return None

        if response.status_code == 429 and attempt < max_retries:
            # Rate limited: wait as instructed, then try the same page again
            # instead of ending pagination with a partial result.
            try:
                wait = max(0, int(response.headers.get("Retry-After", 60)))
            except (TypeError, ValueError):
                wait = 60  # header missing a usable number of seconds
            print(f"Rate limit reached, sleeping for {wait} seconds...")
            time.sleep(wait)
            continue

        if response.status_code != 200:
            print(f"Failed to fetch page starting at index {start_index}: HTTP {response.status_code}")
            return None
        return response.text

    # Only reachable when max_retries is negative (no attempt was made).
    return None

def _retry_after_seconds(header_value, default=60):
    """Convert a Retry-After header value into a non-negative number of seconds.

    The header may carry either a number of seconds or an HTTP-date; anything
    missing or unparseable yields ``default``.
    """
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if header_value is None:
        return default
    value = str(header_value).strip()
    try:
        return max(0, int(value))
    except ValueError:
        pass  # not a plain number of seconds; try the HTTP-date form below
    try:
        retry_at = parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return default
    if retry_at.tzinfo is None:
        # A date parsed without an offset is treated as UTC.
        retry_at = retry_at.replace(tzinfo=timezone.utc)
    remaining = (retry_at - datetime.now(timezone.utc)).total_seconds()
    return max(0, int(remaining))


def fetch_epd_detail_xml(session, detail_url, timeout=30, max_retries=5):
    """
    Fetch the full EPD dataset in XML (extended view) from a given detail URL.

    Args:
        session: Object with a requests-style ``get(url, params=..., timeout=...)``
            method (normally a ``requests.Session``).
        detail_url: URL of the dataset; it may already carry a query string.
        timeout: Seconds to wait on the server before the request is abandoned.
        max_retries: Number of additional attempts made after an HTTP 429.

    Returns:
        The response body as text, or ``None`` when the request raised, the
        server answered with a non-200 status, or the rate limit persisted
        through every retry. Failures are reported with ``print``.
    """
    # Passed as params so they are merged into any existing query string
    # rather than being glued onto the URL by hand.
    query = {"format": "XML", "view": "extended"}

    # A bounded loop replaces unbounded recursion: a server that keeps
    # answering 429 can no longer stall the crawl indefinitely.
    for attempt in range(max_retries + 1):
        try:
            response = session.get(detail_url, params=query, timeout=timeout)
        except Exception as e:
            # Deliberately broad: one failing record must not abort the crawl.
            print(f"Exception fetching detail from {detail_url}: {e}")
            return None

        if response.status_code == 429 and attempt < max_retries:
            wait = _retry_after_seconds(response.headers.get("Retry-After"))
            print(f"Rate limit reached, sleeping for {wait} seconds...")
            time.sleep(wait)
            continue

        if response.status_code != 200:
            print(f"Failed to fetch detail from {detail_url}: HTTP {response.status_code}")
            return None
        return response.text

    # Only reachable when max_retries is negative (no attempt was made).
    return None

def _strip_xml_declaration(document):
    """Return *document* without a leading BOM or ``<?xml ...?>`` declaration.

    An XML declaration is only legal at the very start of a document, so it
    has to be removed before the document is nested inside another element.
    """
    text = document.lstrip("\ufeff").lstrip()
    # Require whitespace after "<?xml" so that processing instructions such
    # as "<?xml-stylesheet ...?>" are left untouched.
    if text.startswith("<?xml") and text[5:6].isspace():
        end = text.find("?>")
        if end != -1:
            text = text[end + 2:].lstrip()
    return text


# Main loop: iterate through paginated pages and fetch detail XML for each process.
all_epd_xml_details = []
start_index = 0

while True:
    page_xml = fetch_epd_page_xml(session, BASE_URL, start_index=start_index, page_size=PAGE_SIZE)
    if page_xml is None:
        # The fetch helper has already reported the failure; keep what we have.
        break

    # Parse the XML page to extract detail URLs.
    soup = BeautifulSoup(page_xml, "xml")
    process_tags = soup.find_all("process")
    if not process_tags:
        print("No more records found; ending pagination.")
        break

    for process in process_tags:
        # The detail URL is typically provided in the "xlink:href" attribute.
        detail_url = process.get("xlink:href") or process.get("href")
        if not detail_url:
            # If not found, fall back to the first URL-like text value within
            # the process element. Strip before testing so that values padded
            # with whitespace are still recognised.
            for tag in process.find_all():
                text = tag.string.strip() if tag.string else ""
                if text.startswith("http"):
                    detail_url = text
                    break

        if not detail_url:
            continue  # skip if no detail URL available

        detail_xml = fetch_epd_detail_xml(session, detail_url)
        if detail_xml:
            all_epd_xml_details.append(detail_xml)
        time.sleep(DELAY)  # pause between detail requests

    # Advance by the number of entries actually returned on this page.
    start_index += len(process_tags)
    print(f"Fetched {len(process_tags)} records; total EPD details collected so far: {len(all_epd_xml_details)}")
    if len(process_tags) < PAGE_SIZE:
        # A short page means there is nothing left to request.
        break

# Combine all XML details into one XML file by wrapping them in a root element.
# Each fetched document may start with its own XML declaration, which is not
# allowed inside another element, so it is stripped here and a single
# declaration is written at the top of the combined file instead.
combined_xml = (
    '<?xml version="1.0" encoding="UTF-8"?>\n<EPDs>\n'
    + "\n".join(_strip_xml_declaration(detail) for detail in all_epd_xml_details)
    + "\n</EPDs>"
)
output_filename = "epd_data.xml"
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(combined_xml)

print(f"Finished fetching EPD details. Total records: {len(all_epd_xml_details)}. Data saved to {output_filename}.")
