In [1]:
# 1. SETUP: Ensure the correct, latest libraries are installed
# ------------------------------------------------------------------------------
!pip install -q --upgrade "crawl4ai>=0.6.0" "pyOpenSSL"
!pip install -q pandas lxml requests
print("✅ All necessary libraries are installed to their latest versions.")

✅ All necessary libraries are installed to their latest versions.


In [2]:
# Deep Crawling: discover the URLs reachable from START_URL and save them to URL_LIST_FILE.

# Shell escape: download the browser binaries that Playwright drives.
# NOTE(review): presumably required by crawl4ai's browser-based crawler — confirm.
!  playwright install
import asyncio
import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter

# --- Configuration ---
# Root of the site to crawl; also used to build the URL pattern that keeps the crawl on this site.
START_URL = "https://www.mosdac.gov.in"
# Text file (one URL per line) that run_true_deep_crawl() writes its results to.
URL_LIST_FILE = "/content/discovered_urls.txt" # Ensure this path is correct for your Colab environment

async def run_true_deep_crawl():
    """
    Discover the URLs reachable from START_URL and save them to URL_LIST_FILE.

    Runs a breadth-first deep crawl restricted to URLs matching ``{START_URL}/*``.
    No content-type filter is applied, so the goal of this phase is URL *discovery*
    (documents included), not content extraction. Successfully crawled URLs are
    collected in a set, a few known-irrelevant URL patterns are dropped, and the
    remaining URLs are written to URL_LIST_FILE sorted, one per line.

    Returns:
        None. Progress is reported with ``print``; the result is the file on disk.
    """
    print(f"🚀 Starting TRUE deep crawl on {START_URL}. This may take several minutes...")

    # URLs containing any of these fragments are reported but never saved
    # (mail links and internal non-content pages).
    excluded_fragments = (
        "mailto:",
        "/internal/logout",
        "/internal/registration",
        "/internal/uops",
    )

    # 1. Filter on URL pattern only: keep URLs that start with our base URL.
    filter_chain = FilterChain([
        URLPatternFilter(patterns=[f"{START_URL}/*"]),
    ])

    # 2. Configure the deep crawl strategy.
    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=3,  # Follow links up to three levels away from the starting URL.
        include_external=False,  # Explicitly prevents crawling external domains.
        filter_chain=filter_chain,  # Apply only the URL pattern filter.
    )

    # 3. Create the run configuration; stream=True yields results as they become available.
    config = CrawlerRunConfig(
        deep_crawl_strategy=deep_crawl_strategy,
        cache_mode=CacheMode.ENABLED,  # Enable caching for efficiency on repeated runs.
        stream=True,
    )

    # 4. Run the crawler. A set is used so duplicate URLs collapse automatically.
    discovered_urls = set()
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        async for result in await crawler.arun(START_URL, config=config):
            if not result.success:
                # Pages that could not be crawled (network issues, invalid responses, ...).
                print(f"  [SKIPPED] Could not crawl {result.url}: {result.error_message}")
                continue

            # Post-discovery filtering of URLs we do not want to process further.
            if any(fragment in result.url for fragment in excluded_fragments):
                print(f"  [DISCOVERED BUT SKIPPING SAVE] {result.url} (filtered during post-discovery)")
                continue

            print(f"  [OK] Discovered: {result.url}")
            discovered_urls.add(result.url)

    print("\n--- Deep Crawl Complete! ---")
    print(f"Discovered {len(discovered_urls)} unique pages.")

    # 5. Save the sorted list of unique URLs. The encoding is explicit so the file
    # content does not depend on the platform's default locale.
    with open(URL_LIST_FILE, "w", encoding="utf-8") as f:
        f.writelines(f"{url}\n" for url in sorted(discovered_urls))

    print(f"Comprehensive URL list saved to '{URL_LIST_FILE}'.")

# Run the deep crawl. A bare top-level `await` works here because notebook cells
# (IPython/Colab) execute inside an already-running event loop; in a plain Python
# script use asyncio.run(run_true_deep_crawl()) instead.
await run_true_deep_crawl()

Output hidden; open in https://colab.research.google.com to view.

In [6]:
import asyncio
import json
import os
import re
from bs4 import BeautifulSoup, Comment
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
import base64

from typing import List, Dict, Union
from urllib.parse import urljoin

# --- Configuration ---
# One-URL-per-line list that the orchestrator below reads.
URL_LIST_FILE = "/content/discovered_urls.txt"
# Set OUTPUT_DIR to your Google Drive mounted path for persistence
# NOTE(review): assumes Google Drive is already mounted under /content/drive — confirm
# before running, otherwise a plain local folder is created at this path.
OUTPUT_DIR = "/content/drive/MyDrive/extracted_content"

# Sub-folders of OUTPUT_DIR, one per kind of artifact the extractors write.
OUTPUT_SUBDIRS = (
    "structured_json",
    "unstructured_markdown",
    "pdfs",
    "extracted_tables",
    "extracted_links",
    "debug_html",
)

# Create output directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
for _subdir in OUTPUT_SUBDIRS:
    os.makedirs(os.path.join(OUTPUT_DIR, _subdir), exist_ok=True)

# --- Helper functions for BeautifulSoup parsing ---
async def parse_faq_html_with_bs(html_content: str) -> List[Dict[str, str]]:
    """
    Extract question/answer pairs from FAQ page HTML with BeautifulSoup.

    Every ``div.faq-question-answer`` container contributes at most one item. The
    question is the text of ``div.faq-question a``. The answer is assembled from the
    text of the paragraph-like tags (p, span, div, a, li, h1-h6) inside
    ``div.faq-answer`` after HTML comments and script/style/link tags are removed.
    Each text fragment is cleaned of MS Office XML remnants, and repeated fragments
    are dropped while the remaining ones keep their document order.

    Args:
        html_content: HTML of the FAQ page.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts. Containers for which
        either the question or the answer is empty are skipped.
    """
    text_tag_pattern = re.compile(r"^(p|span|div|a|li|h[1-6])$")

    def _clean_fragment(raw_text: str) -> str:
        """Strip MSO/XML tag remnants and comment text, then collapse whitespace."""
        cleaned = re.sub(r'<\/?w:[^>]+>', '', raw_text)
        cleaned = re.sub(r'<\/?xml:namespace[^>]+>', '', cleaned)
        # Remove literal HTML comment text that survived parsing.
        cleaned = re.sub(r'<!--.*?-->', '', cleaned, flags=re.DOTALL)
        # Collapse whitespace, BOMs and non-breaking spaces into single spaces.
        return re.sub(r'[\s\uFEFF\xA0]+', ' ', cleaned).strip()

    faq_items = []
    soup = BeautifulSoup(html_content, 'html.parser')

    for container in soup.select('div.faq-question-answer'):
        question_element = container.select_one('div.faq-question a')
        answer_element = container.select_one('div.faq-answer')

        question = question_element.get_text(strip=True) if question_element else ""

        answer = ""
        if answer_element:
            # Drop comment nodes and non-content tags before reading any text.
            for comment_node in answer_element(string=lambda text: isinstance(text, Comment)):
                comment_node.extract()
            for noise_tag in answer_element(["script", "style", "link"]):
                noise_tag.decompose()

            # A dict keeps insertion order, so duplicates are removed without
            # shuffling the answer's paragraphs (sorting a set would reorder them).
            answer_parts = {}
            for tag in answer_element.find_all(text_tag_pattern):
                fragment = _clean_fragment(tag.get_text(strip=True))
                if fragment:
                    answer_parts.setdefault(fragment, None)

            # Answers made only of bare text (no child tags) would otherwise be lost.
            if not answer_parts:
                fallback_text = _clean_fragment(answer_element.get_text(" ", strip=True))
                if fallback_text:
                    answer_parts[fallback_text] = None

            answer = "\n".join(answer_parts)

        if question and answer:
            faq_items.append({"question": question, "answer": answer})

    return faq_items

async def parse_table_html_with_bs(html_content: str, table_css_selector: str) -> List[Dict[str, str]]:
    """
    Read the rows of one HTML table (the 'sticky-enabled' kind) into dicts.

    The table is located with ``table_css_selector`` in the already-rendered HTML.
    Only body rows with at least three cells are used: the link in the second cell
    supplies the name and URL, the third cell supplies the modification date.

    Args:
        html_content: HTML containing the table.
        table_css_selector: CSS selector identifying the table to parse.

    Returns:
        One dict per qualifying row with the keys ``display_name``, ``display_link``
        and ``modified_date``; an empty list when the table is not found.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    table = document.select_one(table_css_selector)
    if not table:
        return []

    records = []
    for tr in table.select('tbody tr'):
        tds = tr.select('td')
        if len(tds) < 3:
            # Rows with fewer than three cells carry no usable record.
            continue

        name, link = "", ""
        anchor = tds[1].select_one('a')
        if anchor:
            name = anchor.get_text(strip=True)
            link = anchor.get('href')
            # Relative hrefs are resolved against the site root.
            if link and not link.startswith('http'):
                link = urljoin("https://www.mosdac.gov.in", link)

        date_cell = tds[2]
        modified = date_cell.get_text(strip=True) if date_cell else ""

        records.append({
            "display_name": name,
            "display_link": link,
            "modified_date": modified
        })

    return records

async def parse_angular_table_html_with_bs(html_content: str, table_css_selector: str) -> List[Dict[str, str]]:
    """
    Read the rows of an AngularJS-rendered catalog table into dicts.

    The HTML must already contain the rendered table (e.g. ``table#tabledata``).
    Only body rows with at least nine cells are used; cells are mapped to fields
    by position.

    Args:
        html_content: HTML containing the rendered table.
        table_css_selector: CSS selector identifying the table to parse.

    Returns:
        One dict per qualifying row with the keys ``Sr_No``, ``Product_Name``,
        ``Product_Description``, ``Processing_Level``, ``Temporal_Resolution``,
        ``Start_Date``, ``End_Date``, ``Processing_Status``, ``DOI`` and ``DOI_Link``;
        an empty list when the table is not found.
    """
    # Plain-text columns, in cell order starting at the third cell (index 2).
    text_columns = (
        "Product_Description",
        "Processing_Level",
        "Temporal_Resolution",
        "Start_Date",
        "End_Date",
        "Processing_Status",
    )

    document = BeautifulSoup(html_content, 'html.parser')
    table = document.select_one(table_css_selector)
    if not table:
        return []

    records = []
    for tr in table.select('tbody tr'):
        tds = tr.select('td')
        if len(tds) < 9:
            continue

        record = {"Sr_No": tds[0].get_text(strip=True)}

        # The product name is the bold text of the second cell. The ng-click/ng-href
        # attributes on the md-icon in that cell are deliberately not extracted.
        bold = tds[1].select_one('b')
        record["Product_Name"] = bold.get_text(strip=True) if bold else ""

        for cell_index, key in enumerate(text_columns, start=2):
            record[key] = tds[cell_index].get_text(strip=True)

        doi_anchor = tds[8].select_one('a')
        if doi_anchor:
            record["DOI"] = doi_anchor.get_text(strip=True)
            # If ng-href was not resolved into href, this is None or a template string.
            record["DOI_Link"] = doi_anchor.get('href')
        else:
            record["DOI"] = ""
            record["DOI_Link"] = ""

        records.append(record)

    return records

async def extract_links_from_markdown_file(markdown_text: str, source_page_url: str) -> List[Dict[str, str]]:
    """
    Extract every inline Markdown link ``[text](target)`` together with its context.

    Link targets are resolved against ``source_page_url``; an optional Markdown link
    title (``[text](url "title")``) is removed from the target before resolving so it
    does not end up inside the URL. Targets that do not resolve to an http(s) URL
    (for example ``mailto:`` links) are skipped. Exact duplicates (same text, target
    and context) are reported once.

    Args:
        markdown_text: Markdown content to scan.
        source_page_url: URL of the page the Markdown came from; used as the base for
            relative links and recorded in each entry.

    Returns:
        A list of dicts with the keys ``source_page``, ``link_text``, ``target_url``
        and ``context_snippet`` (up to 150 characters either side of the link, with
        whitespace collapsed and ``...`` marking truncated ends), in order of
        appearance.
    """
    extracted_links = []

    markdown_link_pattern = re.compile(r'\[(.*?)\]\((.*?)\)')
    # A target followed by a quoted title, e.g.  /path "Title"  or  /path 'Title'.
    titled_target_pattern = re.compile(r'''^(\S+)\s+(?:"[^"]*"|'[^']*')$''')

    context_window = 150
    seen_links = set()

    for match in markdown_link_pattern.finditer(markdown_text):
        full_match_text = match.group(0)
        link_text = match.group(1).strip()
        raw_target = match.group(2).strip()

        # Drop the optional link title so only the URL part is resolved.
        titled_match = titled_target_pattern.match(raw_target)
        relative_url = titled_match.group(1) if titled_match else raw_target

        if not relative_url:
            continue

        absolute_url = urljoin(source_page_url, relative_url)
        if not absolute_url.startswith(("http://", "https://")):
            continue

        start_index = match.start()
        end_index = match.end()

        context_before_raw = markdown_text[max(0, start_index - context_window):start_index]
        context_after_raw = markdown_text[end_index:min(len(markdown_text), end_index + context_window)]

        context_snippet_raw = f"{context_before_raw}{full_match_text}{context_after_raw}"
        context_snippet = re.sub(r'\s+', ' ', context_snippet_raw).strip()
        # Ellipses mark the sides where the surrounding text was cut off.
        if start_index > context_window:
            context_snippet = "..." + context_snippet
        if end_index + context_window < len(markdown_text):
            context_snippet = context_snippet + "..."

        link_entry = {
            "source_page": source_page_url,
            "link_text": link_text,
            "target_url": absolute_url,
            "context_snippet": context_snippet
        }

        # The serialized entry is the de-duplication key.
        link_hash = json.dumps(link_entry, sort_keys=True)
        if link_hash not in seen_links:
            extracted_links.append(link_entry)
            seen_links.add(link_hash)

    return extracted_links


async def process_page_with_custom_parsing(crawler: AsyncWebCrawler, url: str, content_type: str, output_filename: str, js_script: str = None, wait_for_js_condition: str = None):
    """
    Fetch a page with the crawler and parse it with the matching BeautifulSoup helper.

    The page is fetched with caching bypassed, waiting for network idle, after running
    the optional ``js_script`` and waiting on the optional ``wait_for_js_condition``.
    ``content_type`` selects the parser: ``"faq"``, ``"table"`` (sticky-enabled tables)
    or ``"angular_table"`` (tables with id ``tabledata``). Parsed data is written as
    JSON under OUTPUT_DIR; when parsing yields nothing or the fetch fails, the HTML
    (if any) is dumped to the ``debug_html`` folder. Exceptions are printed, not raised.

    Args:
        crawler: An open crawler used to fetch the page.
        url: Page to fetch.
        content_type: Parser selector, see above.
        output_filename: Base name (without extension) for output and debug files.
        js_script: Optional JavaScript to run in the page.
        wait_for_js_condition: Optional wait condition passed to the crawler.
    """
    print(f"  -> Running CUSTOM BS parsing for {content_type} on {url}")

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_until="networkidle",
        js_code=js_script,
        wait_for=wait_for_js_condition,
    )

    try:
        result = await crawler.arun(url, config=run_config)

        # Guard: nothing to parse when the fetch failed or returned no HTML.
        if not (result.success and result.html):
            print(f"  ❌ Failed to fetch HTML for custom parsing of {content_type} on {url}. Error: {result.error_message}")
            if result.html:
                debug_filepath = os.path.join(OUTPUT_DIR, "debug_html", f"{output_filename}_bs_fetch_failed_html.html")
                with open(debug_filepath, "w", encoding="utf-8") as debug_file:
                    debug_file.write(result.html)
                print(f"    (Debug: Saved problematic HTML to {debug_filepath})")
            return

        # Pick the parser and the output folder for this content type.
        records = []
        target_subdir = None
        if content_type == "faq":
            records = await parse_faq_html_with_bs(result.html)
            target_subdir = "structured_json"
        elif content_type == "table":
            # 'sticky-enabled' tables
            records = await parse_table_html_with_bs(
                result.html, "table.sticky-enabled.tableheader-processed.sticky-table"
            )
            target_subdir = "extracted_tables"
        elif content_type == "angular_table":
            # AngularJS tables are addressed by their specific id.
            records = await parse_angular_table_html_with_bs(result.html, "table#tabledata")
            target_subdir = "extracted_tables"

        if not records:
            print(f"  ❌ Custom BS parsing yielded empty data for {content_type} on {url}. HTML length: {len(result.html)}. Raw HTML snippet (first 1000 chars): {result.html[:1000]}")
            debug_filepath = os.path.join(OUTPUT_DIR, "debug_html", f"{output_filename}_bs_empty_output.html")
            with open(debug_filepath, "w", encoding="utf-8") as debug_file:
                debug_file.write(result.html)
            print(f"    (Debug: Saved problematic HTML for BS parsing to {debug_filepath})")
            return

        filepath = os.path.join(OUTPUT_DIR, target_subdir, f"{output_filename}.json")
        with open(filepath, "w", encoding="utf-8") as out_file:
            json.dump(records, out_file, indent=2)
        print(f"  ✅ Saved custom-parsed {content_type} data to {filepath}")
    except Exception as e:
        print(f"  ❌ Exception during custom BS parsing for {content_type} on {url}: {e}")


async def extract_unstructured_page(crawler: AsyncWebCrawler, url: str, output_filename: str):
    """
    Save a page's text as Markdown and record the links found in that Markdown.

    The text source is chosen in order of preference: the crawler's ``fit_markdown``,
    then its ``raw_markdown``, then plain text pulled from the HTML with BeautifulSoup
    (``<main>``, ``<article>``, ``div#content`` or the whole document). A source is
    only accepted when it holds more than 50 non-blank characters. The Markdown goes to
    the ``unstructured_markdown`` folder, links with context to ``extracted_links``,
    and the HTML is dumped to ``debug_html`` when no usable text is found. The number
    of natively detected tables is reported but the tables are not saved here.
    Exceptions are printed, not raised.

    Args:
        crawler: An open crawler used to fetch the page.
        url: Page to fetch.
        output_filename: Base name (without extension) for the files written.
    """
    print(f"  -> Running UNSTRUCTURED extraction on {url}")

    def _has_content(text):
        """True when `text` holds more than 50 characters after stripping."""
        return bool(text) and len(text.strip()) > 50

    try:
        run_config = CrawlerRunConfig(
            markdown_generator=DefaultMarkdownGenerator(),
            cache_mode=CacheMode.ENABLED,
            process_iframes=True,
            table_score_threshold=0
        )
        result = await crawler.arun(url, config=run_config)

        if not result.success:
            print(f"  ❌ Failed to fetch page content for {url}. Error: {result.error_message}")
            return

        # Choose the best available text source.
        markdown_to_save = None
        if result.markdown and _has_content(result.markdown.fit_markdown):
            markdown_to_save = result.markdown.fit_markdown
            print(f"  ℹ️ Using fit_markdown for {url} (length: {len(markdown_to_save)} chars)")
        elif result.markdown and _has_content(result.markdown.raw_markdown):
            markdown_to_save = result.markdown.raw_markdown
            print(f"  ⚠️ Warning: fit_markdown too short for {url}. Using raw_markdown (length: {len(markdown_to_save)} chars)")
        elif _has_content(result.html):
            page_soup = BeautifulSoup(result.html, 'html.parser')
            content_root = page_soup.find('main') or page_soup.find('article') or page_soup.find('div', id='content')
            if content_root:
                markdown_to_save = content_root.get_text(separator='\n', strip=True)
            else:
                markdown_to_save = page_soup.get_text(separator='\n', strip=True)
            print(f"  ⚠️ Warning: No markdown generated for {url}. Falling back to BeautifulSoup text extraction (length: {len(markdown_to_save)} chars)")

        if _has_content(markdown_to_save):
            filepath = os.path.join(OUTPUT_DIR, "unstructured_markdown", f"{output_filename}.md")
            with open(filepath, "w", encoding="utf-8") as md_file:
                md_file.write(markdown_to_save)
            print(f"  ✅ Saved markdown content to {filepath}")

            # Extract links (with surrounding context) from the saved text.
            page_links = await extract_links_from_markdown_file(markdown_to_save, url)
            if page_links:
                links_filepath = os.path.join(OUTPUT_DIR, "extracted_links", f"{output_filename}_links.json")
                with open(links_filepath, "w", encoding="utf-8") as links_file:
                    json.dump(page_links, links_file, indent=2)
                print(f"  🔗 Extracted {len(page_links)} links with context from markdown to {links_filepath}")
            else:
                print(f"  ➡️ No links found in markdown for {url}.")
        else:
            print(f"  ❌ Failed to extract any meaningful text content from {url} after all attempts.")
            if result.html:
                debug_filepath = os.path.join(OUTPUT_DIR, "debug_html", f"{output_filename}_no_markdown.html")
                with open(debug_filepath, "w", encoding="utf-8") as debug_file:
                    debug_file.write(result.html)
                print(f"    (Debug: Saved problematic HTML to {debug_filepath})")

        # Report (only) what the crawler's native table detection found.
        if not (result.media and "tables" in result.media):
            print(f"  ➡️ No 'tables' key found in result.media for {url}.")
        elif result.media["tables"]:
            print(f"  📊 Native table detection found {len(result.media['tables'])} tables on {url}.")
        else:
            print(f"  ➡️ Native table detection found no tables with threshold (0) on {url}.")
    except Exception as e:
        print(f"  ❌ Exception during unstructured extraction on {url}: {e}")

async def download_document(crawler: AsyncWebCrawler, url: str, output_filename: str):
    """
    Capture ``url`` through the crawler's PDF option and save it under ``OUTPUT_DIR/pdfs``.

    The target file name is the last path segment of the URL (query string removed,
    ``%20`` turned into spaces); ``.pdf`` is appended unless the name already ends in a
    known document extension. ``output_filename`` is only used as the fallback name and
    for debug dumps. When no PDF payload comes back, any HTML that was returned is
    saved to ``OUTPUT_DIR/debug_html``. Errors raised while crawling or saving are
    printed rather than propagated.

    Args:
        crawler: An open crawler used to fetch the URL.
        url: Document URL to capture.
        output_filename: Base name for the fallback file name and debug files.
    """
    print(f"  -> Running DOCUMENT download for {url} using native PDF capture.")

    filename = os.path.basename(url.split('?')[0]).replace('%20', ' ')
    if not filename:
        filename = f"{output_filename}_download"

    if not any(filename.lower().endswith(ext) for ext in [".pdf", ".docx", ".xlsx", ".doc", ".xls"]):
        filename += ".pdf"
    # NOTE(review): for .doc/.docx/.xls/.xlsx URLs the saved bytes come from the PDF
    # capture, so the content presumably is a PDF under an Office extension — confirm.

    pdf_target_path = os.path.join(OUTPUT_DIR, "pdfs", filename)
    os.makedirs(os.path.join(OUTPUT_DIR, "pdfs"), exist_ok=True)

    try:
        config = CrawlerRunConfig(
            pdf=True,
            cache_mode=CacheMode.BYPASS,
            page_timeout=60000
        )
        result = await crawler.arun(url, config=config)

        if result.success and result.pdf:
            pdf_payload = result.pdf
            # NOTE(review): depending on the crawl4ai version the payload is raw PDF
            # bytes or base64 text — confirm. Raw PDF data starts with the "%PDF-"
            # signature and is written as-is; anything else is base64-decoded.
            if isinstance(pdf_payload, (bytes, bytearray)) and bytes(pdf_payload[:5]) == b"%PDF-":
                pdf_bytes = bytes(pdf_payload)
            else:
                pdf_bytes = base64.b64decode(pdf_payload)

            # Binary mode: open() raises ValueError if `encoding` is passed with "wb".
            with open(pdf_target_path, "wb") as f:
                f.write(pdf_bytes)
            print(f"  ✅ Document downloaded successfully: {pdf_target_path} (size: {len(pdf_bytes)} bytes) from {url}")
        else:
            print(f"  ❌ Failed to download document from {url}. Result success: {result.success}. Result.pdf present: {bool(result.pdf)}. Error: {result.error_message}")
            if result.html:
                print(f"  ⚠️ Note: HTML content was retrieved for {url}. This might not be a direct document link and requires further investigation for content extraction.")
                debug_filepath = os.path.join(OUTPUT_DIR, "debug_html", f"{output_filename}_pdf_download_failed_html.html")
                with open(debug_filepath, "w", encoding="utf-8") as f:
                    f.write(result.html)
                print(f"    (Debug: Saved problematic HTML for PDF URL to {debug_filepath})")

    except Exception as e:
        print(f"  ❌ Exception during document download on {url}: {e}")


async def main_orchestrator():
    """
    Read the URL list and hand each URL to the extractor that matches it.

    URLs are read from URL_LIST_FILE (blank lines and repeated entries are ignored)
    and routed, first match wins, in this order:

    1. explicitly excluded URLs (mail links, logout/registration/uops) -> skipped
    2. document URLs (by extension or download path) -> ``download_document``
    3. FAQ pages -> custom FAQ parsing after expanding the answers with JavaScript
    4. 'sticky-enabled' table pages -> custom table parsing
    5. AngularJS catalog pages -> custom Angular table parsing
    6. generic navigational/policy pages -> skipped
    7. everything else (XML, dynamic tab, /internal/ and ordinary pages)
       -> ``extract_unstructured_page``

    Returns:
        None. Progress is reported with ``print``; results are files under OUTPUT_DIR.
    """
    print("🚀 Starting Main Extraction Orchestrator (with Markdown Link Extraction!)...")

    if not os.path.exists(URL_LIST_FILE):
        print(f"❌ Error: '{URL_LIST_FILE}' not found.")
        print("Please ensure you've run the 'run_true_deep_crawl' script first to populate this file.")
        return

    # Blank lines would otherwise become empty URLs handed to the crawler;
    # dict.fromkeys drops repeated entries while keeping first-seen order.
    with open(URL_LIST_FILE, "r", encoding="utf-8") as f:
        urls = list(dict.fromkeys(line.strip() for line in f if line.strip()))

    # --- Routing tables (substring matches unless noted otherwise) ---
    excluded_fragments = ("mailto:", "/internal/logout", "/internal/registration", "/internal/uops")
    document_extensions = (".pdf", ".docx", ".xlsx", ".doc", ".xls")  # matched as URL suffix
    document_path_fragments = ("/docs/", "/look/docs/", "/filebrowser/download/")
    sticky_table_fragments = ("/data-quality", "/calibration-reports", "/validation-reports", "/insitu")
    # Assumed to host the AngularJS product-catalog table.
    angular_table_fragments = tuple(
        f"internal/catalog-{catalog}"
        for catalog in (
            "satellite", "insitu", "radar", "insat3a", "insat3d", "insat3dr", "insat3s",
            "kalpana1", "meghatropiques", "oceansat2", "oceansat3", "saral", "scatsat",
        )
    )
    skipped_page_keywords = ("/about-us", "/contact-us", "policy", "/sitemap", "/gallery", "/rss", "announcements")

    browser_config = BrowserConfig(headless=True, accept_downloads=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            print(f"\nProcessing URL: {url}")

            # Build a file-system friendly name from the URL.
            safe_filename = url.replace('https://', '').replace('http://', '')
            for separator in "/?&=.":
                safe_filename = safe_filename.replace(separator, '_')
            safe_filename = safe_filename.replace('__', '_')
            if not safe_filename:
                safe_filename = "index"

            lowered_url = url.lower()

            # --- ROUTING LOGIC ---

            # Explicitly skip known problematic/irrelevant URLs from content extraction.
            if any(fragment in url for fragment in excluded_fragments):
                print(f"  -> Skipping explicitly excluded URL for content extraction: {url}")

            # Prioritize direct document downloads for file extensions and known download paths
            elif lowered_url.endswith(document_extensions) or \
                 any(fragment in lowered_url for fragment in document_path_fragments):
                print(f"  -> Detected document for download: {url}")
                await download_document(crawler, url, f"doc_{safe_filename}")

            # --- FAQ Extraction (Using Custom BeautifulSoup Parsing with aggressive cleaning) ---
            elif "/faq-page" in url:
                print(f"  -> Detected FAQ page. Running CUSTOM BeautifulSoup extraction for: {url}")
                # Click every visible collapsed question so its answer is rendered.
                js_script_faq_expand = """
                    (async () => {
                        const questionElements = document.querySelectorAll('div.faq-question.faq-dt-hide-answer');
                        for (let i = 0; i < questionElements.length; i++) {
                            const el = questionElements[i];
                            if (el.offsetParent !== null) {
                                el.click();
                                await new Promise(resolve => setTimeout(resolve, 750));
                            }
                        }
                    })();
                """
                wait_for_faq_content = "js:() => document.querySelectorAll('div.faq-answer:not(.collapsed)').length > 0"

                await process_page_with_custom_parsing(crawler, url, "faq", f"faq_{safe_filename}", js_script=js_script_faq_expand, wait_for_js_condition=wait_for_faq_content)

            # --- Targeted Table Extraction (Using Custom BeautifulSoup Parsing for sticky-enabled) ---
            elif any(fragment in url for fragment in sticky_table_fragments):
                print(f"  -> Detected 'sticky-enabled' table page. Running CUSTOM BeautifulSoup extraction for: {url}")
                await process_page_with_custom_parsing(crawler, url, "table", f"table_{safe_filename}", js_script=None, wait_for_js_condition=None)

            # --- Targeted AngularJS Table Extraction (Using Custom BeautifulSoup Parsing) ---
            elif any(fragment in url for fragment in angular_table_fragments):
                print(f"  -> Detected AngularJS table page. Running CUSTOM BeautifulSoup extraction for: {url}")

                # JavaScript wait for the first dynamically rendered cell to have content
                js_wait_for_angular_content = "js:() => document.querySelector('table#tabledata tbody tr:first-child td:nth-child(2) b').textContent.trim().length > 0"

                await process_page_with_custom_parsing(
                    crawler,
                    url,
                    "angular_table",  # Content type identifier for dispatching
                    f"angular_table_{safe_filename}",
                    js_script=None,  # No specific Angular interaction JS needed, relying on load state and wait_for
                    wait_for_js_condition=js_wait_for_angular_content
                )

            # Generic skipping of common navigational or less content-rich pages
            elif any(keyword in url for keyword in skipped_page_keywords):
                print(f"  -> Skipping generic navigational/policy page: {url}")

            # Handle XML files as unstructured text
            elif lowered_url.endswith(".xml"):
                print(f"  -> Treating XML file as UNSTRUCTURED text. URL: {url}")
                await extract_unstructured_page(crawler, url, f"xml_page_{safe_filename}")

            # Dynamic tabbed content pages (e.g., /node?qt-latest_products=1)
            elif "node?qt" in url:
                print(f"  -> Treating dynamic tab page as UNSTRUCTURED for now. URL: {url}")
                await extract_unstructured_page(crawler, url, f"dynamic_tab_{safe_filename}")

            # Internal section pages (e.g., /internal/aws, /internal/cyclone)
            elif "/internal/" in url:
                print(f"  -> Treating /internal/ page as UNSTRUCTURED for now. URL: {url}")
                await extract_unstructured_page(crawler, url, f"internal_page_{safe_filename}")

            else:
                # Default to unstructured extraction for all other HTML pages not specifically routed.
                await extract_unstructured_page(crawler, url, f"page_{safe_filename}")

    print("\n--- Orchestrator Finished! ---")
    print(f"Check the '{OUTPUT_DIR}' directory for extracted content.")

# Run the extraction. A bare top-level `await` works here because notebook cells
# (IPython/Colab) execute inside an already-running event loop; in a plain Python
# script use asyncio.run(main_orchestrator()) instead.
await main_orchestrator()

Output hidden; open in https://colab.research.google.com to view.