The pipeline involves:

(1) querying the Serper API to find a website’s sitemap,

(2) extracting the general sitemap URL,

(3) fetching the sitemap content,

(4) saving it as a text file, and

(5) using Gemini to parse the sitemap into a JSON hierarchy. Here’s how to automate it:


In [None]:
%pip install requests
%pip install google-generativeai
import requests
import json
import google.generativeai as genai
from bs4 import BeautifulSoup
from urllib.parse import urlparse

import os
%pip install python-dotenv
from dotenv import load_dotenv

# Load environment variables; python-dotenv's load_dotenv() is expected to
# read them from a local .env file (TODO confirm where that file lives).
load_dotenv()

# API credentials come from the environment rather than being hard-coded.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
SERPER_API_KEY = os.environ.get('SERPER_API_KEY')

# Fail fast with a clear message if either key is missing or empty.
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY is not set")
if not SERPER_API_KEY:
    raise ValueError("SERPER_API_KEY is not set")

# Configure Gemini API
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash')

# Companies to process (only two are listed here; extend the list as needed)
companies = [
    "Sony",
    "Nintendo",
]

def search_sitemap(company_name):
    """Search for a company's sitemap using Serper API.

    Sends the query "<company_name> sitemap" to Serper, then probes each
    organic result link (unchanged and with common sitemap suffixes appended)
    and returns the first URL that answers with HTTP 200.

    Args:
        company_name (str): Company name used to build the search query.

    Returns:
        str | None: First probed URL that returned status 200, or None when
        the search request failed or no candidate responded with 200.
    """
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": f"{company_name} sitemap"})
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }
    try:
        # A timeout keeps one stalled search from hanging the whole batch.
        response = requests.post(url, headers=headers, data=payload, timeout=10)
        response.raise_for_status()
        search_results = json.loads(response.text)
    except (requests.RequestException, ValueError) as e:
        # Report and return None so the caller's "not found" path handles it.
        print(f"  Serper search failed for {company_name}: {e}")
        return None

    organic = search_results.get('organic') if isinstance(search_results, dict) else None
    # Skip malformed results instead of raising KeyError on a missing 'link'.
    candidate_urls = [
        result['link']
        for result in (organic or [])
        if isinstance(result, dict) and isinstance(result.get('link'), str)
    ]

    suffixes = ['', '/sitemap.xml', '/site-map', '/sitemap/']
    for base_url in candidate_urls:
        for suffix in suffixes:
            test_url = base_url.rstrip('/') + suffix
            try:
                resp = requests.get(test_url, timeout=5)
            except Exception:
                # Best-effort probe: an unreachable candidate is simply skipped.
                continue
            if resp.status_code == 200:
                return test_url
    return None

def fetch_sitemap_content(sitemap_url):
    """Fetch the content of the sitemap.

    Args:
        sitemap_url (str): URL to download.

    Returns:
        str: The response body, or "" when the request fails or the server
        answers with a non-200 status. Returning "" lets callers test the
        result for truthiness instead of analysing an error page.
    """
    try:
        # Bounded wait so a slow host cannot stall the batch indefinitely.
        response = requests.get(sitemap_url, timeout=10)
    except requests.RequestException as e:
        print(f"  Error fetching {sitemap_url}: {e}")
        return ""
    if response.status_code != 200:
        print(f"  Could not fetch {sitemap_url}. Status: {response.status_code}")
        return ""
    return response.text

def analyze_with_gemini(sitemap_content):
    """Analyze the sitemap content using Gemini API.

    Appends the content to a fixed prompt, sends it to the module-level
    ``model`` and parses the outermost ``{...}`` span of the reply as JSON.

    Args:
        sitemap_content (str): Raw HTML or XML text to analyse.

    Returns:
        The parsed JSON value (a dict when the model follows the prompt), or
        None when the input is empty, the API call fails, or the reply
        contains no parseable JSON object.
    """
    prompt = """
    Scrape this HTML or XML sitemap and give me the sitemap hierarchy in JSON format.
    Use a 2-level hierarchy: section and subsections (no URLs).
    Make sure to use the sitemap section that includes investors.
    Focus on common categories like About, Investors, News, Products, Services, Store.

    Output example:
    {
      "sitemap": {
        "Section1": {
          "subsections": ["SubsectionA", "SubsectionB"]
        },
        "Section2": {
          "subsections": ["SubsectionC", "SubsectionD"]
        }
      }
    }
    Only output valid JSON.
    Here's the sitemap content:
    """
    if not sitemap_content:
        # Nothing to analyse; avoid spending an API call on the bare prompt.
        print("Error parsing Gemini output: empty sitemap content")
        return None

    try:
        # Both the request and the .text accessor can raise (quota, blocked
        # or empty replies), so they are guarded together.
        response = model.generate_content(prompt + sitemap_content)
        raw_text = response.text
    except Exception as e:
        print("Error calling Gemini:", e)
        return None

    # Try to extract only the JSON part
    json_start = raw_text.find('{')
    json_end = raw_text.rfind('}') + 1
    if json_start == -1 or json_end <= json_start:
        print("Error parsing Gemini output: no JSON object found in response")
        return None
    try:
        return json.loads(raw_text[json_start:json_end])
    except json.JSONDecodeError as e:
        print("Error parsing Gemini output:", e)
        return None

def save_to_file(analysis_data, company_name):
    """
    Save analysis data to a JSON file in the current directory.

    Args:
        analysis_data (dict): Analysis data to save
        company_name (str): Company name for filename; spaces and path
            separators are replaced with underscores

    Returns:
        str: Path to the saved file, or None if serialising or writing failed
    """
    # Generate filename. Path separators are replaced as well as spaces so a
    # name such as "A/B" cannot point into another directory.
    safe_name = company_name.replace(' ', '_').replace('/', '_').replace('\\', '_')
    filepath = f"{safe_name}_sitemap_analysis.json"

    # Save to file
    try:
        # Serialise before opening the file so a non-serialisable value does
        # not leave a truncated file behind.
        payload = json.dumps(analysis_data, indent=2, ensure_ascii=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        print(f"  Error saving file {filepath}: {e}")
        return None
    print(f"  File saved: {filepath}")
    return filepath

def _homepage_from_sitemap_url(sitemap_url):
    """Derive the site's (possibly locale-prefixed) homepage URL from a sitemap URL."""
    parsed = urlparse(sitemap_url)
    path = ''
    # Only the path is searched, so a host such as "sitemap.example.com"
    # cannot be mistaken for the sitemap marker.
    for marker in ('/sitemap', '/site-map'):
        if marker in parsed.path:
            path = parsed.path.split(marker)[0]
            break
    return f"{parsed.scheme}://{parsed.netloc}{path.rstrip('/')}/"


def run_sitemap_analysis(company_name):
    """Find, analyse and save the sitemap hierarchy for one company.

    Steps: locate a sitemap URL, download it, ask Gemini for the section
    hierarchy and save the result. Two fallbacks analyse a ``<footer>``
    element instead: the homepage footer when the sitemap download is empty,
    and the sitemap page's own footer when Gemini cannot parse the sitemap.
    Handled failures are reported with ``print``.

    Args:
        company_name (str): Company to process; also used for the output filename.

    Returns:
        None
    """
    print(f"\nProcessing: {company_name}")
    sitemap_url = search_sitemap(company_name)
    if not sitemap_url:
        print("  Could not find sitemap URL.")
        return
    print(f"  Found sitemap URL: {sitemap_url}")

    sitemap_content = fetch_sitemap_content(sitemap_url)
    if not sitemap_content:
        print("  Could not fetch sitemap content.")
        # Fallback: Try homepage footer extraction
        homepage_url = _homepage_from_sitemap_url(sitemap_url)
        print(f"  Trying to fetch homepage footer: {homepage_url}")
        try:
            resp = requests.get(homepage_url, timeout=10)
            if resp.status_code != 200:
                print(f"  Could not fetch homepage HTML. Status: {resp.status_code}")
                return
            footer = BeautifulSoup(resp.text, 'lxml').find('footer')
            if footer is None:
                print("  No <footer> tag found on homepage.")
                return
            print("  Found <footer> tag. Analyzing with Gemini...")
            analysis_result = analyze_with_gemini(str(footer))
            if analysis_result:
                save_to_file(analysis_result, company_name)
            else:
                print("  Gemini analysis of footer failed.")
        except Exception as e:
            print(f"  Error fetching homepage/footer: {e}")
        return

    print("  Analyzing sitemap with Gemini...")
    analysis_result = analyze_with_gemini(sitemap_content)
    if analysis_result:
        save_to_file(analysis_result, company_name)
        return

    print("  Gemini analysis failed.")
    # Fallback: analyse only the <footer> of the sitemap page. The page was
    # already downloaded above, so it is parsed directly rather than being
    # requested a second time.
    try:
        footer = BeautifulSoup(sitemap_content, 'lxml').find('footer')
        if footer is None:
            print("  No <footer> tag found in sitemap page.")
            return
        print("  Found <footer> tag in sitemap page. Analyzing with Gemini...")
        analysis_result = analyze_with_gemini(str(footer))
        if analysis_result:
            save_to_file(analysis_result, company_name)
        else:
            print("  Gemini analysis of sitemap footer failed.")
    except Exception as e:
        print(f"  Error extracting sitemap footer: {e}")

# --- Batch process all companies ---
# Companies are processed one after another, in list order.
for company in companies:
    run_sitemap_analysis(company)

# run_sitemap_analysis reports its failures with print() and returns nothing,
# so this closing message appears even if some companies produced no file.
print("\nAll done! Each company's sitemap structure is saved as a JSON file in the current directory.")

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm



Processing: Sony
  Found sitemap URL: https://www.sony.com/en/sitemap
  Analyzing sitemap with Gemini...
Error parsing Gemini output: Expecting value: line 1 column 1 (char 0)
  Gemini analysis failed.
  Could not fetch sitemap HTML for footer. Status: 403

All done! Each company's sitemap structure is saved as a JSON file in the current directory.


Use the existing sitemap analysis JSON files to automatically generate a "real training set" as few-shot examples, which are dynamically inserted into the prompt.

Steps:
Aggregate all sitemap analysis JSON files to extract the mappings between sections/subsections and their actual company-level categories (e.g. from your previous manual classification).

Automatically select representative section names under each major category as few-shot examples, and insert them at the beginning of the prompt.

For each batch of URL classification, include these real examples in the prompt to encourage the LLM to imitate the true distribution.

This approach significantly increases the likelihood that the LLM will mimic the true label distribution and greatly reduces misclassification into "Other".

(Previously, even with detailed prompts and examples, the LLM still classified many obvious "Investors" or "About" sections as "Other" during batch classification. This is because the LLM lacks global context when processing in batches—it cannot "remember" the full structure and real classification logic across all websites. It relies solely on the examples and limited context provided in the prompt.)

The LLM also tends to misclassify short or vague terms (e.g. Board, Leadership) as "Other" when there is insufficient context.

Furthermore:

The LLM treats few-shot examples only as references, not as strict constraints.

It relies more on the batch content and the current prompt than on the actual historical label distribution.

The LLM's classification ability is inherently limited, especially when dealing with industry jargon or ambiguous terms.


In [None]:
# Install dependencies
%pip install google-generativeai pandas tqdm

import os
import glob
import json
import pandas as pd
import re
from collections import Counter, defaultdict
import google.generativeai as genai
from tqdm import tqdm

# 1. Walk every saved sitemap analysis and tally its section / subsection names
section_counter = Counter()
section_to_subsections = defaultdict(list)
json_files = glob.glob("*_sitemap_analysis.json")

for path in json_files:
    with open(path, "r", encoding="utf-8") as handle:
        try:
            parsed = json.load(handle)
            for section, entry in parsed.get("sitemap", {}).items():
                # Count each non-empty section name once per file it appears in.
                if section and isinstance(section, str):
                    section_counter[section.strip()] += 1
                # Entries that are not dicts contribute no subsections.
                if not isinstance(entry, dict):
                    continue
                for name in entry.get("subsections", []):
                    if name and isinstance(name, str):
                        section_to_subsections[section.strip()].append(name.strip())
        except Exception as err:
            # A malformed file is reported and skipped; earlier files still count.
            print(f"Error in {path}: {err}")

# 2. Use LLM to merge all section synonyms (not just top-N)
all_sections = list(section_counter)
def merge_synonyms_llm(names, model, level="section", max_retries=3, retry_delay=20.0):
    """Ask the LLM to group synonymous names under one standard label.

    Args:
        names: Iterable of section or subsection names to merge.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``.text`` attribute.
        level (str): Word used in the prompt to describe the names.
        max_retries (int): Maximum number of LLM attempts.
        retry_delay (float): Seconds to wait after the first failed attempt;
            the wait doubles after each further failure. Pass 0 to disable.

    Returns:
        dict: ``{group_label: [names in this group, ...]}``, or ``{}`` when
        ``names`` is empty or every attempt failed.
    """
    import time  # local import: this cell's import list does not include time

    names = list(names)
    if not names:
        # Nothing to merge, so do not spend an API call.
        return {}

    prompt = (
        f"You are a website taxonomy expert. Here is a list of {level} names from many real company sitemaps. "
        "Please group them by meaning (synonyms, similar intent, or spelling variants), and for each group, pick the most common or most standard English name as the group label. "
        "Output a JSON object: {group_label: [all names in this group, ...]}. "
        "Do not invent new categories, just merge similar ones. "
        "Here are the names:\n"
        + "\n".join(f"- {n}" for n in names)
        + "\n\nJSON:"
    )
    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            text = response.text
            match = re.search(r"\{[\s\S]*\}", text)
            groups = json.loads(match.group(0) if match else text)
            if isinstance(groups, dict):
                return groups
            # Callers iterate the result with .items(), so a list or scalar
            # reply counts as a failed attempt.
            print("LLM merge failed, retrying...", "response was not a JSON object")
        except Exception as e:
            print("LLM merge failed, retrying...", e)
        if attempt < max_retries - 1:
            # Back off before retrying: an immediate retry after a rate-limit
            # error fails again for the same reason.
            time.sleep(retry_delay * (2 ** attempt))
    return {}

section_groups = merge_synonyms_llm(all_sections, model, level="section")

# 3. Map every original section name onto its standardized group label
section_name_to_main = {
    variant.strip(): label.strip()
    for label, variants in section_groups.items()
    for variant in variants
}

# 4. Pool subsections under their standardized section, then merge them with the LLM
main_section_to_subsections = defaultdict(list)
for raw_section, raw_subs in section_to_subsections.items():
    # Sections the LLM did not group keep their own name.
    canonical = section_name_to_main.get(raw_section, raw_section)
    main_section_to_subsections[canonical] += raw_subs

universal_framework = {}
for main_section, subs in main_section_to_subsections.items():
    # Distinct subsection names, in order of first appearance.
    distinct_subs = list(dict.fromkeys(subs))
    if not distinct_subs:
        continue
    # One LLM call per section merges that section's subsection synonyms.
    sub_groups = merge_synonyms_llm(distinct_subs, model, level="subsection")
    sub_name_to_main = {
        variant.strip(): label.strip()
        for label, variants in sub_groups.items()
        for variant in variants
    }
    # Frequency of each standardized subsection over all raw occurrences.
    sub_freq = Counter(sub_name_to_main.get(name, name) for name in subs)
    # Only keep subsections that appear at least twice (adjust as needed)
    kept = sorted(label for label, count in sub_freq.items() if count >= 2)
    if kept:
        universal_framework[main_section] = kept

# 5. Output the result
# NOTE(review): a later cell in this notebook opens
# "universal_website_category_framework.json" (no "_graph" suffix) — confirm
# which filename is intended.
with open("universal_website_category_framework_graph.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII names as-is instead of \u escapes.
    json.dump(universal_framework, f, indent=2, ensure_ascii=False)
print("Done! universal website category framework saved to universal_website_category_framework_graph.json")
# Echo the framework so it can be inspected directly in the cell output.
print(json.dumps(universal_framework, indent=2, ensure_ascii=False))


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
LLM merge failed, retrying... 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 46
}
]
LLM merge failed, retrying... 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "genera

In [None]:
# Install dependencies if needed
# %pip install lxml beautifulsoup4 google-generativeai

import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
import json
import random
import re
import time
from urllib.parse import urljoin

# 1. Read the universal website category framework from disk
# NOTE(review): the framework-building cell writes
# "universal_website_category_framework_graph.json" — confirm this filename is intended.
with open("universal_website_category_framework.json", "r", encoding="utf-8") as framework_file:
    framework = json.loads(framework_file.read())

# Flatten {section: [subsection, ...]} into (section, subsection) pairs.
section_subsections = [
    (section, sub)
    for section, subs in framework.items()
    for sub in subs
]


# 2. Ask which company to classify
company_name = input("Enter company name: ")

# 3. Search for sitemap URL (reuse your function if available)
def search_sitemap(company_name):
    """Search for a company's sitemap using the Serper API.

    Sends the query "<company_name> sitemap" to Serper, then probes each
    organic result link (unchanged and with common sitemap suffixes appended)
    and returns the first URL that answers with HTTP 200.

    Args:
        company_name (str): Company name used to build the search query.

    Returns:
        str | None: First probed URL that returned status 200, or None when
        the search request failed or no candidate responded with 200.
    """
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": f"{company_name} sitemap"})
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }
    try:
        # A timeout keeps a stalled search request from hanging the cell.
        response = requests.post(url, headers=headers, data=payload, timeout=10)
        response.raise_for_status()
        search_results = json.loads(response.text)
    except (requests.RequestException, ValueError) as e:
        # Report and return None so the caller's "not found" path handles it.
        print(f"Serper search failed for {company_name}: {e}")
        return None

    organic = search_results.get('organic') if isinstance(search_results, dict) else None
    # Skip malformed results instead of raising KeyError on a missing 'link'.
    candidate_urls = [
        result['link']
        for result in (organic or [])
        if isinstance(result, dict) and isinstance(result.get('link'), str)
    ]

    suffixes = ['', '/sitemap.xml', '/site-map', '/sitemap/']
    for base_url in candidate_urls:
        for suffix in suffixes:
            test_url = base_url.rstrip('/') + suffix
            try:
                resp = requests.get(test_url, timeout=5)
            except Exception:
                # Best-effort probe: an unreachable candidate is simply skipped.
                continue
            if resp.status_code == 200:
                return test_url
    return None

sitemap_url = search_sitemap(company_name)
if not sitemap_url:
    print("Could not find sitemap URL.")
    # Nothing below can run without a sitemap URL, so stop here. `exit()` is
    # a helper the `site` module provides for interactive sessions; raising
    # SystemExit is the plain-Python way to stop.
    raise SystemExit
print("Sitemap URL:", sitemap_url)

# 4. Fetch URLs from sitemap (your function, works for XML/HTML)
def fetch_urls_from_sitemap(sitemap_url, max_links=100, _visited=None):
    """Collect up to ``max_links`` page URLs from an XML or HTML sitemap.

    XML sitemaps are read from their ``<loc>`` elements; a sitemap index is
    followed recursively into its child sitemaps. Anything else is treated as
    an HTML page and its ``<a href>`` links are collected.

    Args:
        sitemap_url (str): URL of the sitemap (XML, sitemap index or HTML).
        max_links (int): Maximum number of URLs to return.
        _visited (set | None): Internal. Sitemap URLs already fetched during
            this traversal, so an index that references itself cannot recurse
            forever.

    Returns:
        list[str]: Unique URLs in the order they were discovered. The list
        is empty, or partial, when fetching or parsing fails.
    """
    if _visited is None:
        _visited = set()
    if max_links <= 0 or sitemap_url in _visited:
        return []
    _visited.add(sitemap_url)

    found = {}  # insertion-ordered dict used as an ordered set
    try:
        resp = requests.get(sitemap_url, timeout=10)
        if resp.status_code != 200:
            print(f"Failed to fetch {sitemap_url}")
            return []
        content_type = resp.headers.get("Content-Type", "")
        text = resp.text
        # Try XML first
        if sitemap_url.endswith('.xml') or 'xml' in content_type:
            soup = BeautifulSoup(text, "lxml-xml")
            sitemap_tags = soup.find_all("sitemap")
            if sitemap_tags:
                # Sitemap index
                for sitemap in sitemap_tags:
                    remaining = max_links - len(found)
                    if remaining <= 0:
                        break
                    loc = sitemap.find("loc")
                    # <loc> text is often padded with whitespace or newlines.
                    child_url = loc.get_text(strip=True) if loc is not None else ""
                    if not child_url:
                        continue
                    child_urls = fetch_urls_from_sitemap(
                        child_url, max_links=remaining, _visited=_visited
                    )
                    for child in child_urls:
                        found.setdefault(child, None)
            else:
                # Regular XML sitemap
                for loc in soup.find_all("loc"):
                    if len(found) >= max_links:
                        break
                    page_url = loc.get_text(strip=True)
                    if page_url:
                        found.setdefault(page_url, None)
        else:
            # Try HTML sitemap
            soup = BeautifulSoup(text, "lxml")
            for a in soup.find_all("a", href=True):
                if len(found) >= max_links:
                    break
                href = a['href'].strip()
                if not href or href.startswith("#"):
                    # Same-page anchors are not separate pages.
                    continue
                # urljoin resolves every relative form; the scheme check then
                # drops mailto:, javascript:, tel: and similar links.
                full_url = urljoin(sitemap_url, href)
                if full_url.startswith(("http://", "https://")):
                    found.setdefault(full_url, None)
    except Exception as e:
        print(f"Error fetching/parsing sitemap: {e}")
    return list(found)[:max_links]

max_links = 100
sitemap_urls = fetch_urls_from_sitemap(sitemap_url, max_links=max_links)
print(f"Found {len(sitemap_urls)} URLs in sitemap.")

# 5. Prepare mapping examples from the universal framework
# Built from `framework`, which step 1 of this cell loads, so the cell does
# not depend on a variable created by a different cell.
# NOTE(review): mapping_examples is not referenced by the code visible below —
# confirm whether it should be added to the prompt or removed.
mapping_examples = []
for section, subs in framework.items():
    if not subs:
        mapping_examples.append(f"  '{section}' -> section: {section}, subsection: null")
        continue
    mapping_examples.extend(
        f"  '{section} > {sub}' -> section: {section}, subsection: {sub}"
        for sub in subs
    )

# 6. LLM mapping function
def extract_json_from_text(text):
    """Extract the first JSON object embedded in an LLM reply.

    Markdown code fences are stripped, then decoding is attempted at each
    ``{`` in turn. Using the JSON decoder means nested objects are returned
    whole rather than as an inner fragment.

    Args:
        text (str): Raw model output, possibly fenced or surrounded by prose.

    Returns:
        The first decodable JSON object (dict). If the text contains no
        decodable object, the whole text parsed as JSON (for example a
        list). None if nothing parses or ``text`` is empty.
    """
    if not text:
        return None
    text = text.strip()
    if text.startswith("```json"):
        text = text[7:]
    if text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete value starting at `start` and
            # ignores whatever follows it.
            obj, _end = decoder.raw_decode(text, start)
            return obj
        except ValueError:
            start = text.find("{", start + 1)

    # No object found: fall back to parsing the whole reply.
    try:
        return json.loads(text)
    except ValueError:
        return None

# For privacy reasons, the company in the example below has been replaced with a placeholder.
def llm_url_to_framework_category_single(urls, section_subsections, model, max_retries=3, retry_delay=3):
    """Classify URLs one at a time into (section, subsection) pairs via the LLM.

    Args:
        urls: Iterable of URL strings to classify.
        section_subsections: Iterable of ``(section, subsection)`` pairs the
            model is told to choose from.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``.text`` attribute.
        max_retries (int): Maximum LLM attempts per URL.
        retry_delay (float): Seconds to wait after the first failed request;
            the wait doubles after each further failure.

    Returns:
        dict: Merged JSON objects returned by the model, expected to map each
        URL to a ``[section, subsection]`` list. URLs for which no attempt
        produced a JSON object are absent.
    """
    options_str = "\n".join([f"- {sec} / {sub}" for sec, sub in section_subsections])
    example = (
        '{\n'
        '  "https://www.xxxxxxx.com/about-us/index": ["About Us", "About Us"],\n'
        '  "https://www.xxxxxxx.com/investors/shareholders/registration-services": ["Investors", "Investor Relations"]\n'
        '}'
    )
    # Everything except the URL line is the same for every request, so it is
    # assembled once instead of once per URL.
    prompt_head = (
        "You are a website taxonomy expert. For the following URL, "
        "assign it to the most appropriate (section, subsection) from the universal website category framework below. "
        "You MUST choose from the provided pairs, do NOT invent new ones. Do NOT use 'Other'.\n\n"
        "Valid (section, subsection) pairs:\n"
        f"{options_str}\n\n"
        "Output as a JSON object mapping the URL to a [section, subsection] list. Example:\n"
        f"{example}\n"
        "URL:\n"
    )
    result = {}
    for url in urls:
        prompt = f"{prompt_head}- {url}\n\nJSON:"
        for attempt in range(max_retries):
            try:
                response = model.generate_content(prompt)
                text = response.text
                print("LLM raw output:\n", text)
                parsed = extract_json_from_text(text)
            except Exception as e:
                print(f"Error: {e}, retrying...")
                if attempt < max_retries - 1:
                    # Wait longer after each failure so rate limits can clear.
                    time.sleep(retry_delay * (2 ** attempt))
                continue
            # Only a non-empty JSON object can be merged into the result.
            if isinstance(parsed, dict) and parsed:
                result.update(parsed)
                break
            print("No JSON found in LLM response, retrying...")
        else:
            # Report the URL instead of dropping it silently.
            print(f"Giving up on {url} after {max_retries} attempts.")
    return result
    

# 7. Point the Gemini client at the API key and choose the model
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash')

# 8. Classify every sitemap URL, then preview the first ten assignments
classified_urls = llm_url_to_framework_category_single(
    urls=sitemap_urls,
    section_subsections=section_subsections,
    model=model,
)
print(f"Classified {len(classified_urls)} URLs.")
preview = list(classified_urls.items())[:10]
for pair in preview:
    print(pair)

# 9. Write the mapping to a JSON file named after the company
# Spaces and slashes in the company name both become underscores.
safe_company_name = company_name.translate(str.maketrans(" /", "__"))
filename = "url_to_universal_category_" + safe_company_name + ".json"
with open(filename, "w", encoding="utf-8") as out_file:
    json.dump(classified_urls, out_file, indent=2, ensure_ascii=False)
print(f"Saved to {filename}")

In [None]:
# Evaluation report over every saved url_to_universal_category_*.json file.
import glob
import json
import random
from collections import Counter


def _split_label(value):
    """Return (section, subsection) from one classification value.

    Missing positions, or a value that is not a list/tuple, yield None so a
    single malformed entry cannot abort the whole report.
    """
    if not isinstance(value, (list, tuple)):
        return None, None
    section = value[0] if len(value) > 0 else None
    subsection = value[1] if len(value) > 1 else None
    return section, subsection


# Parse each result file once; every metric below reuses the parsed data.
result_files = glob.glob("url_to_universal_category_*.json")
results = []
for file in result_files:
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, dict):
        results.append(data)
    else:
        print(f"Skipping {file}: expected a JSON object mapping URL -> [section, subsection]")

# Section label of every well-formed entry, grouped per file.
sections_per_file = [
    [s for s in (_split_label(v)[0] for v in data.values()) if isinstance(s, str)]
    for data in results
]

#Coverage
total_urls = sum(len(data) for data in results)
# Malformed entries count towards the total but not as classified.
classified_urls = sum(1 for secs in sections_per_file for s in secs if s != "Other")

coverage = classified_urls / total_urls if total_urls else 0
print(f"Coverage: {coverage:.2%} ({classified_urls}/{total_urls})")

#Diversity, sections used
diversity_list = [len({s for s in secs if s != "Other"}) for secs in sections_per_file]
if diversity_list:
    print(f"Average category diversity per company: {sum(diversity_list)/len(diversity_list):.2f}")
else:
    # Without this guard an empty result set raises ZeroDivisionError.
    print("Average category diversity per company: n/a (no result files found)")

#show most common section
all_sections = [s for secs in sections_per_file for s in secs]

section_counter = Counter(all_sections)
total = sum(section_counter.values())
for section, count in section_counter.most_common():
    print(f"{section}: {count} ({count/total:.2%})")

#visualisation
if section_counter:
    import matplotlib.pyplot as plt
    labels, values = zip(*section_counter.most_common())
    plt.figure(figsize=(10,5))
    plt.bar(labels, values)
    plt.xticks(rotation=45)
    plt.title("Distribution of Sections Across All Companies")
    plt.ylabel("Number of Pages")
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so skip the chart entirely.
    print("No classified sections to plot.")

#consistency
# Print up to five random assignments per company for manual spot-checking.
sampled = []
for data in results:
    sampled += random.sample(list(data.items()), min(5, len(data)))
for url, value in sampled:
    section, subsection = _split_label(value)
    print(f"{url} -> {section} / {subsection}")