In [6]:
import requests
from bs4 import BeautifulSoup
import time
import json
from textwrap import wrap

In [3]:
BASE_URL = "https://cloud.google.com"
DOCS_URL = "https://cloud.google.com/docs"

headers = {
    "User-Agent": "Mozilla/5.0 (compatible; GoogleCloudScraper/1.0)"
}

In [None]:
# Scrap the Google Cloud documentation
def get_doc_links(start_url):
    """Scrape documentation article links from the main docs page."""
    print(f"Fetching links from {start_url}...")
    links = set()
    res = requests.get(start_url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    for a in soup.select("a[href^='/']"):
        href = a.get("href")
        if "/docs" in href and "/docs/" in href and not href.endswith("/docs"):
            full_url = BASE_URL + href.split("#")[0]
            links.add(full_url)

    return list(links)

def scrape_article(url):
    """Scrape title and main content from an article URL."""
    print(f"Scraping: {url}")
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    title = soup.find("h1").get_text(strip=True) if soup.find("h1") else "No title"
    paragraphs = soup.select("main p, main li")

    content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    return {
        "url": url,
        "title": title,
        "content": content
    }

def scrape_docs():
    doc_links = get_doc_links(DOCS_URL)
    docs_data = []

    for url in doc_links:
        try:
            data = scrape_article(url)
            if data["content"]:
                docs_data.append(data)
            time.sleep(1)  # Politeness delay
        except Exception as e:
            print(f"Error scraping {url}: {e}")

    with open("google_cloud_docs.json", "w", encoding="utf-8") as f:
        json.dump(docs_data, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(docs_data)} articles.")

if __name__ == "__main__":
    scrape_docs()

Fetching links from https://cloud.google.com/docs...
Scraping: https://cloud.google.com/resource-manager/docs/organization-policy/overview
Scraping: https://cloud.google.com/vertex-ai/docs/training-overview
Scraping: https://cloud.google.com/docs/authentication
Scraping: https://cloud.google.com/distributed-cloud/hosted/docs/latest/gdch
Scraping: https://cloud.google.com/free/docs/free-cloud-features
Scraping: https://cloud.google.com/vertex-ai/docs/evaluation/introduction
Scraping: https://cloud.google.com/network-connectivity/docs/router
Scraping: https://cloud.google.com/docs/application-hosting
Scraping: https://cloud.google.com/vertex-ai/docs/featurestore/latest/overview
Scraping: https://cloud.google.com/sql/docs/mysql
Scraping: https://cloud.google.com/carbon-footprint/docs/view-carbon-data
Scraping: https://cloud.google.com/vertex-ai/docs/training/overview
Scraping: https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/overview
Scraping: https://cloud.google.com/

In [None]:
# Chunk the scraped documents
# Split large documents into smaller, meaningful chunks (100–300 words each) for better retrieval.

with open("google_cloud_docs.json", "r") as f:
    docs = json.load(f)
chunked_docs = []
CHUNK_SIZE = 300  # words

for doc in docs:
    words = doc["content"].split()
    chunks = wrap(" ".join(words), CHUNK_SIZE * 6)  # ~6 chars per word
    for i, chunk in enumerate(chunks):
        chunked_docs.append({
            "source_url": doc["url"],
            "title": doc["title"],
            "chunk_id": f"{doc['url']}#chunk-{i}",
            "content": chunk
        })

# Optional: save
with open("chunked_docs.json", "w") as f:
    json.dump(chunked_docs, f, indent=2)
