In [None]:
import requests
import json
import zipfile
import os
import time
from google.colab import files

# ===============================
# 1. ARRAY OF FIRECRAWL API KEYS
# ===============================
# Keys may still be pasted into the literal below, but prefer the
# FIRECRAWL_API_KEYS environment variable (comma-separated) so secrets
# are not saved or shared together with the notebook.
FIRECRAWL_KEYS = [

]
# Append any keys supplied through the environment; blank entries and
# surrounding whitespace are dropped. With the variable unset this adds
# nothing, so the list is exactly the literal above.
FIRECRAWL_KEYS += [
    key.strip()
    for key in os.environ.get("FIRECRAWL_API_KEYS", "").split(",")
    if key.strip()
]

# ====================
# 2. ARRAY OF URLS
# ====================
# Every page shares one origin, so only the part after the root is
# listed. Order matters: the position of each URL determines the output
# file number and which API key is used for it.
_SITE_ROOT = "https://stgeorgembc.com.au/"

_PAGE_PATHS = (
    # --- club information ---
    "",
    "about-us/",
    "clubgrants-2022/",
    "club-history/",
    "trading-hours/",
    "gallery/",
    "courtesy-bus/",
    "frequently-asked-questions/",
    "we-care/",
    "latest-news/",
    "venue-safety-plan/",
    "join-our-team/",
    "our-fantastic-staff/",
    "annual-report/",
    "membership-reminder/",
    "dress-regulations/",
    # --- marina / boating ---
    "marina/",
    "marina/marina-berthing/",
    "environmental/",
    "mooring-detail-search/",
    "boat-ramp/",
    "boat-hire/",
    # --- dining ---
    "restaurant-menus/",
    "meet-our-chefs/",
    "wp-content/uploads/2023/10/BayBreeze-Cafe-231017.pdf",
    "wp-content/uploads/2023/05/Pizza-.pdf",
    # NOTE(review): fragment-only link; presumably resolves to the home
    # page already listed first -- confirm before spending a request on it.
    "#",
    # --- events, promotions and misc ---
    "whats-on/entertainment/",
    "sub-clubs/",
    "contact/",
    "whats-on/",
    "whats-on/promotions-and-raffles/",
    "new-badge-draw/",
    "christmas-buffet/",
    "marina-extension/",
    "new-marina-bonds/",
    "instant-membership/",
    "hire-me-today/",
    "cocktail-hour/",
    "virtualtour/",
    "railway-pde-kogarah-clubhouse/",
    "club-renovations-2022/",
    "?page_id=328",
    "?page_id=468",
)

URLS = [_SITE_ROOT + path for path in _PAGE_PATHS]




# =======================================================
# 3. REQUEST FUNCTION WITH RETRIES & ERROR HANDLING
# =======================================================
def _extract_raw_html(body):
    """
    Pulls the raw HTML string out of a decoded Firecrawl response body.
    Looks under the nested "data" object first and falls back to a
    top-level "rawHtml" key. Returns None when neither is present.
    """
    if not isinstance(body, dict):
        return None
    nested = body.get("data")
    if isinstance(nested, dict) and "rawHtml" in nested:
        return nested["rawHtml"]
    return body.get("rawHtml")


def get_raw_html(url, api_key, max_retries=3, retry_delay=3, timeout=60):
    """
    Sends a Firecrawl scrape request and returns raw HTML.
    Retries up to max_retries times if the request fails.

    Args:
        url: Page to scrape.
        api_key: Firecrawl API key sent as a Bearer token.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to sleep between attempts.
        timeout: Seconds to wait for the HTTP request before giving up
            on that attempt (without it a stalled connection would
            block forever).

    Returns:
        The raw HTML string, or None if no attempt produced one.
    """
    api_url = "https://api.firecrawl.dev/v1/scrape"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "url": url,
        "formats": ["rawHtml"]
    }

    attempts_made = 0
    for attempt in range(1, max_retries + 1):
        attempts_made = attempt
        try:
            response = requests.post(
                api_url, headers=headers, json=payload, timeout=timeout
            )
        except requests.exceptions.RequestException as e:
            print(f"‚ùå Attempt {attempt}: Request error ‚Üí {e}")
        else:
            status = response.status_code
            if status == 200:
                try:
                    body = response.json()
                except ValueError as e:
                    # A 200 with a non-JSON body; ValueError covers the
                    # decode errors raised by both old and new versions
                    # of requests.
                    print(f"‚ùå Attempt {attempt}: Request error ‚Üí {e}")
                    body = None
                raw_html = _extract_raw_html(body)
                if raw_html is not None:
                    return raw_html
                if body is not None:
                    print(f"‚ö†Ô∏è Attempt {attempt}: 'rawHtml' missing in response.")
            else:
                print(f"‚ùå Attempt {attempt}: HTTP {status} for {url}")
                # Client errors such as 401/402/404 will not change on a
                # retry with the same key; only timeouts (408) and rate
                # limits (429) are worth waiting for.
                if 400 <= status < 500 and status not in (408, 429):
                    print(f"Not retrying: HTTP {status} is not a transient error.")
                    break

        # Retry if not last attempt
        if attempt < max_retries:
            print(f"üîÅ Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"‚õî Failed after {attempts_made} attempts ‚Üí {url}")
    return None


# ================================
# 4. FETCH + SAVE HTML FILES
# ================================
os.makedirs("html_raw", exist_ok=True)

# Key rotation below indexes with `i % len(FIRECRAWL_KEYS)`, which would
# raise an opaque ZeroDivisionError on an empty list; fail with a clear
# message instead.
if not FIRECRAWL_KEYS:
    raise ValueError(
        "FIRECRAWL_KEYS is empty: add at least one Firecrawl API key before running."
    )

for i, url in enumerate(URLS):
    api_key = FIRECRAWL_KEYS[i % len(FIRECRAWL_KEYS)]  # rotate API key

    print(f"\n=== Fetching ({i+1}/{len(URLS)}) ‚Üí {url} ===")
    # Only a short prefix of the key is echoed so the log identifies
    # which key was used without printing it in full.
    print(f"Using API key: {api_key[:10]}...")

    html_content = get_raw_html(url, api_key)

    if html_content:
        # Files are numbered by position in URLS (1-based), so skipped
        # pages leave gaps in the numbering.
        filename = f"html_raw/page_{i+1}.html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"‚úî Saved: {filename}")
    else:
        print(f"‚ùå Skipped saving due to repeated errors: {url}")


# ======================
# 5. ZIP HTML FILES
# ======================
zip_filename = "html-raw.zip"

# ZipFile defaults to ZIP_STORED (no compression); HTML is plain text, so
# request DEFLATE explicitly. Entries are added in sorted order because
# os.listdir returns names in arbitrary order.
with zipfile.ZipFile(zip_filename, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
    for file in sorted(os.listdir("html_raw")):
        filepath = os.path.join("html_raw", file)
        # arcname drops the "html_raw/" prefix so files sit at the
        # archive root.
        zipf.write(filepath, arcname=file)

print("\n‚úî All HTML files zipped into:", zip_filename)


# =========================
# 6. DOWNLOAD ZIP FILE
# =========================
# Hand the archive to the Colab download helper, then tell the user
# what to expect.
download_notice = "‚¨áÔ∏è Download should begin automatically."
files.download(zip_filename)
print(download_notice)


=== Fetching (1/44) ‚Üí https://stgeorgembc.com.au/ ===
Using API key: fc-8574178...
‚ùå Attempt 1: HTTP 408 for https://stgeorgembc.com.au/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 2: HTTP 408 for https://stgeorgembc.com.au/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 3: HTTP 408 for https://stgeorgembc.com.au/
‚õî Failed after 3 attempts ‚Üí https://stgeorgembc.com.au/
‚ùå Skipped saving due to repeated errors: https://stgeorgembc.com.au/

=== Fetching (2/44) ‚Üí https://stgeorgembc.com.au/about-us/ ===
Using API key: fc-9d7d39e...
‚ùå Attempt 1: HTTP 402 for https://stgeorgembc.com.au/about-us/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 2: HTTP 402 for https://stgeorgembc.com.au/about-us/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 3: HTTP 402 for https://stgeorgembc.com.au/about-us/
‚õî Failed after 3 attempts ‚Üí https://stgeorgembc.com.au/about-us/
‚ùå Skipped saving due to repeated errors: https://stgeorgembc.com.au/about-us/

=== Fetching (3/44) ‚Üí https://stgeorgembc.com.au/club