In [None]:
import requests
import json
import zipfile
import os
import time
from google.colab import files

# ===============================
# 1. ARRAY OF FIRECRAWL API KEYS
# ===============================
FIRECRAWL_KEYS = [

]

# ====================
# 2. ARRAY OF URLS
# ====================
URLS = [
    "https://stgeorgembc.com.au/",
    "https://stgeorgembc.com.au/about-us/",
    "https://stgeorgembc.com.au/clubgrants-2022/",
    "https://stgeorgembc.com.au/club-history/",
    "https://stgeorgembc.com.au/trading-hours/",
    "https://stgeorgembc.com.au/gallery/",
    "https://stgeorgembc.com.au/courtesy-bus/",
    "https://stgeorgembc.com.au/frequently-asked-questions/",
    "https://stgeorgembc.com.au/we-care/",
    "https://stgeorgembc.com.au/latest-news/",
    "https://stgeorgembc.com.au/venue-safety-plan/",
    "https://stgeorgembc.com.au/join-our-team/",
    "https://stgeorgembc.com.au/our-fantastic-staff/",
    "https://stgeorgembc.com.au/annual-report/",
    "https://stgeorgembc.com.au/membership-reminder/",
    "https://stgeorgembc.com.au/dress-regulations/",
    "https://stgeorgembc.com.au/marina/",
    "https://stgeorgembc.com.au/marina/marina-berthing/",
    "https://stgeorgembc.com.au/environmental/",
    "https://stgeorgembc.com.au/mooring-detail-search/",
    "https://stgeorgembc.com.au/boat-ramp/",
    "https://stgeorgembc.com.au/boat-hire/",
    "https://stgeorgembc.com.au/restaurant-menus/",
    "https://stgeorgembc.com.au/meet-our-chefs/",
    "https://stgeorgembc.com.au/wp-content/uploads/2023/10/BayBreeze-Cafe-231017.pdf",
    "https://stgeorgembc.com.au/wp-content/uploads/2023/05/Pizza-.pdf",
    "https://stgeorgembc.com.au/#",
    "https://stgeorgembc.com.au/whats-on/entertainment/",
    "https://stgeorgembc.com.au/sub-clubs/",
    "https://stgeorgembc.com.au/contact/",
    "https://stgeorgembc.com.au/whats-on/",
    "https://stgeorgembc.com.au/whats-on/promotions-and-raffles/",
    "https://stgeorgembc.com.au/new-badge-draw/",
    "https://stgeorgembc.com.au/christmas-buffet/",
    "https://stgeorgembc.com.au/marina-extension/",
    "https://stgeorgembc.com.au/new-marina-bonds/",
    "https://stgeorgembc.com.au/instant-membership/",
    "https://stgeorgembc.com.au/hire-me-today/",
    "https://stgeorgembc.com.au/cocktail-hour/",
    "https://stgeorgembc.com.au/virtualtour/",
    "https://stgeorgembc.com.au/railway-pde-kogarah-clubhouse/",
    "https://stgeorgembc.com.au/club-renovations-2022/",
    "https://stgeorgembc.com.au/?page_id=328",
    "https://stgeorgembc.com.au/?page_id=468"
]




# =======================================================
# 3. REQUEST FUNCTION WITH RETRIES & ERROR HANDLING
# =======================================================
def get_raw_html(url, api_key, max_retries=3, retry_delay=3):
    """
    Sends a Firecrawl scrape request and returns raw HTML.
    Retries up to max_retries times if the request fails.
    """
    api_url = "https://api.firecrawl.dev/v1/scrape"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "url": url,
        "formats": ["rawHtml"]
    }

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(api_url, headers=headers, data=json.dumps(payload))

            # Successful request
            if response.status_code == 200:
                data = response.json()
                if "rawHtml" in data:
                    return data["rawHtml"]
                else:
                    print(f"‚ö†Ô∏è Attempt {attempt}: 'rawHtml' missing in response.")
            else:
                print(f"‚ùå Attempt {attempt}: HTTP {response.status_code} for {url}")

        except requests.exceptions.RequestException as e:
            print(f"‚ùå Attempt {attempt}: Request error ‚Üí {e}")

        # Retry if not last attempt
        if attempt < max_retries:
            print(f"üîÅ Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"‚õî Failed after {max_retries} attempts ‚Üí {url}")
    return None


# ================================
# 4. FETCH + SAVE HTML FILES
# ================================
os.makedirs("html_raw", exist_ok=True)

for i, url in enumerate(URLS):
    api_key = FIRECRAWL_KEYS[i % len(FIRECRAWL_KEYS)]  # rotate API key

    print(f"\n=== Fetching ({i+1}/{len(URLS)}) ‚Üí {url} ===")
    print(f"Using API key: {api_key[:10]}...")

    html_content = get_raw_html(url, api_key)

    if html_content:
        filename = f"html_raw/page_{i+1}.html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"‚úî Saved: {filename}")
    else:
        print(f"‚ùå Skipped saving due to repeated errors: {url}")


# ======================
# 5. ZIP HTML FILES
# ======================
zip_filename = "html-raw.zip"

with zipfile.ZipFile(zip_filename, "w") as zipf:
    for file in os.listdir("html_raw"):
        filepath = os.path.join("html_raw", file)
        zipf.write(filepath, arcname=file)

print("\n‚úî All HTML files zipped into:", zip_filename)


# =========================
# 6. DOWNLOAD ZIP FILE
# =========================
files.download(zip_filename)
print("‚¨áÔ∏è Download should begin automatically.")


=== Fetching (1/44) ‚Üí https://stgeorgembc.com.au/ ===
Using API key: fc-8574178...
‚ùå Attempt 1: HTTP 408 for https://stgeorgembc.com.au/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 2: HTTP 408 for https://stgeorgembc.com.au/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 3: HTTP 408 for https://stgeorgembc.com.au/
‚õî Failed after 3 attempts ‚Üí https://stgeorgembc.com.au/
‚ùå Skipped saving due to repeated errors: https://stgeorgembc.com.au/

=== Fetching (2/44) ‚Üí https://stgeorgembc.com.au/about-us/ ===
Using API key: fc-9d7d39e...
‚ùå Attempt 1: HTTP 402 for https://stgeorgembc.com.au/about-us/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 2: HTTP 402 for https://stgeorgembc.com.au/about-us/
üîÅ Retrying in 3 seconds...
‚ùå Attempt 3: HTTP 402 for https://stgeorgembc.com.au/about-us/
‚õî Failed after 3 attempts ‚Üí https://stgeorgembc.com.au/about-us/
‚ùå Skipped saving due to repeated errors: https://stgeorgembc.com.au/about-us/

=== Fetching (3/44) ‚Üí https://stgeorgembc.com.au/club

In [2]:

!pip install trafilatura

import os
import zipfile
from bs4 import BeautifulSoup
from trafilatura import extract
from google.colab import files

# ====================================================
# 1. UNRAR html-raws.rar ‚Üí html-raws/ folder
# ====================================================

# rar_path = "/content/drive/MyDrive/Upwork/serv.rar"
extract_folder = "/content/html-raws/serv"

os.makedirs(extract_folder, exist_ok=True)

# Extract using unrar command
#!unrar x -y rar_path html-raws/

print("‚úî Extracted RAR into:", extract_folder)


# ====================================================
# 2. READ & PARSE HTML FILES WITH TRAFILATURA
# ====================================================

output_folder = "text-files"
os.makedirs(output_folder, exist_ok=True)

text_file_paths = []

for filename in os.listdir(extract_folder):
    if not filename.lower().endswith(".html"):
        continue

    html_path = os.path.join(extract_folder, filename)

    with open(html_path, "r", encoding="utf-8", errors="ignore") as f:
        html_content = f.read()

    # ------------------------------------------------
    # Extract <title>
    # ------------------------------------------------
    soup = BeautifulSoup(html_content, "html.parser")
    title_tag = soup.find("title")

    title = title_tag.text.strip() if title_tag else "untitled"

    # Clean title for filenames
    safe_title = (
        title.replace("/", "_")
             .replace("\\", "_")
             .replace(":", "_")
             .replace("*", "_")
             .replace("?", "_")
             .replace('"', "_")
             .replace("<", "_")
             .replace(">", "_")
             .replace("|", "_")
             .strip()
    )

    # ------------------------------------------------
    # Extract readable text using TRAFILATURA
    # ------------------------------------------------
    try:
        text_content = extract(html_content)
    except Exception as e:
        print(f"‚ùå Extraction error on {filename}: {e}")
        continue

    if not text_content:
        print(f"‚ö†Ô∏è No main content found in {filename}, skipping...")
        continue

    # ------------------------------------------------
    # Save to text file
    # ------------------------------------------------
    text_filename = f"{safe_title}.txt"
    text_path = os.path.join(output_folder, text_filename)

    with open(text_path, "w", encoding="utf-8") as t:
        t.write(text_content)

    text_file_paths.append(text_path)

    print(f"‚úî Extracted ‚Üí {text_filename}")


# ==========================================================
# 3. COMBINE ALL TXT FILES INTO ONE MASTER FILE
# ==========================================================

combined_path = "combined.txt"

with open(combined_path, "w", encoding="utf-8") as combined:
    for text_file in text_file_paths:
        title = os.path.basename(text_file).replace(".txt", "")
        combined.write("\n\n==============================\n")
        combined.write(f"### {title}\n")
        combined.write("==============================\n\n")

        with open(text_file, "r", encoding="utf-8") as t:
            combined.write(t.read())
            combined.write("\n")

print("‚úî Combined file created:", combined_path)


# ==========================================================
# 4. ZIP ALL TEXT FILES
# ==========================================================

zip_output = "text-files.zip"

with zipfile.ZipFile(zip_output, "w") as zipf:
    for file in os.listdir(output_folder):
        filepath = os.path.join(output_folder, file)
        zipf.write(filepath, arcname=file)

    zipf.write(combined_path, arcname="combined.txt")

print("‚úî Zipped all text files into:", zip_output)


# ==========================================================
# 5. DOWNLOAD ZIP
# ==========================================================

files.download(zip_output)
print("‚¨áÔ∏è Download should start automatically.")

Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting courlan>=1.3.2 (from trafilatura)
  Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)
Collecting htmldate>=1.9.2 (from trafilatura)
  Downloading htmldate-1.9.4-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.1 (from trafilatura)
  Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)
  Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting dateparser>=1.1.2 (from htmldate>=1.9.2->trafilatura)
  Downloading dateparser-1.2.2-py3-none-any.whl.metadata (29 kB)
Collecting lxml_html_clean (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura)
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Downloading trafilatura-2.0.0-py3-none-any.whl (132 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None


‚úî Extracted ‚Üí The Waterfront Function Centre - Wedding Reception Venues Sydney - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Membership Renewal - St George Motor Boat Club.txt
‚ö†Ô∏è No main content found in 54.html, skipping...
‚úî Extracted ‚Üí New Marina Berths - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Instant Membership - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Club History - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Join Our Team - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Home - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Sub Clubs - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Marina - St George Motor Boat Club.txt


ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None


‚úî Extracted ‚Üí Restaurant - St George Motor Boat Club.txt
‚úî Extracted ‚Üí FISHING CLUB - St George Motor Boat Club.txt
‚ö†Ô∏è No main content found in 55.html, skipping...
‚úî Extracted ‚Üí Gallery - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Happy Hour - St George Motor Boat Club.txt
‚úî Extracted ‚Üí What's On - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Digital Magazine - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Marina Extension - St George Motor Boat Club.txt
‚úî Extracted ‚Üí New Badge Draw - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Venue Safety Plan - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Christmas Marina's Edge Restaurant - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Restaurant - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Club Renovations 2022 - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Members Free Raffle - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Meet our Team - St George Motor Boat Club.txt
‚úî Extracted ‚Üí Marina 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚¨áÔ∏è Download should start automatically.


In [1]:
!unrar x -y "/content/drive/MyDrive/Upwork/serv.rar" html-raws/


UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from /content/drive/MyDrive/Upwork/serv.rar

Creating    html-raws                                                 OK
Creating    html-raws/serv                                            OK
Extracting  html-raws/serv/1.html                                          0%  OK 
Extracting  html-raws/serv/10.html                                         0%  OK 
Extracting  html-raws/serv/11.html                                         0%  OK 
Extracting  html-raws/serv/12.html                                         0%  OK 
Extracting  html-raws/serv/13.html                                         0%  OK 
Extracting  html-raws/serv/14.html                                         0%  OK 
Extracting  html-raws/serv/15.html                                         0%  OK 
Extracting  html-raws/serv/16.html                                        

In [3]:
combined_path = "combined2.txt"

with open(combined_path, "w", encoding="utf-8") as combined:
    for text_file in text_file_paths:
        title = os.path.basename(text_file).replace(".txt", "")
        combined.write("\n\n==============================\n")
        combined.write(f"### {title}\n")
        combined.write("==============================\n\n")

        with open(text_file, "r", encoding="utf-8") as t:
            combined.write(t.read())
            combined.write("\n")

print("‚úî Combined file created:", combined_path)

‚úî Combined file created: combined2.txt


In [4]:
import os
import zipfile
from google.colab import files

output_folder = "text-files"
combined_path = "combined_new.txt"
zip_output = "text-files_new.zip"

text_file_paths = []
for filename in os.listdir(output_folder):
    if filename.lower().endswith(".txt"):
        text_file_paths.append(os.path.join(output_folder, filename))

# ==========================================================
# 1. COMBINE ALL TXT FILES INTO ONE MASTER FILE
# ==========================================================

with open(combined_path, "w", encoding="utf-8") as combined:
    for text_file in text_file_paths:
        title = os.path.basename(text_file).replace(".txt", "")
        combined.write("\n\n==============================\n")
        combined.write(f"### {title}\n")
        combined.write("==============================\n\n")

        with open(text_file, "r", encoding="utf-8") as t:
            combined.write(t.read())
            combined.write("\n")

print("‚úî Combined file created:", combined_path)


# ==========================================================
# 2. ZIP ALL TEXT FILES
# ==========================================================

with zipfile.ZipFile(zip_output, "w") as zipf:
    for file in os.listdir(output_folder):
        filepath = os.path.join(output_folder, file)
        zipf.write(filepath, arcname=file)

    zipf.write(combined_path, arcname=os.path.basename(combined_path))

print("‚úî Zipped all text files into:", zip_output)


# ==========================================================
# 3. DOWNLOAD ZIP
# ==========================================================

files.download(zip_output)
print("‚¨áÔ∏è Download should start automatically.")

‚úî Combined file created: combined_new.txt
‚úî Zipped all text files into: text-files_new.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚¨áÔ∏è Download should start automatically.
