In [4]:
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from urllib.parse import urlparse
import re
import time
from pprint import pprint 
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from config import (
    ELASTICSEARCH_URL,
    INDEX_NAME_HUGGINGFACE,
    INDEX_NAME_GEMINI,
    INDEX_NAME_OPENAI,
    SITEMAP_URL,
    HUGGINGFACE_EMBEDDING_MODEL_NAME,
    GEMINI_EMBEDDING_MODEL_NAME,
    OPENAI_EMBEDDING_MODEL_NAME
)

**Connect to Elasticsearch**

In [None]:
# Build the client for the cluster configured in config.ELASTICSEARCH_URL.
es = Elasticsearch(ELASTICSEARCH_URL)

# Constructing the client does not by itself show the cluster is reachable;
# info() issues a real request, so only report success once it has returned.
client_info = es.info()
print("Connected to Elasticsearch!")

Create index utility function

In [None]:
def create_index(index_name: str, dims: int = 384):
    """Create the page index with a dense-vector mapping, unless it already exists.

    Args:
        index_name: Name of the Elasticsearch index to create.
        dims: Length of the vectors stored in the ``embedding`` field. It has
            to match the output size of the embedding model used for this
            index. Defaults to 384, the value that was previously hard-coded.

    Raises:
        ValueError: If ``dims`` is not a positive number.

    Uses the module-level ``es`` client. An existing index is left untouched,
    so its mapping (including ``dims``) is NOT updated by this call.
    """
    if dims <= 0:
        raise ValueError(f"dims must be a positive integer, got {dims!r}")

    if es.indices.exists(index=index_name):
        print(f"Index '{index_name}' already exists.")
        return

    mapping = {
        "mappings": {
            "properties": {
                # keyword: stored as a single exact-match token
                "url": {"type": "keyword"},
                "title": {"type": "text"},
                "content": {"type": "text"},
                "embedding": {"type": "dense_vector", "dims": dims}
            }
        }
    }

    es.indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created successfully.")

Extract page content utils function

In [None]:
def extract_page_text(url):
    """Download a page and return its title and visible text.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``(title, text)`` on success, where ``text`` is the page text with
        every run of whitespace collapsed to a single space and ``title`` is
        ``"Untitled"`` when the page has no usable ``<title>``.
        ``(None, None)`` if fetching or parsing failed; the error is printed
        rather than raised so a crawl can continue with the next URL.
    """
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        # Drop elements whose contents are not visible page text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # `soup.title.string` is None for an empty <title> or one containing
        # nested markup; calling .strip() on it used to raise and discard the
        # whole page. get_text() always returns a string.
        title = soup.title.get_text(strip=True) if soup.title else ""
        if not title:
            title = "Untitled"

        text = re.sub(r"\s+", " ", soup.get_text(separator=" ", strip=True))
        return title, text

    except Exception as e:
        # Deliberately broad: any failure on one page is reported and skipped.
        print(f"❌ Error fetching {url}: {e}")
        return None, None

Scrape sitemap and index pages utility function

In [8]:
def scrape_and_index(model=None, index_name: "str | None" = None, delay: float = 0.5):
    """List the ``/en-ie/`` URLs in the sitemap and, if configured, index each page.

    Args:
        model: Embedding model exposing ``encode(text)`` whose result has a
            ``tolist()`` method. When ``None``, nothing is indexed.
        index_name: Target Elasticsearch index. When ``None``, nothing is
            indexed.
        delay: Seconds to pause after each indexed page, to be polite to the
            server being crawled.

    Behaviour:
        Always fetches ``SITEMAP_URL``, prints how many matching URLs were
        found and pretty-prints the first ten. If both ``model`` and
        ``index_name`` are given, each URL is then fetched with
        ``extract_page_text``, embedded, and written to ``index_name`` through
        the module-level ``es`` client; pages that yield no content are
        skipped.

    Raises:
        requests.RequestException: If the sitemap cannot be downloaded or the
            server answers with an error status.
    """
    # Same timeout / status handling as extract_page_text, so an error page is
    # not silently parsed as an (empty) sitemap.
    response = requests.get(SITEMAP_URL, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "xml")

    urls = [loc.text for loc in soup.find_all("loc") if "/en-ie/" in loc.text]
    print(f"Found {len(urls)} URLs to crawl.")
    pprint(urls[:10])
    print("...")

    # Listing-only mode: without a model and a target index there is nothing
    # to embed or write, so stop after reporting the URLs.
    if model is None or index_name is None:
        return

    for i, url in enumerate(urls, start=1):
        title, content = extract_page_text(url)
        if not content:
            # Fetch failed or the page had no text; already reported upstream.
            continue

        embedding = model.encode(content)

        doc = {
            "url": url,
            "title": title,
            "content": content,
            "embedding": embedding.tolist()
        }

        es.index(index=index_name, document=doc)
        print(f"✅ Indexed ({i}/{len(urls)}): {url}")
        time.sleep(delay)  # be polite to server

# Both arguments are None here: this run only lists the sitemap's /en-ie/ URLs; nothing is indexed.
scrape_and_index(None, None)

Found 141 URLs to crawl.
['https://www.sisuclinic.com/en-ie/treatments',
 'https://www.sisuclinic.com/en-ie/providers',
 'https://www.sisuclinic.com/en-ie/faqs',
 'https://www.sisuclinic.com/en-ie/locations',
 'https://www.sisuclinic.com/en-ie/about-us',
 'https://www.sisuclinic.com/en-ie/pricing',
 'https://www.sisuclinic.com/en-ie/results',
 'https://www.sisuclinic.com/en-ie/accessibility',
 'https://www.sisuclinic.com/en-ie/contact-us',
 'https://www.sisuclinic.com/en-ie/get-the-look']
...


**Scrape the pages from the sitemap and use HuggingFace Embedding Model**