<a href="https://colab.research.google.com/github/abalaji-blr/EAG/blob/main/BuildIndexUsingNomic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install flask sentence-transformers faiss-cpu beautifulsoup4 requests pydantic pyngrok


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.5-py3-none-any.whl.metadata (8.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-tr

In [2]:
import torch
import faiss
import json
from sentence_transformers import SentenceTransformer
from typing import List, Tuple, Optional
import numpy as np
import logging

# Notebook hosts such as Colab can attach a handler to the root logger before any
# user code runs. A plain basicConfig() call is then a silent no-op and the root
# level stays at WARNING, hiding every logger.info() progress message.
# force=True (Python 3.8+) replaces the existing root handlers so INFO is shown.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Global variables to hold the model and index
# Avoid reloading them on every request
MODEL = None        # populated lazily by load_embedding_model()
INDEX = None        # placeholder for the loaded FAISS index
URL_MAPPING = None  # Will store {index_id: {"url": "...", "text": "..."}}

DIMENSIONS = 768 # Example dimension for nomic-embed-text-v1. Adjust if needed.
MODEL_NAME = "nomic-ai/nomic-embed-text-v1"

In [3]:
def load_embedding_model(model_name: str = MODEL_NAME, trust_remote_code: bool = True) -> SentenceTransformer:
    """Return the embedding model, loading it on first use and caching it in ``MODEL``.

    Args:
        model_name: Hugging Face model id to load.
        trust_remote_code: Forwarded to ``SentenceTransformer``. When true, Python
            code shipped in the model repository is downloaded and executed, so
            only enable it for repositories you trust.

    Returns:
        The cached ``SentenceTransformer`` instance for ``model_name``.

    Note:
        The name of the loaded checkpoint is remembered on the function object.
        Asking for a different ``model_name`` reloads and replaces ``MODEL``
        instead of silently handing back the previously loaded model. If
        ``MODEL`` was populated by other code (no remembered name), it is
        returned as-is.
    """
    global MODEL
    cached_name = getattr(load_embedding_model, "_cached_name", None)
    needs_reload = cached_name is not None and cached_name != model_name
    if MODEL is None or needs_reload:
        logger.info(f"Loading embedding model: {model_name}")
        # Nomic recommends trusting remote code for their model
        MODEL = SentenceTransformer(model_name, trust_remote_code=trust_remote_code)
        load_embedding_model._cached_name = model_name
        logger.info("Embedding model loaded.")
    return MODEL

In [4]:
def scrape_page(url: str) -> Optional[str]:
    """Download ``url`` and return the text of its ``<p>`` elements joined by spaces.

    This is a deliberately minimal scraper: no custom headers, no JavaScript
    rendering, no content-type checks. Any failure (including a non-2xx status
    or a missing dependency) is logged and reported as ``None``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The space-joined paragraph text, or ``None`` if anything went wrong.
    """
    try:
        import requests
        from bs4 import BeautifulSoup

        page = requests.get(url, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they take the failure path.
        page.raise_for_status()

        parsed = BeautifulSoup(page.content, 'html.parser')
        paragraph_texts = [tag.get_text() for tag in parsed.find_all('p')]
        return ' '.join(paragraph_texts)
    except Exception as err:
        logger.error(f"Failed to scrape {url}: {err}")
        return None


In [5]:
def scrape_website(url: str, headers_list: List[dict]) -> Optional[str]:
    """Fetch ``url`` with a randomly chosen header set and return its visible text.

    The request is preceded by a random 1-3 second pause. ``<script>`` and
    ``<style>`` elements are dropped before the text is extracted, and the
    result is normalised to one non-empty fragment per line.

    Args:
        url: Address of the page to fetch.
        headers_list: Candidate HTTP header dicts; one is picked at random per
            call. An empty list sends the request with the library defaults.

    Returns:
        The cleaned page text, or ``None`` when the server answers 429, the
        response body is shorter than 1000 characters, or the request fails.
    """
    # These names are not imported at notebook scope, so they are imported here;
    # without them the function (and its except clause) raised NameError.
    import random
    import time

    import requests
    from bs4 import BeautifulSoup

    try:
        # Rotate headers (guarding against an empty list, which random.choice rejects)
        headers = random.choice(headers_list) if headers_list else None

        # Add a random delay between requests (1-3 seconds)
        time.sleep(random.uniform(1, 3))

        response = requests.get(url, headers=headers, timeout=10)

        # Check if we're getting blocked
        if response.status_code == 429:
            logger.error(f"Rate limited at {url}. Waiting longer before retry...")
            # Back off before returning; the retry itself is left to the caller.
            time.sleep(random.uniform(5, 10))
            return None

        response.raise_for_status()

        # Check if we got a reasonable amount of content
        if len(response.text) < 1000:  # Arbitrary threshold
            logger.warning(f"Response from {url} seems too small, might be blocked")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.extract()

        text = soup.get_text()

        # Clean up text: trim each line, split on double spaces, drop empty pieces
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text

    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to scrape {url}: {e}")
        return None

In [6]:
# Browser-like header sets for scrape_website() to rotate through. Every entry
# shares the same Accept / Accept-Language / DNT values and differs only in the
# User-Agent string and the Referer it claims to come from.
headers_list = [
    {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': referer,
        'DNT': '1',
    }
    for user_agent, referer in (
        (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'https://www.google.com/',
        ),
        (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
            'https://www.bing.com/',
        ),
        (
            'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
            'https://duckduckgo.com/',
        ),
    )
]

In [7]:
def build_index_offline(urls: List[str], index_path: str, mapping_path: str):
    """
    Offline process to scrape URLs, embed content, and build FAISS index.
    (Run this in Colab or locally, not part of the live server).

    Each URL is scraped with ``scrape_page``; pages yielding more than 50
    characters are embedded as a single vector, L2-normalised, and appended to
    the index in order. URLs that fail to scrape or embed are logged and skipped.

    Args:
        urls: Pages to scrape and index.
        index_path: Destination file for the serialized FAISS index.
        mapping_path: Destination file for the JSON mapping from index row id
            to ``{"url", "text"}`` (``text`` is a snippet of at most 500
            characters plus "..."). JSON serialization stores the integer ids
            as string keys.

    Returns:
        None. Nothing is written if no URL produced an embedding.
    """
    logger.info("Starting offline index build process...")
    model = load_embedding_model()
    if model is None:
        return

    all_embeddings = []
    url_map = {}
    doc_id_counter = 0

    # Nomic embedding models expect a task prefix on every input. Texts that are
    # being indexed use "search_document: " (queries use "search_query: ").
    # "search_passage: " is not one of the documented prefixes.
    passage_prefix = "search_document: "

    for url in urls:
        logger.info(f"Processing {url}...")
        content = scrape_page(url)
        if content and len(content) > 50: # Basic filter for meaningful content
            try:
                # Note: Nomic works best with chunks <= 512 tokens.
                # Real implementation should chunk long documents.
                # This example embeds the whole (potentially truncated) content.
                embeddings = model.encode([passage_prefix + content], convert_to_numpy=True)
                faiss.normalize_L2(embeddings) # Normalize for cosine similarity
                all_embeddings.append(embeddings[0])
                # Store URL and snippet (e.g., first 500 chars)
                url_map[doc_id_counter] = {
                    "url": url,
                    "text": content[:500] + "..." if len(content) > 500 else content
                }
                doc_id_counter += 1
            except Exception as e:
                 logger.error(f"Failed to process/embed content from {url}: {e}")
        else:
            logger.warning(f"Skipping {url} due to lack of content or scraping error.")

    if not all_embeddings:
        logger.error("No embeddings were generated. Index cannot be built.")
        return

    embeddings_np = np.array(all_embeddings).astype('float32')
    embedding_dim = embeddings_np.shape[1]
    logger.info(f"Generated {embeddings_np.shape[0]} embeddings with dimension {embedding_dim}.")

    # Size the index from the vectors actually produced so a model with a
    # different output width cannot trip FAISS's dimension check on add().
    if embedding_dim != DIMENSIONS:
        logger.warning(
            f"Embedding dimension {embedding_dim} differs from configured "
            f"DIMENSIONS={DIMENSIONS}; building the index with {embedding_dim}."
        )

    # Create FAISS index.
    # On unit-length vectors the squared L2 distance equals 2 - 2*cos, so the
    # nearest-neighbour order of IndexFlatL2 matches cosine similarity.
    index = faiss.IndexFlatL2(embedding_dim)
    # Or use a more advanced index like IndexIVFFlat for larger datasets
    # index = faiss.IndexIDMap(index) # If you want to use custom IDs, though sequential is easier here

    index.add(embeddings_np)
    logger.info(f"Added {index.ntotal} embeddings to the FAISS index.")

    # Save index and mapping
    faiss.write_index(index, index_path)
    logger.info(f"FAISS index saved to {index_path}")
    with open(mapping_path, 'w', encoding='utf-8') as f:
        json.dump(url_map, f, indent=4)
    logger.info(f"URL mapping saved to {mapping_path}")
    logger.info("Offline index build complete.")


In [8]:
# Pages to scrape and index.
your_urls = [
    "https://www.yahoo.com/",
    "https://www.google.com",
    "https://stockanalysis.com/",
]

# Build the index; both output files are written to the current working directory.
build_index_offline(
    urls=your_urls,
    index_path="index.faiss",
    mapping_path="url_mapping.json",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.2k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/547M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

ERROR:__main__:Failed to scrape https://www.yahoo.com/: 429 Client Error: Too Many Requests for url: https://www.yahoo.com/
