# Blog search

This notebook sets up a script that crawls the content of my website, stores embeddings of that content in a FAISS index, and then uses that index for search.

This is just an experiment; I have no intention of adding it to my website as a search option.

In [45]:
import requests
import faiss
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

In [3]:
def crawl_content(url: str, timeout: float = 10.0):
    """Fetch *url* and return the text of its main article element.

    Args:
        url: Address of the page to crawl.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        The stripped text of the ``<main class="article-full">`` element, or
        ``None`` when the request fails or the element is not on the page.

    Raises:
        ValueError: If *url* is empty.
    """
    if not url:
        raise ValueError("Invalid URL provided.")

    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an exception for non-200 status codes

        soup = BeautifulSoup(response.content, 'html.parser')
        article_content = soup.find("main", class_="article-full")

        if article_content is None:
            print(f"Content not found in <main> tag with class 'article-full' on {url}.")
            return None

        return article_content.text.strip()  # Extract and clean the text content

    except (requests.exceptions.RequestException, ValueError) as e:
        # Best-effort crawl: report the failure and let the caller decide
        # what to do with a missing page.
        print(f"Error crawling {url}: {e}")
        return None

In [46]:
def chunk_sentences(content, max_chunk_length=32):
    """Group the sentences of *content* into chunks of limited word count.

    Sentences are detected by splitting on ". " and are never split
    internally, so a single sentence longer than *max_chunk_length* words
    becomes a chunk of its own.

    Args:
        content: Text to split. ``None`` or an empty string yields no chunks.
        max_chunk_length: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, in document order.
    """
    if not content:
        return []

    chunks = []
    current_chunk = []
    current_words = 0
    for sentence in content.split(". "):  # Assuming sentence endings are marked by ". "
        sentence_words = len(sentence.split())
        if sentence_words == 0:
            continue  # Nothing to embed in a blank fragment.

        # Flush only a non-empty chunk, so an oversized first sentence does
        # not produce an empty leading chunk.
        if current_chunk and current_words + sentence_words > max_chunk_length:
            chunks.append(". ".join(current_chunk))
            current_chunk = []
            current_words = 0

        current_chunk.append(sentence)
        current_words += sentence_words

    if current_chunk:
        # Re-join with the separator that was split on, so sentence
        # boundaries survive inside a chunk.
        chunks.append(". ".join(current_chunk))

    return chunks

In [4]:
# Article pages available for crawling.
urls = [
    'https://www.amitavroy.com/articles/beyond-boundaries-how-frankenphp-redefines-php-application-runtimes-2024-01-01',
    'https://www.amitavroy.com/articles/the-future-is-low-code-adapting-to-the-inevitable-2023-10-24',
]

In [47]:
# Sentence-embedding model used for both the article chunks and the query.
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [None]:
# Crawl only the first article for this experiment.
url = urls[0]
content = crawl_content(url)
if content is None:
    # crawl_content reports failures by returning None; stop here with a
    # clear message instead of failing later while chunking.
    raise RuntimeError(f"No content could be crawled from {url}.")

In [None]:
# Split the article into sentence-based chunks and embed each chunk.
chunks = chunk_sentences(content, max_chunk_length=32)
embeddings = model.encode(chunks)

In [None]:
# Build a FAISS index sized to the second dimension of the embeddings array
# and add every chunk embedding to it.
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

In [None]:
# Embed the search query with the same model used for the chunks.
query = "This is a query to search against the content."
query_embedding = model.encode(query)

In [None]:
# Ask the index for the k closest chunks. The query is reshaped to a
# single-row 2-D array before searching. D is used below as the distances
# and I as the positions of the matches within `chunks`.
# NOTE(review): FAISS is documented to pad I with -1 when the index holds
# fewer than k vectors -- confirm before indexing `chunks` with it.
k = 5
D, I = index.search(query_embedding.reshape(1, -1), k)

In [None]:
# Print each hit with its rank and distance. FAISS fills missing results with
# a label of -1 when the index holds fewer than k vectors; skip those so that
# chunks[-1] is not reported as a match.
for rank, (distance, chunk_id) in enumerate(zip(D[0], I[0]), start=1):
    if chunk_id < 0:
        continue
    print(f"Chunk {rank} (distance: {distance}): {chunks[chunk_id]}")