# Blog search

Trying to set up a script which will crawl the content of my website. And then I will store embeddings inside FAISS DB and use that for search. 

This is just an experiment. No intention to add this to my website as a search option.

In [7]:
import requests
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def get_article_content(url: str):
    if not url:
        raise ValueError("Invalid URL provided.")
    
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise an exception for non-200 status codes

        soup = BeautifulSoup(response.content, 'html.parser')
        article_content = soup.find("main", class_="article-full")

        if article_content:
            return article_content.text.strip()  # Extract and clean the text content
        else:
            print(f"Content not found in <main> tag with class 'article-full' on {url}.")
            return None

    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error crawling {url}: {e}")
        return None

In [9]:
def chunk_text(text, chunk_size=1000):
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

In [10]:
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model(**inputs)
    encoded_vector = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
    return encoded_vector

In [11]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [12]:
index = faiss.IndexFlatIP(768)  # Assuming 768 is the size of BERT encoding

In [17]:
urls = [
    'https://www.amitavroy.com/articles/beyond-boundaries-how-frankenphp-redefines-php-application-runtimes-2024-01-01',
    'https://www.amitavroy.com/articles/the-future-is-low-code-adapting-to-the-inevitable-2023-10-24',
    'https://www.amitavroy.com/articles/what-is-next-js-incremental-static-generation-isr-a-complete-guide-2023-10-02',
    'https://www.amitavroy.com/articles/how-saloon-php-helped-me-changing-my-newsletter-integration-in-minutes-2023-09-24',
    'https://www.amitavroy.com/articles/multi-tenant-apps-and-why-they-are-efficient-2023-09-16',
    'https://www.amitavroy.com/articles/why-use-saloon-connect-third-party-api'
];

In [22]:
text_chunks = []

for url in urls:
    article_content = get_article_content(url)
    text_chunks = chunk_text(article_content)
    encoded_vectors = []
    for chunk in text_chunks:
        encoded_vector = encode_text(chunk, tokenizer, model)
        index.add(np.array([encoded_vector]))
        encoded_vectors.append(encoded_vector)

In [24]:
# Search for similar content
search_query = "saloon php laravel"
query_vector = encode_text(search_query, tokenizer, model)

In [25]:
D, I = index.search(np.array([query_vector]), k=5)  # Adjust k based on the number of similar documents you want

In [26]:
print("Similar documents:")
for i, idx in enumerate(I[0]):
    similarity_score = D[0][i]
    similar_content = text_chunks[idx]
    print(f"Similarity Score: {similarity_score}, Content: {similar_content}")

Similar documents:


IndexError: list index out of range