In [1]:
# Install the notebook's third-party dependencies. `%pip` (rather than a bare
# `pip`, which only works through IPython automagic) installs into the
# environment of the running kernel.
%pip install requests beautifulsoup4 sentence-transformers faiss-cpu transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.5/27.5 MB 58.3 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [4]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

class RAGPipeline:
    """Minimal retrieval-augmented generation pipeline over scraped web pages.

    Paragraph text is embedded with a SentenceTransformer model and stored in
    a flat L2 FAISS index. A question is answered by retrieving the nearest
    stored paragraphs and passing them, together with the question, to a
    text2text-generation pipeline.
    """

    def __init__(self, embedding_model_name='all-MiniLM-L6-v2',
                 qa_model_name='google/flan-t5-base'):
        """Load the models and create an empty index.

        Args:
            embedding_model_name: Model name passed to SentenceTransformer.
            qa_model_name: Model name passed to the text2text-generation
                pipeline used to produce answers.
        """
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.vector_dim = self.embedding_model.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatL2(self.vector_dim)  # FAISS index
        # metadata[i] describes the i-th vector added to self.index; the two
        # are always appended to together so positions stay aligned.
        self.metadata = []
        self.qa_model = pipeline("text2text-generation", model=qa_model_name)

    def crawl_and_scrape(self, url, timeout=10):
        """Fetch a page and return the text of each of its <p> elements.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the caller indefinitely.

        Returns:
            A list with one string per <p> element, or an empty list when the
            request fails or the response status is not 200.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts are reported the same way as a
            # bad status code: print a message and return nothing.
            print(f"Failed to fetch {url}: {exc}")
            return []

        if response.status_code != 200:
            print(f"Failed to fetch {url}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        return [paragraph.get_text() for paragraph in soup.find_all('p')]

    def preprocess_and_store(self, text_segments, url):
        """Embed text segments and add them to the index.

        Empty and whitespace-only segments are skipped. Calling this with no
        usable segments is a no-op.

        Args:
            text_segments: Iterable of strings to index.
            url: Source address recorded in the metadata of every segment.
        """
        segments = [segment for segment in text_segments
                    if segment and segment.strip()]
        if not segments:
            return

        # Encode all segments in one call (one row per segment) instead of
        # one model invocation per segment.
        embeddings = self.embedding_model.encode(segments, convert_to_tensor=False)
        self.index.add(embeddings)
        self.metadata.extend({'url': url, 'text': segment} for segment in segments)

    def query(self, user_question, k=5):
        """Answer a question using the most similar indexed segments as context.

        Args:
            user_question: The question to answer.
            k: Maximum number of segments to retrieve as context.

        Returns:
            The text generated by the question-answering model.

        Raises:
            ValueError: If k is not a positive integer.
            IndexError: If nothing has been indexed yet.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        if self.index.ntotal == 0:
            raise IndexError(
                "No content has been indexed; call preprocess_and_store() first."
            )

        # Generate query embedding
        query_embedding = self.embedding_model.encode(user_question, convert_to_tensor=False)

        # Never ask for more neighbours than there are stored vectors: FAISS
        # pads missing results with -1, and a -1 list index would silently
        # select the last metadata entry and duplicate it in the context.
        k = min(k, self.index.ntotal)
        _, indices = self.index.search(query_embedding.reshape(1, -1), k)

        # Compile relevant chunks (ignoring any padding index defensively)
        retrieved_chunks = [self.metadata[idx]['text'] for idx in indices[0] if idx >= 0]
        context = "\n".join(retrieved_chunks)

        # Generate response using LLM
        prompt = f"Context: {context}\n\nQuestion: {user_question}\nAnswer:"
        response = self.qa_model(prompt, max_length=200, do_sample=False)[0]['generated_text']

        return response




In [6]:
# Usage example: index one web page and ask a question about it.
if __name__ == "__main__":
    rag = RAGPipeline()

    # Step 1: Crawl and scrape a sample website
    website_url = "https://www.blogger.com/about/?bpli=1"
    scraped_text = rag.crawl_and_scrape(website_url)

    if not scraped_text:
        # A failed fetch (or a page without <p> elements) leaves the index
        # empty, and querying an empty index raises IndexError, so stop here.
        print(f"No text could be scraped from {website_url}; skipping the query.")
    else:
        # Step 2: Preprocess and store data
        rag.preprocess_and_store(scraped_text, website_url)

        # Step 3: Handle user query
        user_query = "What is the main topic of the website?"
        answer = rag.query(user_query)
        print("Response:", answer)

Response: Blogger
