In [1]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [2]:
!pip install faiss-cpu
!pip install datasets
!pip install rank_bm25

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloa

In [3]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from rank_bm25 import BM25Okapi
import requests
from bs4 import BeautifulSoup
import re

In [4]:
def scrape_website(url):
    """Scrape text content from a website."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        paragraphs = [p.get_text() for p in soup.find_all("p")]
        return " ".join(paragraphs)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

In [5]:
scrape_website("https://aws.amazon.com/what-is/retrieval-augmented-generation/")

'Retrieval-Augmented Generation (RAG) is the process of optimizing the output of a large language model, so it references an authoritative knowledge base outside of its training data sources before generating a response. Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to generate original output for tasks like answering questions, translating languages, and completing sentences. RAG extends the already powerful capabilities of LLMs to specific domains or an organization\'s internal knowledge base, all without the need to retrain the model. It is a cost-effective approach to improving LLM output so it remains relevant, accurate, and useful in various contexts. LLMs are a key artificial intelligence (AI) technology powering intelligent chatbots and other natural language processing (NLP) applications. The goal is to create bots that can answer user questions in various contexts by cross-referencing authoritative knowledge sources. Unfortuna

In [6]:
def embed_documents(model, documents):
    """Compute embeddings for documents."""
    return np.array(model.encode(documents))

In [7]:
def load_documents(source=None):
    """Load documents either from Wikipedia or a given website."""
    if source and source.startswith("http"):
        return [scrape_website(source)]
    else:
        dataset = load_dataset("wikipedia", "20220301.simple", split="train")
        docs = dataset.select(range(100))
        return [doc["text"] for doc in docs]

In [8]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Splits text into overlapping chunks."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

In [9]:
def create_faiss_index(embeddings):
    """Create a FAISS index for vector search."""
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    return index

In [10]:
# What is BM25?
# BM25 (Best Matching 25) is a ranking function used in information retrieval to score and rank documents based on their relevance to a given query.
# It is an improvement over traditional TF-IDF (Term Frequency-Inverse Document Frequency) and is widely used in search engines.

# How Does BM25 Work?
# BM25 scores documents based on the frequency of query terms within them while normalizing for document length.
# It assigns a higher weight to rare terms (important words) and adjusts scores to avoid favoring long documents that contain a term many times.
def create_bm25_index(documents):
    """Create a BM25 index for keyword search."""
    tokenized_docs = [doc.split() for doc in documents]
    return BM25Okapi(tokenized_docs)

In [50]:
def rewrite_query(llm, tokenizer, query):
    """
    What: Uses an LLM to rewrite the user query for better search accuracy.
    Why: Reformulating queries can improve search results by making them more precise.
    How: Passes the query through a pre-trained LLM and generates a rewritten query.
    """
    prompt = f"Please improve the following search query to make it more precise:\n\nOriginal Query: {query}\nRewritten Query: "
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = llm.generate(**inputs, max_new_tokens=50)
    rewritten_query = tokenizer.decode(output[0], skip_special_tokens=True)
    if "Rewritten Query: " in rewritten_query:
      rewritten_query = rewritten_query.split("Rewritten Query: ")[-1].strip()
    return rewritten_query

In [1]:
def rerank_results(cross_encoder, query, retrieved_docs, top_k=3):
    """
    What: Uses a Cross-Encoder model to rerank the retrieved documents.
    Why: Improves retrieval accuracy by considering query-document pairs holistically.
    How: Scores each document-query pair and sorts them based on relevance scores.
    """
    # A Cross-Encoder is a type of transformer model typically used for tasks that require a pair of inputs to be processed together
    # in a single model pass. Unlike bi-encoders, which independently encode each input (such as a query and document) and then
    # combine their representations, a cross-encoder processes both inputs together at once.
    # Reranking is a process where a preliminary ranking of documents (from, say, a keyword search or initial vector-based search)
    # is refined or reranked using a more sophisticated model like a Cross-Encoder. The goal of reranking is to improve the search
    # results by scoring documents based on their actual relevance to the query, considering interactions between the query and document text.
    scores = cross_encoder.predict([(query, doc) for doc in retrieved_docs])
    ranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), reverse=True)[:top_k]]
    return ranked_docs


In [60]:

def generate_response(model, tokenizer, context, query):
    """
    What: Generates a response using an LLM based on retrieved documents.
    Why: Provides a context-aware answer to the user's query.
    How: Constructs a prompt including context and query, then generates text.
    """
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=100, do_sample=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [51]:
rewrite_query(model_llm, tokenizer, "list LLM benefits")

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'"LLM benefits" OR "LLM benefits and services" OR "LLM benefits and services for students" OR "LLM benefits and services for lawyers" OR "LLM benefits and services for accountants" OR "'

In [53]:
def hybrid_retrieve(index, bm25, model, query, documents, top_k=5, alpha=0.5):
    """
    What: Performs hybrid retrieval using FAISS (vector search) and BM25 (keyword search).
    Why: Combines semantic and lexical search for more robust retrieval.
    How: Scores documents separately with FAISS and BM25, then combines scores.
    """
    # Create encoding for this
    query_embedding = np.array(model.encode([query]))
     # Search in FAISS index and compute inverse distance scores
    D, I = index.search(query_embedding, top_k)
    faiss_scores = {documents[i]: 1 / (D[0][idx] + 1e-5) for idx, i in enumerate(I[0])}

    # Compute BM25 keyword matching scores
    bm25_scores_list = bm25.get_scores(query.split())  # Returns scores for all documents
    bm25_scores = {doc: bm25_scores_list[idx] for idx, doc in enumerate(documents)}

    # Normalize both scores to 0-1 scale
    max_faiss = max(faiss_scores.values(), default=1)
    max_bm25 = max(bm25_scores.values(), default=1)
    for doc in faiss_scores:
        faiss_scores[doc] /= max_faiss
    for doc in bm25_scores:
        bm25_scores[doc] /= max_bm25

    # Combine scores with weighted average for hybrid retrieval
    hybrid_scores = {doc: alpha * faiss_scores.get(doc, 0) + (1 - alpha) * bm25_scores.get(doc, 0) for doc in documents}

    sorted_docs = sorted(hybrid_scores, key=hybrid_scores.get, reverse=True)[:top_k]
    return sorted_docs

In [31]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# llm_name = "meta-llama/Llama-2-7b-chat-hf"
# tokenizer = AutoTokenizer.from_pretrained(llm_name)
# llm = AutoModelForCausalLM.from_pretrained(llm_name, device_map="auto")

# Load alternative open-source model (e.g., Falcon-7B-Instruct)
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
model_llm = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b-instruct", load_in_4bit=True)

In [68]:
def main():
    source = "https://aws.amazon.com/what-is/retrieval-augmented-generation/"
    query = "what are the usecases of RAG?"

    raw_documents = load_documents(source)

    documents = []
    for doc in raw_documents:
        documents.extend(chunk_text(doc))

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embed_documents(model, documents)
    index = create_faiss_index(embeddings)
    bm25 = create_bm25_index(documents)

    cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2")

    rewritten_query = rewrite_query(model_llm, tokenizer, query)
    retrieved_docs = hybrid_retrieve(index, bm25, model, rewritten_query, documents)
    reranked_docs = rerank_results(cross_encoder, rewritten_query, retrieved_docs)
    print(rewritten_query)

    context = "\n".join(reranked_docs)
    response = generate_response(model_llm, tokenizer, context, rewritten_query)

    print("Generated Response:", response)
    print("Relevant Documents:", reranked_docs)

if __name__ == "__main__":
    main()

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


"What are the usecases of RAG?"
Generated Response: Context: Retrieval-Augmented Generation (RAG) is the process of optimizing the output of a large language model, so it references an authoritative knowledge base outside of its training data sources before generating a response. Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to generate original output for tasks like answering questions, translating languages, and completing sentences. RAG extends the already powerful capabilities of LLMs to specific domains or an organization's internal knowledge base, all without the need to retrain the model. It is a cost-effective approach to improving LLM output so it remains relevant, accurate, and useful in various contexts. LLMs are a key artificial intelligence (AI) technology powering intelligent chatbots and other natural language processing (NLP) applications. The goal is to create bots that can answer user questions in various contexts by c