# 0. Setup.

In [1]:
# Imports.
import faiss
import numpy as np
import torch

# Set random seed.
np.random.seed(42)  

# Find device.
device      = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device = {device}.")

device = cuda.


In [2]:
%%html
<style>
    table {
        float: left;
        margin-right: 20px; /* Optional: Adds space between table and other content */
    }
</style>

# 1. Introduction.

# 2. Document Preparation.

## 2.1. Document.

In [3]:
# Sample docs for DB vectors, as a simple str format.
docs = ["""ChatGPT is a generative artificial intelligence chatbot[2][3] developed by OpenAI and launched in 2022. 
It is currently based on the GPT-4o large language model (LLM). ChatGPT can generate human-like conversational 
responses and enables users to refine and steer a conversation towards a desired length, format, style, level of 
detail, and language.[4] It is credited with accelerating the AI boom, which has led to ongoing rapid investment 
in and public attention to the field of artificial intelligence (AI).[5] Some observers have raised concerns 
about the potential of ChatGPT and similar programs to displace human intelligence, enable plagiarism, or fuel 
misinformation.""",

"""By January 2023, ChatGPT had become what was then the fastest-growing consumer software application in history, 
gaining over 100 million users in two months[8][9] and contributing to the growth of OpenAI's current valuation of 
$86 billion.[10][11] ChatGPT's release spurred the development of competing products, including Gemini, Claude, 
Llama, Ernie, and Grok.[12] Microsoft launched Copilot, initially based on OpenAI's GPT-4. In May 2024, a partnership 
between Apple Inc. and OpenAI was announced, in which ChatGPT was integrated into the Apple Intelligence feature of 
Apple operating systems.[13] As of July 2024, ChatGPT's website is among the 10 most-visited websites globally.[14][15]"""]


## 2.2. Split.

### 2.2.1. `RecursiveCharacterTextSplitter`.
- Excellent default.
- Recursively splits text with delimters `separators` = ["d_1", "d_2", ...].
- i.e. if len > chunk_size, split by "d_1", if still too long, split by "d_2", and so on.
- e.g.
  - "Paragraph 1.\nParagraph 2.\nParagraph 3."  
  - ["Paragraph 1.", "Paragraph 2.", "Paragraph 3."]

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Init splitter.
text_splitter = RecursiveCharacterTextSplitter(
    separators       = [". ", " "],   # First split by sentences, then by " ".
    chunk_size       = 50,            # Maximum chunk size.
    chunk_overlap    = 5,             # Overlaps btw chunks.
#    length_function  = lambda text: len(tokenizer.encode(text)),  # Define how to measure chunk_size. Default: character count.
    keep_separator   = False,         # DO NOT USE IT, it can produce unexpected result. 
    add_start_index  = True,          # If `True`, stores position at chunks[0].metadata["start_index"]. Default: `False`.
    strip_whitespace = True,          # If `True`, delete leading/trailing spaces. Default: `True`.
)

# Split into chunks.
chunks = text_splitter.create_documents(docs)

# Print the result.
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}\n")
    if i == 2:
        break

Chunk 1:
page_content='ChatGPT is a generative artificial intelligence' metadata={'start_index': 0}

Chunk 2:
page_content='chatbot[2][3] developed by OpenAI and launched in' metadata={'start_index': 48}

Chunk 3:
page_content='in 2022' metadata={'start_index': 95}



### 2.2.2. `CharacterTextSplitter`.
- Splits by the specified character.
- Example:
  - Document: "12345678".
  - separator = "5", chunk_size = 3, chunk_overlap = 1.
  - Steps:
      1. Split by separator `"5"`.  
         - Original: "12345678".  
         - After splitting: ["1234", "678"].
      2. Chunk each part with `chunk_size=3`, `chunk_overlap=1`.  
         - "1234" → ["123", "34"].  
         - "678" → ["678"].
      3. Final output.  
         - ["123", "34", "678"].


In [5]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator     = " ",
    chunk_size    = 50,
    chunk_overlap = 10
)

chunks = text_splitter.create_documents(docs)

for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}\n")
    if i == 2:
        break

Chunk 1:
page_content='ChatGPT is a generative artificial intelligence'

Chunk 2:
page_content='chatbot[2][3] developed by OpenAI and launched in'

Chunk 3:
page_content='in 2022. 
It is currently based on the GPT-4o'



### 2.2.3. `TokenTextSpliter`.
- Splits by the token, from the specified tokenizer, `encoding_name`.
- Useful for models with token limitations.

In [6]:
from langchain.text_splitter import TokenTextSplitter

# Example tokenizer.

# Splitter with tokenizer.
text_splitter = TokenTextSplitter(
    chunk_size     = 5,      
    chunk_overlap  = 1,      
    encoding_name  = "gpt2"  # Defines which tokenizer to use
)

# Split the text into chunks
chunks = text_splitter.split_text(docs[0])    # Note) input should be a single str, not a list of str.

# Print results
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {chunk}")
    if i == 2:
        break

Chunk 1: ChatGPT is a
Chunk 2:  a generative artificial intelligence
Chunk 3:  intelligence chatbot[2


### 2.2.4. Specific Formats.

#### 2.2.4.1. For Markdown.
- `MarkdownHeaderTextSplitter`: Splits Markdown text based on header levels, preserving document structure.  
   - "# Header 1  
     Content under header 1  
     \## Header 2  
     Content under header 2"  
   - ["# Header 1\nContent under header 1", "## Header 2\nContent under header 2"]

#### 2.2.4.2. For HTML.
- `HTMLHeaderTextSplitter`: Splits HTML content based on header tags, maintaining the hierarchical structure.  
   - "\<h1\>Title\</h1\>  
     \<p\>Intro paragraph.\</p\>  
     \<h2\>Subtitle\</h2\>  
     \<p\>Details.\</p\>"  
   - ["\<h1\>Title\</h1\>  
     \<p\>Intro paragraph.\</p\>",  
     "\<h2\>Subtitle\</h2\>  
     \<p\>Details.\</p\>"]

#### 2.2.4.3. For .py.
- `PythonCodeTextSplitter`: Splits Python code into chunks based on logical code structures like functions and classes.  
   - "def func1():  
       pass  

     def func2():  
       pass"  
   - ["def func1():  
       pass",  
       "def func2():  
       pass"]

#### 2.2.4.4. For .JSON.
- `RecursiveJsonSplitter`: Recursively splits JSON data into smaller chunks while preserving the hierarchical structure.  
   - '{"key1": {"subkey": "value"},  
     "key2": "value2"}'  
   - ['{"key1": {"subkey": "value"}}',  
     '{"key2": "value2"}']

#### 2.2.4.5. `SpacyTextSplitter`: 
   - Smart sentence splitter based on SpaCy.  
   - "This is the first sentence.  
     This is the second sentence."  
   - ["This is the first sentence.",  
     "This is the second sentence."]


# 3. Embedding.

## 3.1 Embedding Models for Retrieval and Semantic Search

| Model Series | Example Models | Pros | Cons |
|-------------|---------------|------|------|
| **Sentence Transformers** | `all-MiniLM-L6-v2` <br> Small, fast, and good for general embeddings. <br><br> `all-MPNet-base-v2` <br> More accurate but slower. <br><br> `nomic-ai/nomic-embed-text-v1` <br> Strong for document search. <br><br> `bge-large-en-v1.5` <br> Good for retrieval-augmented generation (RAG). | - No need for manual pooling. <br> - Efficient and optimized for semantic search. <br> - Many multilingual options. | - Less flexible than raw transformer models. <br> - Large models may be resource-intensive. |
| **E5 Series (Optimized for Retrieval)** | `intfloat/multilingual-e5-small` <br> Small, fast, and supports multiple languages. <br><br> `intfloat/multilingual-e5-base` <br> Good for multilingual tasks. <br><br> `intfloat/multilingual-e5-large` <br> Stronger, but heavier. <br><br> `intfloat/e5-large-v2` <br> High-performance English embeddings. | - Optimized for FAISS and retrieval. <br> - Good for retrieval-augmented generation (RAG). <br> - English and multilingual versions available. | - Requires specific query formatting (`query: ...`, `passage: ...`). <br> - Larger models may be slow on CPU. |
| **BGE Series (Best for RAG)** | `BAAI/bge-small-en` <br> Fast, good for retrieval. <br><br> `BAAI/bge-large-en` <br> Stronger, but requires more resources. <br><br> `BAAI/bge-m3` <br> Multilingual version. | - Strong retrieval performance. <br> - High efficiency for large-scale search. <br> - Good FAISS integration. | - Large models require more computational power. <br> - Not widely tested outside of retrieval. |
| **GTE Series (Lightweight Embeddings)** | `thenlper/gte-small` <br> Smallest model, fast inference. <br><br> `thenlper/gte-large` <br> Better performance. | - Compact and efficient. <br> - Balanced speed and accuracy. | - Not as powerful as `bge` or `e5` for retrieval. <br> - Limited multilingual support. |
| **Nomic AI (Best for Large Text)** | `nomic-ai/nomic-embed-text-v1` <br> Strong performance for large documents. | - Optimized for long-text retrieval. <br> - Designed for large-scale document search. | - Heavier than `MiniLM` or `GTE`. <br> - Requires high memory. |
| **Cohere Embeddings (Commercial API)** | `cohere/embed-english-light-v3.0` <br> Lightweight but good for retrieval. <br><br> `cohere/embed-english-v3.0` <br> More powerful, requires API. | - Optimized for similarity tasks. <br> - Good for commercial applications. | - API-based, not open-source. <br> - Usage costs may apply. |


## 3.2. How to Choose.
- General Use: `all-MiniLM-L6-v2`
- Retrieval (FAISS, RAG): `bge-large-en-v1.5`, `e5-large-v2`
- Multilingual: `intfloat/multilingual-e5-base`, `bge-m3`
- Light Model: `gte-small`
- Large Documents: `nomic-embed-text-v1`
- Note) Be careful when mixing different embedding models, as FAISS retrieval results depend on consistent embeddings.

## 3.3. Example.

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split into sentence.
text_splitter = RecursiveCharacterTextSplitter(
    separators       = [". ", "\n"],   # First split by sentences, then by paragraphs.
    chunk_size       = 100,            # Maximum chunk size.
    chunk_overlap    = 10,             # Overlaps btw chunks.
    strip_whitespace = True,           # If `True`, delete leading/trailing spaces. Default: `True`.
)

splits = text_splitter.create_documents(docs)
chunks = [split.page_content for split in splits]

# Embedding models.
from sentence_transformers import SentenceTransformer

embedding_models = {
    "e5-small": "intfloat/multilingual-e5-small"
}

# Initialize FAISS dictionary
embeddings = {}

# Process each model **one by one** (ensures logging after each step)
for model_name, model_path in embedding_models.items():
    # Load embedding model
    embedding_model = SentenceTransformer(model_path, device=device)

    # Encode all documents (ensures GPU usage if available)
    embedding = embedding_model.encode(chunks)
    
    embeddings[model_name] = embedding

    # ✅ Print log **immediately after each model is processed**
    print(f"✅ Embedding for '{model_name}' completed.")
    print(f"- {'Chunk':<15} : {chunks[0]}")
    print(f"- {'Embedding':<15} : {embedding[0][:10]}")

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


README.md:   0%|          | 0.00/498k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

✅ Embedding for 'e5-small' completed.
- Chunk           : ChatGPT is a generative artificial intelligence chatbot[2][3] developed by OpenAI and launched in 2022
- Embedding       : [ 0.03022079  0.00715949 -0.02565044 -0.02243968  0.04284703 -0.00995779
  0.06023172  0.04781923  0.04110067 -0.04076944]


# 4. Vector DB Construction.

## 4.1. FAISS Index Function.
- A **FAISS Index Function** defines the methodology on how to construct and search the VDB.

## 4.2. Types.
- **Exact Search:** Calculate a distance with **all** samples in a vector DB.

  - **`IndexFlatL2` (L2-norm, Euclidean distance)**  
    - $ d(\mathbf{a}, \mathbf{b}) = \|\mathbf{a} - \mathbf{b}\|_2 = \sqrt{\sum_{i} (a_i - b_i)^2} $

  - **`IndexFlatIP` (Inner product, or Dot product similarity)**  
    - $ \text{d}(\mathbf{a}, \mathbf{b}) = \sum_{i} a_i \cdot b_i $
    - If the input vectors are normalized, it calculates the cosine similarity.

- **Efficient ANN:**
  - **`IndexIVFFlat`**
    - Perform a k-means clustering, search only nearest clusters.
    - `nlist` = n_cluster $\approx \sqrt{n_{samples}}$, `nprobe` = n_search_clusters $\approx 0.1 \times n_{list}$

  - **`IndexHNSWFlat`**
    - Graph-based ANN search, constructs a multi-layer small-world network.
    - Searches fewer nodes than brute-force while maintaining high recall.

  - **`IndexPQ`**
    - Compresses vectors using Product Quantization (PQ) to reduce memory.
    - Approximates distance calculations by storing only compressed representations.

  - **`IndexIVFPQ`**
    - Combines `IndexIVFFlat` (clustering) and `IndexPQ` (compression).
    - First reduces search space with IVF, then speeds up with PQ.

## 4.3. How to Choose.

| Index Type        | Speed  | Accuracy | Memory Usage | Best For |
|------------------|--------|----------|--------------|----------|
| **`IndexFlatL2`**  | Slow  | ✅✅✅ High  | 🔺 High  | Exact nearest neighbor search, small datasets |
| **`IndexFlatIP`**  | Slow  | ✅✅✅ High  | 🔺 High  | Cosine similarity (with normalized vectors), exact search |
| **`IndexIVFFlat`** | ⚡ Fast | ✅✅ Medium | 🔹 Medium | Large-scale search with clustering |
| **`IndexHNSWFlat`** | ⚡⚡⚡ Very Fast | ✅✅✅ High | 🔺 High | High-speed, high-recall ANN search |
| **`IndexPQ`**  | ⚡⚡ Fast | ✅ Low | ✅✅ Low | Memory-efficient ANN search |
| **`IndexIVFPQ`** | ⚡⚡ Fast | ✅ Medium | ✅✅ Very Low | Large-scale ANN search with memory constraints |

**Key Takeaways:**
- Use **`IndexFlatL2` / `IndexFlatIP`** for **exact search** when dataset size is small.
- Use **`IndexIVFFlat`** for **large datasets**, tuning `nlist` and `nprobe` for speed vs. accuracy.
- Use **`IndexHNSWFlat`** if you need **fastest ANN search with high recall**.
- Use **`IndexPQ`** or **`IndexIVFPQ`** when **memory is limited**.

## 4.4. Example.

In [8]:
# Imports.
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# Load sentence embedding model.
model        = SentenceTransformer("all-MiniLM-L6-v2")       # Small, fast transformer for sentence embeddings.

# Sample database documents.
docs         = [
    "The cat sits on the mat.",
    "A dog is barking outside.",
    "The weather is sunny today.",
    "I love machine learning and AI.",
    "Natural language processing is fascinating."
]

# Convert sentences to embeddings.
d            = model.get_sentence_embedding_dimension()      # Get embedding dimension.
xb           = np.array(model.encode(docs), dtype='float32') # Convert documents to vector representations.

# Construct vector DBs.
index_l2     = faiss.IndexFlatL2(d)     # L2 norm index.
index_ip     = faiss.IndexFlatIP(d)     # Inner product index.

nlist  = min(len(xb), 2)                # Number of clusters (recommended: √N).
nprobe = max(1, nlist // 2)             # Number of clusters to search.
m_pq   = min(d, 2)
n_bits = 2

index_ivf    = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, nlist)  # IVF with L2.
index_hnsw   = faiss.IndexHNSWFlat(d, 32)                          # HNSW with 32 neighbors per node.
index_pq     = faiss.IndexPQ(d, m_pq, n_bits)                      # PQ with 2 bits each, requiring 2^2=4 clusters.
index_ivfpq  = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, nlist, m_pq, n_bits)  # IVF + PQ.

# Train IVF and PQ indexes.
index_ivf.train(xb)   
index_pq.train(xb)
index_ivfpq.train(xb)  

# Add sentence embeddings to FAISS.
index_l2.add(xb)      
index_ip.add(xb)      
index_ivf.add(xb)     
index_hnsw.add(xb)    
index_pq.add(xb)      
index_ivfpq.add(xb)   

# Query example.
query        = "I enjoy studying artificial intelligence."   
q            = np.array(model.encode([query]), dtype='float32')  # Convert query to embedding.

# Find 'top_k' nearest neighbors.
top_k        = 2  

# Search and retrieval.
D_l2, I_l2   = index_l2.search(q, top_k)   # D: distance from each retrieval, I: index of each retrieval.
D_ip, I_ip   = index_ip.search(q, top_k)  

index_ivf.nprobe = nprobe  # Set number of clusters to search.
D_ivf, I_ivf     = index_ivf.search(q, top_k)  

D_hnsw, I_hnsw   = index_hnsw.search(q, top_k)  
D_pq, I_pq       = index_pq.search(q, top_k)  
D_ivfpq, I_ivfpq = index_ivfpq.search(q, top_k)  

# Display results.
print(f"🔍 Query: {query}")

print("\n📄 Retrieved Documents (L2 Distance):")
for i in I_l2[0]:
    print(f"- {docs[i]}")

print("\n📄 Retrieved Documents (Inner Product):")
for i in I_ip[0]:
    print(f"- {docs[i]}")

print("\n📄 Retrieved Documents (IVF):")
for i in I_ivf[0]:
    print(f"- {docs[i]}")

print("\n📄 Retrieved Documents (HNSW):")
for i in I_hnsw[0]:
    print(f"- {docs[i]}")

print("\n📄 Retrieved Documents (PQ):")
for i in I_pq[0]:
    print(f"- {docs[i]}")

print("\n📄 Retrieved Documents (IVF + PQ):")
for i in I_ivfpq[0]:
    print(f"- {docs[i]}")

# Store VDB.
faiss.write_index(index_l2, "./tmp/index_l2.faiss")
print(f"Number of vectors in VDB: {index_l2.ntotal}.")

# Store docs.
with open("./tmp/doc_rag_index_l2.pkl", "wb") as f:
    pickle.dump(docs, f)

🔍 Query: I enjoy studying artificial intelligence.

📄 Retrieved Documents (L2 Distance):
- I love machine learning and AI.
- Natural language processing is fascinating.

📄 Retrieved Documents (Inner Product):
- I love machine learning and AI.
- Natural language processing is fascinating.

📄 Retrieved Documents (IVF):
- I love machine learning and AI.
- Natural language processing is fascinating.

📄 Retrieved Documents (HNSW):
- I love machine learning and AI.
- Natural language processing is fascinating.

📄 Retrieved Documents (PQ):
- I love machine learning and AI.
- Natural language processing is fascinating.

📄 Retrieved Documents (IVF + PQ):
- I love machine learning and AI.
- Natural language processing is fascinating.
Number of vectors in VDB: 5.


# 5. Prompt Engineering.

## 5.1. Basic Prompt Structure.
- `Instruction`: Clearly define the task.
- `Context`: Provide necessary background information.
- `Input Data`: The specific input to be processed.
- `Output Format`: Specify how the response should be structured.

✔ **Be explicit** – Avoid vague instructions.  
✔ **Provide constraints** – Define length, format, or style.  
✔ **Use natural language** – Avoid overly technical language.  
✔ **Use step-by-step reasoning** – Encourage logical outputs.  

## 5.2. Types of Prompting.
- Different prompting strategies influence how the model generates responses. There are three main types:
  - Zero-Shot Prompting.
  - One-Shot Prompting.
  - Few-Shot Prompting.

## 5.3. Fine-Tuning vs. Prompting.  

| Approach          | Pros | Cons |
|------------------|------|------|
| **Prompt Optimization** | - Quickly learn a new task. <br>- Cost-effective. <br>- Works across multiple tasks. | - Limited control over the model.<br>- Performance may plateau. |
| **Fine-Tuning** | - Can significantly improve accuracy.<br>- Customizable for specific tasks.<br>- Model adapts better to new data. | - Higher cost.<br>- Overfitting. |


## 5.4. Advanced Prompt Techniques.

### 5.4.1. CoT Prompting.

- Chain-of-Thought (CoT) prompting breaks down complex problems into **step-by-step reasoning**, improving performance on logical and mathematical tasks.

✅ *Example (Without CoT)*:
> Q: A farmer has 3 apples. He buys 5 more and eats 2. How many apples are left?  
> A: 5.

✅ *Example (Few-Shot CoT)*:  
> **Example 1:**  
> Q: A farmer has 3 apples. He buys 5 more and eats 2. How many apples are left?  
> A: Let's think step by step. The farmer starts with 3 apples. He buys 5 more, so he has 3 + 5 = 8 apples. Then he eats 2, so 8 - 2 = 6 apples left.  
> **Final answer:** 6.  
>  
> **Example 2:**  
> Q: A bookstore receives 120 books. It sells 45 books and then restocks with 30 more. How many books are in the store now?  
> A: Let's think step by step. The bookstore starts with 120 books. It sells 45 books, so 120 - 45 = 75 books remain. Then it restocks 30 more, so 75 + 30 = 105 books.  
> **Final answer:** 105.  
>  
> **Now, solve this:**  
> Q: A bakery bakes 240 loaves of bread in the morning. It sells 90 loaves before noon and another 75 loaves in the afternoon. How many loaves are left at the end of the day?  
> A: Let's think step by step.  

🔹 **Why use it?**
- Improves reasoning for math, logic, and multi-step problems.
- Commonly used in **question answering, problem-solving, and programming tasks**.

---

### 5.4.2. Self-Consistency.

- Self-consistency improves accuracy by generating multiple responses and selecting the most consistent answer.
- Instead of relying on a single response, the model samples multiple outputs and aggregates the best one.

✅ *Example (Without Self-Consistency)*:
> Q: What is 17 × 23?  
> A: 374.

✅ *Example (With Self-Consistency)*:
> - First response: 391.  
> - Second response: 374.  
> - Third response: 391.  
> - **Final answer: 391 (majority vote).**

🔹 **Why use it?**  
- Reduces random errors in reasoning tasks.  
- Increases stability for **math, logic, and factual questions**.  
- Used in **LLM reasoning benchmarks and AI safety applications**.  

---

### 5.4.3. ReAct (Reason + Act).

- ReAct combines **reasoning** (thinking step-by-step) with **acting** (interacting with tools, APIs, or environments).
- It enables LLMs to retrieve real-time information and make dynamic decisions.

✅ *Example (Without ReAct)*:
> Q: What is the latest stock price of Tesla?  
> A: I don't know.

✅ *Example (With ReAct using a search tool)*:
> **Thought:** I need to check the latest Tesla stock price.  
> **Action:** Searching online...  
> **Observation:** Tesla stock price is $850.

> **Final Answer:** The latest Tesla stock price is $850.

🔹 **Why use it?**  
- Enables models to **retrieve and verify information dynamically**.  
- Used in **chatbots, AI assistants, and research agents**.  
- Key method for **LLMs integrated with APIs (LangChain, OpenAI Functions, Hugging Face Agents)**.

## 5.5. Examples.

### 5.5.1. Retrieval.

In [9]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# Load VDB.
index_l2 = faiss.read_index('./tmp/index_l2.faiss')

# Load documents.
with open("./tmp/doc_rag_index_l2.pkl", "rb") as f:
    docs = pickle.load(f)

# Query.
model        = SentenceTransformer("all-MiniLM-L6-v2")  

query = "I love cat."
query_embedding = np.array(model.encode([query]), dtype='float32')

# Retrieve.
top_k     = 2
distance, idx = index_l2.search(query_embedding, top_k)

retrieved_docs = [f"- {docs[i]}" for i in idx[0] if i < len(docs)]
retrieved_text = "\n".join(retrieved_docs)  # Join retrieved docs first.

### 5.5.2. Basic Prompt.

In [10]:
from langchain.prompts import PromptTemplate

template_basic = """Context:
{context}

Question: {question}

Answer:
"""

prompt_template_basic = PromptTemplate(
    input_variables=["context", "question"],
    template=template_basic
)

prompt_basic = prompt_template_basic.format(context=retrieved_text, question=query)

print("Generated Prompt:\n")
print(prompt_basic)


Generated Prompt:

Context:
- The cat sits on the mat.
- I love machine learning and AI.

Question: I love cat.

Answer:



### 5.5.3. CoT.

In [11]:
from langchain.prompts import PromptTemplate

template_cot = """Context:
{context}

Example 1:
Question: What is 2 + 2?
Answer: Let's think step by step. We start with the number 2. Adding another 2 gives us 4.
Final answer: 4.

Example 2:
Question: If a train travels at 60 km/h for 2 hours, how far does it go?
Answer: Let's think step by step. The train's speed is 60 km/h. In 2 hours, it travels 60 × 2 = 120 km.
Final answer: 120 km.

Now, answer the following question using the same reasoning:

Question: {question}

Answer:
Let's think step by step to provide a well-reasoned answer.
"""

prompt_template_cot = PromptTemplate(
    input_variables=["context", "question"],
    template=template_cot
)

prompt_cot = prompt_template_cot.format(context=retrieved_text, question=query)

print("Generated CoT Prompt:\n")
print(prompt_cot)


Generated CoT Prompt:

Context:
- The cat sits on the mat.
- I love machine learning and AI.

Example 1:
Question: What is 2 + 2?
Answer: Let's think step by step. We start with the number 2. Adding another 2 gives us 4.
Final answer: 4.

Example 2:
Question: If a train travels at 60 km/h for 2 hours, how far does it go?
Answer: Let's think step by step. The train's speed is 60 km/h. In 2 hours, it travels 60 × 2 = 120 km.
Final answer: 120 km.

Now, answer the following question using the same reasoning:

Question: I love cat.

Answer:
Let's think step by step to provide a well-reasoned answer.



### 5.5.4. Self-Consistency.

In [17]:
# Hard voting example.

# Use Counter.
from collections import Counter

# Extract final answers from responses.
responses = {  # collect responses with multiple queries.
    "Question: {question_1} \nAnswer: {answer_1}",
    "Question: {question_2} \nAnswer: {answer_2}",
    "Question: {question_3} \nAnswer: {answer_2}",
}
answers = [resp.split("Answer:")[-1].strip() for resp in responses if "Answer:" in resp]

# Perform hard voting (majority vote).
if answers:
    most_common_answer, count = Counter(answers).most_common(1)[0]
else:
    print("No consistent answer found.")
    most_common_answer = answers[0]    # Just use 1st answer.

print("Final Answer (Majority Vote):", most_common_answer)

Final Answer (Majority Vote): {answer_2}


### 5.5.5. ReAct.

In [13]:
from langchain.prompts import PromptTemplate

# Implement 'Action: Search[]' for practice.
template_react = """Context:
{context}

Example 1:
Question: What is the capital of France?
Thought: The capital of France is well known. I will recall the information.
Action: Answer directly
Observation: Paris
Final Answer: Paris

Example 2:
Question: What is the latest stock price of Tesla?
Thought: I need real-time stock price information. I will search online.
Action: Search["Tesla stock price"]
Observation: Tesla's stock price is $850.
Final Answer: Tesla's stock price is $850.

Now, follow the same reasoning process to answer the question.

Question: {question}

Thought:
"""

prompt_template_react = PromptTemplate(
    input_variables=["context", "question"],
    template=template_react
)

prompt_react = prompt_template_react.format(context=retrieved_text, question=query)

print("Generated ReAct Prompt:\n")
print(prompt_react)

Generated ReAct Prompt:

Context:
- The cat sits on the mat.
- I love machine learning and AI.

Example 1:
Question: What is the capital of France?
Thought: The capital of France is well known. I will recall the information.
Action: Answer directly
Observation: Paris
Final Answer: Paris

Example 2:
Question: What is the latest stock price of Tesla?
Thought: I need real-time stock price information. I will search online.
Action: Search["Tesla stock price"]
Observation: Tesla's stock price is $850.
Final Answer: Tesla's stock price is $850.

Now, follow the same reasoning process to answer the question.

Question: I love cat.

Thought:



# 6. LLM for Answer.

In [14]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# Load VDB.
index_l2 = faiss.read_index('./tmp/index_l2.faiss')

# Load documents.
with open("./tmp/doc_rag_index_l2.pkl", "rb") as f:
    docs = pickle.load(f)

# Query.
model        = SentenceTransformer("all-MiniLM-L6-v2")  

query = "I love cat."
query_embedding = np.array(model.encode([query]), dtype='float32')

# Retrieve.
top_k     = 2
distance, idx = index_l2.search(query_embedding, top_k)

retrieved_docs = [f"- {docs[i]}" for i in idx[0] if i < len(docs)]
retrieved_text = "\n".join(retrieved_docs)

from langchain.prompts import PromptTemplate

template_cot = """Context:
{context}

Example 1:
Question: What is 2 + 2?
Answer: Let's think step by step. We start with the number 2. Adding another 2 gives us 4.
Final answer: 4.

Example 2:
Question: If a train travels at 60 km/h for 2 hours, how far does it go?
Answer: Let's think step by step. The train's speed is 60 km/h. In 2 hours, it travels 60 × 2 = 120 km.
Final answer: 120 km.

Now, answer the following question using the same reasoning:

Question: {question}

Answer:
Let's think step by step to provide a well-reasoned answer.
"""

prompt_template_cot = PromptTemplate(
    input_variables=["context", "question"],
    template=template_cot
)

prompt_cot = prompt_template_cot.format(context=retrieved_text, question=query)

print("Generated CoT Prompt:\n")
print(prompt_cot)


Generated CoT Prompt:

Context:
- The cat sits on the mat.
- I love machine learning and AI.

Example 1:
Question: What is 2 + 2?
Answer: Let's think step by step. We start with the number 2. Adding another 2 gives us 4.
Final answer: 4.

Example 2:
Question: If a train travels at 60 km/h for 2 hours, how far does it go?
Answer: Let's think step by step. The train's speed is 60 km/h. In 2 hours, it travels 60 × 2 = 120 km.
Final answer: 120 km.

Now, answer the following question using the same reasoning:

Question: I love cat.

Answer:
Let's think step by step to provide a well-reasoned answer.



In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

# Load the tokenizer and model
model_name = "distilgpt2"  # or choose another model as needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, 
                          max_length=200, truncation=True)

# Define the LLM using the updated HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=text_generator)

# Simply generate the answer by invoking the LLM on prompt_cot:
result = llm.invoke(prompt_cot)

# Print the generated answer
print("Generated Answer:")
print(result)

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Answer:
Context:
- The cat sits on the mat.
- I love machine learning and AI.

Example 1:
Question: What is 2 + 2?
Answer: Let's think step by step. We start with the number 2. Adding another 2 gives us 4.
Final answer: 4.

Example 2:
Question: If a train travels at 60 km/h for 2 hours, how far does it go?
Answer: Let's think step by step. The train's speed is 60 km/h. In 2 hours, it travels 60 × 2 = 120 km.
Final answer: 120 km.

Now, answer the following question using the same reasoning:

Question: I love cat.

Answer:
Let's think step by step to provide a well-reasoned answer.
Answer:
But, this really doesn't happen! Suppose we get a list of 100 people running in a 100-min range,
