In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm





### 1. Knowledge Base (KB) Creation and Embedding 

In [3]:
# Chunk text file, then calculate embeddings by calling model.encode()
knowledgeBase = "C:/Users/awubs/OneDrive/Desktop/AD331/KB.txt"

with open(knowledgeBase, 'r', encoding='utf-8') as f:
    # Use .strip() to remove leading/trailing whitespace, and filter out empty lines
    knowledge_chunks = [line.strip() for line in f if line.strip()]

chunk_embeddings = model.encode(knowledge_chunks)
print(chunk_embeddings.shape)

(3, 384)


### 2. Indexing 

In [4]:
import faiss

embedding_dimension = chunk_embeddings.shape[1]  # For 'all-MiniLM-L6-v2'
number_of_vectors = len(knowledge_chunks)

faiss_data = chunk_embeddings.astype('float32')

index = faiss.IndexFlatL2(embedding_dimension)
index.add(faiss_data)

print(f"Index created successfully!")
print(f"Vector dimension: {index.d}")
print(f"Total vectors indexed: {index.ntotal}")

Index created successfully!
Vector dimension: 384
Total vectors indexed: 3


### 3. Retrieval

In [None]:
def retrieve_chunks(query: str, k: int) -> list[str]:
    """
    Takes a user query, embeds it, and searches the FAISS index to retrieve the
    top 'k' most relevant chunks from the knowledge base.
    """
    # 1. Generate Query Embedding
    # The [None] is used to add a batch dimension (shape: [1, dimension])
    query_embedding = model.encode(query)
    query_vector = np.array([query_embedding]).astype('float32')

    # 2. Perform Similarity Search (Nearest Neighbor)
    # D: Distances (or scores)
    # I: Indices of the nearest neighbors in the FAISS index
    # FAISS uses L2 distance for IndexFlatL2, where smaller distance = higher similarity
    Distances, Indices = index.search(query_vector, k)

    # 3. Retrieve the Chunks
    retrieved_chunks = []
    for i, index_id in enumerate(Indices[0]):
        # index_id corresponds to the index in the original 'chunks' list
        # Check if index_id is valid (sometimes -1 is returned if k > n_total)
        if index_id >= 0:
            retrieved_chunks.append(knowledge_chunks[index_id])

    print(f"--- Retrieved Top {len(retrieved_chunks)} Chunks ---")
    for i, chunk in enumerate(retrieved_chunks):
        print(f"Chunk {i+1} (FAISS Index {Indices[0][i]} | Distance: {Distances[0][i]:.4f}):")
        print(chunk.strip())
        print("-" * 20)
        
    return retrieved_chunks


### 4. Generation (Augmentation)

In [6]:
# Construct a final prompt for a pre-trained LLM  that includes the original query AND the retrieved context chunks.

def create_augmented_prompt(query: str, context: list[str]) -> str:
    """
    Constructs the final prompt for the LLM by framing the context
    and demanding an answer based *only* on that context.
    """
    context_str = "\n".join([f"--- Context Chunk ---\n{c}" for c in context])

    # The augmented prompt template
    prompt_template = f"""
    --- USER QUESTION ---
    {query}

    Answer From LLM:
    """
    return prompt_template.strip()

# Generate the final prompt
from transformers import pipeline
qa_pipeline = pipeline("text-generation", model="gpt2")

Device set to use cpu


In [14]:
user_question = input("Enter a query:")
retrieved_context = retrieve_chunks(user_question, k=3)
final_prompt = create_augmented_prompt(user_question, retrieved_context)
final_answer = qa_pipeline(final_prompt, max_new_tokens=50, temperature=0.8, do_sample=True, top_p=0.9, repetition_penalty=1.5)[0]['generated_text']

print(final_answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- Retrieved Top 3 Chunks ---
Chunk 1 (FAISS Index 2 | Distance: 1.4335):
Indrik: A mythical creature from Slavic folklore, the indrik is a six-legged beast that is said to possess a powerful sense of smell, which it uses to locate water sources.
--------------------
Chunk 2 (FAISS Index 1 | Distance: 1.6237):
Baku: A benevolent dream-eating creature from Japanese mythology, the baku is said to consume nightmares, which helps people sleep better.
--------------------
Chunk 3 (FAISS Index 0 | Distance: 1.6928):
Bake-kujira: A ghostly whale from Japanese folklore that appears on the sea, often followed by a host of ghostly fish.
--------------------
--- USER QUESTION ---
    Slavic

    Answer From LLM: On the basis of this, you would not be able to answer that question. In particular it is impossible for anyone who cannot understand what they are saying if one does so with a very limited understanding and experience as well (e.-g., in school


### 5. Testing

In [8]:
test_q1="What is the primary function of the Baku?"
retrieved_context = retrieve_chunks(test_q1, k=1)
final_prompt = create_augmented_prompt(test_q1, retrieved_context)
final_answer = qa_pipeline(final_prompt, max_new_tokens=50, temperature=0.8, do_sample=True, top_p=0.9, repetition_penalty=1.5)[0]['generated_text']

print(final_answer)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- Retrieved Top 1 Chunks ---
Chunk 1 (FAISS Index 1 | Distance: 0.9490):
Baku: A benevolent dream-eating creature from Japanese mythology, the baku is said to consume nightmares, which helps people sleep better.
--------------------
--- USER QUESTION ---
    What is the primary function of the Baku?

    Answer From LLM: Â  There are a number that must be discussed here. The main one being "the central role". I can say with absolute certainty what this position actually stands on, but it's just as much an open question to me why there isn't any official


In [9]:
test_q2="How are you supposed to cook sausages?"
retrieved_context = retrieve_chunks(test_q2, k=1)
final_prompt = create_augmented_prompt(test_q2, retrieved_context)
final_answer = qa_pipeline(final_prompt, max_new_tokens=50, temperature=0.8, do_sample=True, top_p=0.9, repetition_penalty=1.5)[0]['generated_text']

print(final_answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- Retrieved Top 1 Chunks ---
Chunk 1 (FAISS Index 2 | Distance: 1.8035):
Indrik: A mythical creature from Slavic folklore, the indrik is a six-legged beast that is said to possess a powerful sense of smell, which it uses to locate water sources.
--------------------
--- USER QUESTION ---
    How are you supposed to cook sausages?

    Answer From LLM: I'm usually on a quest for the best recipes and have only just come across an excellent recipe. That's not how it works here, except that this is my first time eating one of these...the other day (my last), as soon after


In [10]:
test_q3="Which creature has a powerful sense of smell, and which creature is known for consuming dreams?"
retrieved_context = retrieve_chunks(test_q3, k=2)
final_prompt = create_augmented_prompt(test_q3, retrieved_context)
final_answer = qa_pipeline(final_prompt, max_new_tokens=50, temperature=0.8, do_sample=True, top_p=0.9, repetition_penalty=1.5)[0]['generated_text']

print(final_answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- Retrieved Top 2 Chunks ---
Chunk 1 (FAISS Index 1 | Distance: 0.9379):
Baku: A benevolent dream-eating creature from Japanese mythology, the baku is said to consume nightmares, which helps people sleep better.
--------------------
Chunk 2 (FAISS Index 2 | Distance: 1.1586):
Indrik: A mythical creature from Slavic folklore, the indrik is a six-legged beast that is said to possess a powerful sense of smell, which it uses to locate water sources.
--------------------
--- USER QUESTION ---
    Which creature has a powerful sense of smell, and which creature is known for consuming dreams?

    Answer From LLM: The following are the answers to questions 1. Will it be possible (or advisable) if you eat them before waking up in order that they will not wake people when seen by their senses during sleep or from outside view? 2. Can I make an
