In [1]:
with open('Dataset/StudentHandbookDataset.txt', 'r', encoding='utf-8') as f:
    dataset = f.read()

print(f"Dataset loaded: {len(dataset):,} characters")
print(f"Estimated pages: ~{len(dataset) // 2000}")

Dataset loaded: 171,284 characters
Estimated pages: ~85


In [2]:
import torch
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Step 1: Chunking (RUN ONCE ONLY)

This section handles text chunking and saves the results. You only need to run this **once** or when:
- You update your dataset
- You want to change chunking strategy (e.g., chunk size, overlap)
- The saved chunks file is deleted

**Note**: After running once, skip to Step 2 for embedding experiments!

In [15]:
def check_chunks_exist():
    """Check if chunks have been saved"""
    return os.path.exists("saved_chunks/chunks.pkl")

def save_chunks(chunks, filename="saved_chunks/chunks.pkl"):
    """Save chunks to disk"""
    os.makedirs("saved_chunks", exist_ok=True)
    with open(filename, 'wb') as f:
        pickle.dump(chunks, f)
    print(f"[SUCCESS] Saved {len(chunks)} chunks to {filename}")

def load_chunks(filename="saved_chunks/chunks.pkl"):
    """Load chunks from disk"""
    try:
        with open(filename, 'rb') as f:
            chunks = pickle.load(f)
        print(f"[SUCCESS] Loaded {len(chunks)} chunks from {filename}")
        return chunks
    except FileNotFoundError:
        print(f"[ERROR] Chunks file not found: {filename}")
        return None

# Check if chunks already exist
if check_chunks_exist():
    print("[SUCCESS] Chunks file found! You can skip to Step 2 (Embedding Experiments)")
    print("[INFO] To reload chunks, run: chunks = load_chunks()")
else:
    print("[INFO] No saved chunks found. Run the chunking cells below.")

[INFO] No saved chunks found. Run the chunking cells below.


In [16]:
# ONLY RUN THIS IF CHUNKS DON'T EXIST OR YOU WANT TO RE-CHUNK
print("Loading embedding model for semantic chunking...")
print("   Using CPU for stable performance")

chunking_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

print("Setting up semantic chunker...")
text_splitter = SemanticChunker(
    embeddings=chunking_embed_model,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=80,
    buffer_size=1,
    add_start_index=True
)
print("[SUCCESS] Chunker ready (using CPU)!")

Loading embedding model for semantic chunking...
   Using CPU for stable performance
Setting up semantic chunker...
[SUCCESS] Chunker ready (using CPU)!
Setting up semantic chunker...
[SUCCESS] Chunker ready (using CPU)!


In [17]:
# ONLY RUN THIS IF CHUNKS DON'T EXIST OR YOU WANT TO RE-CHUNK
print("Creating semantic chunks from raw text...")
chunks = text_splitter.create_documents([dataset])
print(f"[SUCCESS] Created {len(chunks)} semantic chunks")

# Analyze chunk quality
chunk_sizes = [len(chunk.page_content) for chunk in chunks]
print(f"\nChunk Analysis:")
print(f"   Average size: {np.mean(chunk_sizes):.0f} characters")
print(f"   Size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")
print(f"   Total chunks: {len(chunks)}")

# Show sample chunks
print("\nSample chunks:")
for i in range(min(3, len(chunks))):
    chunk_preview = chunks[i].page_content[:150].replace('\n', ' ')
    print(f"   Chunk {i+1}: {chunk_preview}...")

# SAVE THE CHUNKS!
save_chunks(chunks)
print("\n[SUCCESS] Chunking complete and saved! You won't need to run this again.")

Creating semantic chunks from raw text...
[SUCCESS] Created 246 semantic chunks

Chunk Analysis:
   Average size: 650 characters
   Size range: 2 - 7102 characters
   Total chunks: 246

Sample chunks:
   Chunk 1: ﻿Republic of the Philippines  Eulogio "Amang" Rodriguez Institute of Science and Technology Office of Student Affairs and Services   EARIST STUDENT HA...
   Chunk 2: ii - HISTORY OF EARIST ..... 1 - MISSION STATEMENTS   - Vision ..... 3   - Mission ..... 3   - Goal ........
   Chunk 3: 3   - Objectives ..... 3 - CURRICULAR OFFERINGS   - Main Campus     - College of Architecture and Fine Arts ..... 4     - College of Arts and Sciences...
[SUCCESS] Saved 246 chunks to saved_chunks/chunks.pkl

[SUCCESS] Chunking complete and saved! You won't need to run this again.
[SUCCESS] Created 246 semantic chunks

Chunk Analysis:
   Average size: 650 characters
   Size range: 2 - 7102 characters
   Total chunks: 246

Sample chunks:
   Chunk 1: ﻿Republic of the Philippines  Eulogio "Amang" R

---

## Step 2: Load Chunks & Experiment with Embeddings

Start here after chunking is done! This section lets you experiment with different embedding models.

In [18]:
# Load saved chunks
chunks = load_chunks()

if chunks is None:
    print("[ERROR] Please run Step 1 (Chunking) first!")
else:
    print(f"[SUCCESS] Ready to experiment with {len(chunks)} chunks!")

[SUCCESS] Loaded 246 chunks from saved_chunks/chunks.pkl
[SUCCESS] Ready to experiment with 246 chunks!


In [None]:
### Choose Your Embedding Model

Experiment with different embedding models here! Uncomment the one you want to try:

**Options:**
- `all-mpnet-base-v2`: Best quality, slower (768 dim)
- `all-MiniLM-L6-v2`: Fast, good quality (384 dim) 
- `multi-qa-mpnet-base-dot-v1`: Optimized for Q&A
- `paraphrase-multilingual-mpnet-base-v2`: Multi-language support
- Or try any model from https://huggingface.co/sentence-transformers

In [19]:
# EXPERIMENT: Choose your embedding model
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # Change this to experiment!

# Alternative options to try:
# MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Faster, smaller
# MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"  # Q&A optimized
# MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"  # Multilingual

print(f"\nInitializing embedding model: {MODEL_NAME}")
print("   Using CPU for stable performance")

embedding_model = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs={
        'device': 'cpu',
        'trust_remote_code': True
    },
    encode_kwargs={'normalize_embeddings': True}
)

# Get embedding dimension
test_embed = embedding_model.embed_query("test")

print(f"[SUCCESS] Embedding model loaded: {MODEL_NAME.split('/')[-1]}")
print(f"Embedding dimension: {len(test_embed)}")
print(f"Device: CPU")


Initializing embedding model: sentence-transformers/all-mpnet-base-v2
   Using CPU for stable performance
[SUCCESS] Embedding model loaded: all-mpnet-base-v2
Embedding dimension: 768
Device: CPU
[SUCCESS] Embedding model loaded: all-mpnet-base-v2
Embedding dimension: 768
Device: CPU


In [20]:
print("\nGenerating embeddings for all chunks with current model...")
chunk_texts = [chunk.page_content for chunk in chunks]

# Process embeddings in batches to avoid memory issues
batch_size = 32
all_embeddings = []

import time
start_time = time.time()

for i in range(0, len(chunk_texts), batch_size):
    batch = chunk_texts[i:i+batch_size]
    batch_embeddings = embedding_model.embed_documents(batch)
    all_embeddings.extend(batch_embeddings)
    print(f"   Processed batch {i//batch_size + 1}/{(len(chunk_texts) + batch_size - 1)//batch_size}")

elapsed_time = time.time() - start_time
print(f"\n[SUCCESS] Generated {len(all_embeddings)} embeddings in {elapsed_time:.2f}s")
print(f"Average: {elapsed_time/len(all_embeddings):.3f}s per chunk")


Generating embeddings for all chunks with current model...
   Processed batch 1/8
   Processed batch 1/8
   Processed batch 2/8
   Processed batch 2/8
   Processed batch 3/8
   Processed batch 3/8
   Processed batch 4/8
   Processed batch 4/8
   Processed batch 5/8
   Processed batch 5/8
   Processed batch 6/8
   Processed batch 6/8
   Processed batch 7/8
   Processed batch 7/8
   Processed batch 8/8

[SUCCESS] Generated 246 embeddings in 35.86s
Average: 0.146s per chunk
   Processed batch 8/8

[SUCCESS] Generated 246 embeddings in 35.86s
Average: 0.146s per chunk


In [21]:
# Build FAISS vector store for fast similarity search
print("\nBuilding FAISS vector database...")
dimension = len(all_embeddings[0])
index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity

# Normalize embeddings for proper cosine similarity
embeddings_array = np.array(all_embeddings).astype('float32')
faiss.normalize_L2(embeddings_array)
index.add(embeddings_array)

print(f"[SUCCESS] FAISS index ready: {index.ntotal:,} vectors ({dimension} dimensions)")
print(f"Model: {MODEL_NAME.split('/')[-1]}")


Building FAISS vector database...
[SUCCESS] FAISS index ready: 246 vectors (768 dimensions)
Model: all-mpnet-base-v2


---

## Step 3: Retrieval Testing

Now test your retrieval system with the current embedding model!

In [22]:
def retrieve_relevant_chunks(query, top_k=5):
    """Find most relevant chunks for the query"""
    print(f"\nSearching for: '{query}'")
    print(f"Model: {MODEL_NAME.split('/')[-1]}")
    print("=" * 50)
    
    # Embed the query
    query_embedding = embedding_model.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    faiss.normalize_L2(query_vector)

    # Search FAISS index
    scores, indices = index.search(query_vector, top_k)

    # Return results with metadata
    results = []
    for idx, score in zip(indices[0], scores[0]):
        if idx < len(chunks):  # Safety check
            chunk = chunks[idx]
            results.append({
                'text': chunk.page_content,
                'score': float(score),
                'chunk_id': int(idx),
                'start_pos': chunk.metadata.get('start_index', 0) if hasattr(chunk, 'metadata') else 0
            })

    return results

def display_retrieval_results(query, results):
    """Display comprehensive retrieval results"""
    print(f"Found {len(results)} relevant chunks for: '{query}'")
    print("=" * 70)
    
    for i, result in enumerate(results, 1):
        print(f"\nChunk {i} (ID: {result['chunk_id']})")
        print(f"Relevance Score: {result['score']:.4f}")
        print(f"Position in Document: Character {result['start_pos']:,}")
        print(f"Length: {len(result['text'])} characters")
        print(f"Content Preview (first 200 chars):")
        print(f"   {result['text'][:200].replace(chr(10), ' ').replace(chr(13), '')}...")
        
        # Show full content if it's short enough
        if len(result['text']) <= 500:
            print(f"Full Content:")
            print(f"   {result['text']}")
        
        print("-" * 40)
    
    return results

print("[SUCCESS] Retrieval functions ready!")

[SUCCESS] Retrieval functions ready!


## Understanding `top_k=5`

The `top_k=5` parameter means we retrieve the **5 most relevant chunks** for each query. Here's what this does:

1. **Similarity Search**: When you ask a question, the system converts it to a vector and finds the most similar document chunks
2. **Ranking**: All chunks get a similarity score (0-1, where 1 is perfect match)
3. **Top Results**: We take only the top 5 highest-scoring chunks
4. **Why 5?**: This balances between:
   - **Coverage**: Enough context to answer most questions
   - **Quality**: Avoiding too many irrelevant results
   - **Speed**: Faster processing with fewer chunks

**You can adjust this number**:
- `top_k=3`: Fewer, more focused results
- `top_k=10`: More comprehensive but potentially noisier results
- `top_k=1`: Just the single best match

In [26]:
# Test the retrieval system with sample queries
print("\nTesting Retrieval System!")
print("=" * 50)

# Sample test questions for university handbook
test_questions = [
    "What are the graduation requirements?",
    "How do I withdraw from a course?",
    "What is the academic probation policy?",
    "What happens if I'm caught cheating?",
    "How do I change my major?"
]

print("Running test queries...")

# Test with the first question
test_query = test_questions[0]
test_results = retrieve_relevant_chunks(test_query, top_k=5)
display_retrieval_results(test_query, test_results)


Testing Retrieval System!
Running test queries...

Searching for: 'What are the graduation requirements?'
Model: all-mpnet-base-v2
Found 5 relevant chunks for: 'What are the graduation requirements?'

Chunk 1 (ID: 84)
Relevance Score: 0.6105
Position in Document: Character 59,469
Length: 86 characters
Content Preview (first 200 chars):
   2. Must carry a minimum academic load of 18 units (except for graduating students). 3....
Full Content:
   2. Must carry a minimum academic load of 18 units (except for graduating students). 3.
----------------------------------------

Chunk 2 (ID: 85)
Relevance Score: 0.5864
Position in Document: Character 59,555
Length: 161 characters
Content Preview (first 200 chars):
   Must have a passing grade in all subjects including P.E. and NSTP (MTSLTS/CWTS) enrolled in order to qualify for continuance for the following semester. ##### b....
Full Content:
   Must have a passing grade in all subjects including P.E. and NSTP (MTSLTS/CWTS) enrolled in order t

[{'text': '2. Must carry a minimum academic load of 18 units (except for graduating students). 3.',
  'score': 0.6104676723480225,
  'chunk_id': 84,
  'start_pos': 59469},
 {'text': 'Must have a passing grade in all subjects including P.E. and NSTP (MTSLTS/CWTS) enrolled in order to qualify for continuance for the following semester. ##### b.',
  'score': 0.5863968133926392,
  'chunk_id': 85,
  'start_pos': 59555},
  'score': 0.5534185767173767,
  'chunk_id': 51,
  'start_pos': 50050},
 {'text': 'TERTIARY EDUCATION PROGRAM\n\n\n##### a. Qualifications\n1. Must be enrolled.',
  'score': 0.5496562123298645,
  'chunk_id': 65,
  'start_pos': 56945},
 {'text': '## SECTION 10. GRADUATION WITH HONORS\n\n\n### 10.1 The College Dean/Campus Administrator, in close coordination with the College Registrar, shall recommend a student who completes his baccalaureate course with any of the following cumulative grade point averages to be graduated with honors:\n\n\nHonor: Summa cum Laude\n* Required Gr

In [23]:
def interactive_retrieval():
    """Interactive mode for testing retrieval"""
    print(f"\nInteractive Retrieval Testing Mode")
    print(f"Current model: {MODEL_NAME.split('/')[-1]}")
    print("Test how well the system finds relevant information!")
    print("Type 'quit' to exit, 'batch' to run all test questions, or 'help' for commands")
    print("-" * 70)

    while True:
        question = input("\n❓ Enter your question (or command): ").strip()

        if question.lower() in ['quit', 'exit', 'q']:
            print("Goodbye! To try a different embedding model, change MODEL_NAME and re-run from Step 2.")
            break
            
        elif question.lower() == 'help':
            print("Available commands:")
            print("   'quit' or 'q' - Exit interactive mode")
            print("   'batch' - Run all predefined test questions")
            print("   'test1', 'test2', etc. - Run specific test question")
            print("   'model' - Show current embedding model")
            print("   Or just type any question to search!")
            continue
            
        elif question.lower() == 'model':
            print(f"Current embedding model: {MODEL_NAME}")
            print(f"Embedding dimension: {len(all_embeddings[0])}")
            print(f"Total chunks indexed: {len(chunks)}")
            continue
            
        elif question.lower() == 'batch':
            print("\nRunning batch test of all questions...")
            for i, test_q in enumerate(test_questions, 1):
                print(f"\n{'='*20} TEST {i}/5 {'='*20}")
                results = retrieve_relevant_chunks(test_q, top_k=3)  # Use top_k=3 for batch
                display_retrieval_results(test_q, results)
            continue
            
        elif question.lower().startswith('test') and len(question) > 4 and question[4:].isdigit():
            test_num = int(question[4:]) - 1
            if 0 <= test_num < len(test_questions):
                test_q = test_questions[test_num]
                print(f"\nRunning test question {test_num + 1}")
                results = retrieve_relevant_chunks(test_q, top_k=5)
                display_retrieval_results(test_q, results)
            else:
                print(f"[ERROR] Test number must be between 1 and {len(test_questions)}")
            continue

        if not question:
            print("Please enter a question!")
            continue

        try:
            results = retrieve_relevant_chunks(question, top_k=5)
            display_retrieval_results(question, results)
            
            # Ask if user wants to try different top_k values
            while True:
                modify = input("\nTry different number of results? (Enter number 1-10, or 'n' for no): ").strip()
                if modify.lower() in ['n', 'no', '']:
                    break
                elif modify.isdigit() and 1 <= int(modify) <= 10:
                    new_k = int(modify)
                    print(f"\nRetrieving top {new_k} results...")
                    new_results = retrieve_relevant_chunks(question, top_k=new_k)
                    display_retrieval_results(question, new_results)
                    break
                else:
                    print("Please enter a number between 1-10 or 'n'")
                    
        except Exception as e:
            print(f"[ERROR] {str(e)}")

print("[SUCCESS] Interactive retrieval function ready!")
print("[INFO] Call interactive_retrieval() to start testing!")

[SUCCESS] Interactive retrieval function ready!
[INFO] Call interactive_retrieval() to start testing!


In [25]:
# Start interactive mode
interactive_retrieval()


Interactive Retrieval Testing Mode
Current model: all-mpnet-base-v2
Test how well the system finds relevant information!
Type 'quit' to exit, 'batch' to run all test questions, or 'help' for commands
----------------------------------------------------------------------

Searching for: 'what is the requirement if i want to apply for computer science'
Model: all-mpnet-base-v2
Found 5 relevant chunks for: 'what is the requirement if i want to apply for computer science'

Chunk 1 (ID: 83)
Relevance Score: 0.5325
Position in Document: Character 59,388
Length: 81 characters
Content Preview (first 200 chars):
   Qualifications 1. Currently enrolled in one of the courses in any of the college....
Full Content:
   Qualifications
1. Currently enrolled in one of the courses in any of the college.
----------------------------------------

Chunk 2 (ID: 30)
Relevance Score: 0.5012
Position in Document: Character 14,536
Length: 2316 characters
Content Preview (first 200 chars):
   Admission Require