In [2]:
with open('Dataset/StudentHandbookDataset.txt', 'r', encoding='utf-8') as f:
    dataset = f.read()

print(f"üìö Dataset loaded: {len(dataset):,} characters")
print(f"üìÑ Estimated pages: ~{len(dataset) // 2000}")

üìö Dataset loaded: 171,284 characters
üìÑ Estimated pages: ~85


In [3]:
import torch
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings('ignore')

In [4]:
print("\nüîß Initializing embedding model...")
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",  # Best quality for academic text
    model_kwargs={
        'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        'trust_remote_code': True
    },
    encode_kwargs={'normalize_embeddings': True}
)
print("‚úÖ Embedding model loaded: all-mpnet-base-v2 (768 dimensions)")


üîß Initializing embedding model...
‚úÖ Embedding model loaded: all-mpnet-base-v2 (768 dimensions)


In [5]:
print("\nüîß Setting up semantic chunker...")
text_splitter = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=80,  # Good balance for policy docs
    buffer_size=1,
    add_start_index=True  # Track position in original text
)


üîß Setting up semantic chunker...


In [6]:
# 3. Split raw text into semantic chunks using create_documents
print("\nüìù Creating semantic chunks from raw text...")
chunks = text_splitter.create_documents([dataset])
print(f"‚úÖ Created {len(chunks)} semantic chunks")

# Analyze chunk quality
chunk_sizes = [len(chunk.page_content) for chunk in chunks]
print(f"\nüìä Chunk Analysis:")
print(f"   Average size: {np.mean(chunk_sizes):.0f} characters")
print(f"   Size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")
print(f"   Total chunks: {len(chunks)}")

# Show sample chunks
print("\nüìã Sample chunks:")
for i in range(min(3, len(chunks))):
    chunk_preview = chunks[i].page_content[:150].replace('\n', ' ')
    print(f"   Chunk {i+1}: {chunk_preview}...")



üìù Creating semantic chunks from raw text...
‚úÖ Created 246 semantic chunks

üìä Chunk Analysis:
   Average size: 650 characters
   Size range: 2 - 10109 characters
   Total chunks: 246

üìã Sample chunks:
   Chunk 1: ÔªøRepublic of the Philippines  Eulogio "Amang" Rodriguez Institute of Science and Technology Office of Student Affairs and Services   EARIST STUDENT HA...
   Chunk 2: ii - HISTORY OF EARIST ..... 1 - MISSION STATEMENTS   - Vision ..... 3   - Mission ..... 3   - Goal ........
   Chunk 3: 3   - Objectives ..... 3 - CURRICULAR OFFERINGS   - Main Campus     - College of Architecture and Fine Arts ..... 4     - College of Arts and Sciences...


In [7]:
print("\nüîÑ Generating embeddings for all chunks...")
chunk_texts = [chunk.page_content for chunk in chunks]

# Process embeddings in batches to avoid memory issues
batch_size = 32
all_embeddings = []
for i in range(0, len(chunk_texts), batch_size):
    batch = chunk_texts[i:i+batch_size]
    batch_embeddings = embedding_model.embed_documents(batch)
    all_embeddings.extend(batch_embeddings)
    print(f"   Processed batch {i//batch_size + 1}/{(len(chunk_texts) + batch_size - 1)//batch_size}")

print(f"‚úÖ Generated {len(all_embeddings)} embeddings")




üîÑ Generating embeddings for all chunks...
   Processed batch 1/8
   Processed batch 2/8
   Processed batch 3/8
   Processed batch 4/8
   Processed batch 5/8
   Processed batch 6/8
   Processed batch 7/8
   Processed batch 8/8
‚úÖ Generated 246 embeddings


In [8]:

# 5. Build FAISS vector store for fast similarity search
print("\nüóÑÔ∏è Building FAISS vector database...")
dimension = len(all_embeddings[0])
index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity

# Normalize embeddings for proper cosine similarity
embeddings_array = np.array(all_embeddings).astype('float32')
faiss.normalize_L2(embeddings_array)
index.add(embeddings_array)

print(f"‚úÖ FAISS index ready: {index.ntotal:,} vectors ({dimension} dimensions)")



üóÑÔ∏è Building FAISS vector database...
‚úÖ FAISS index ready: 246 vectors (768 dimensions)


In [9]:
# 6. Load high-quality language model for generation
print("\nü§ñ Loading language model...")

# Configure 4-bit quantization for T4 GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)



ü§ñ Loading language model...


In [9]:
from huggingface_hub import notebook_login

# This will prompt you to enter your HF token
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [10]:
# Use Mistral 7B for quality (perfect for T4)
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

print(f"‚úÖ Model loaded: {model_name}")
print(f"üéØ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Model loaded: mistralai/Mistral-7B-Instruct-v0.1
üéØ GPU Memory: 8.5GB


In [11]:
def retrieve_relevant_chunks(query, top_k=5):
    """Find most relevant chunks for the query"""
    # Embed the query
    query_embedding = embedding_model.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    faiss.normalize_L2(query_vector)

    # Search FAISS index
    scores, indices = index.search(query_vector, top_k)

    # Return results with metadata
    results = []
    for idx, score in zip(indices[0], scores[0]):
        if idx < len(chunks):  # Safety check
            chunk = chunks[idx]
            results.append({
                'text': chunk.page_content,
                'score': float(score),
                'chunk_id': int(idx),
                'start_pos': chunk.metadata.get('start_index', 0) if hasattr(chunk, 'metadata') else 0
            })

    return results

# def generate_answer(query, context_chunks, max_new_tokens=350):
#     """Generate answer using retrieved context"""
#     # Combine context from relevant chunks
#     context_parts = []
#     for i, chunk in enumerate(context_chunks):
#         context_parts.append(f"[Section {i+1}]\n{chunk['text']}")

#     combined_context = "\n\n".join(context_parts)

#     # Create optimized prompt for university handbook
#     prompt = f"""<s>[INST] You are a university student advisor with access to the official student handbook. Answer the student's question accurately using only the provided handbook sections.

# HANDBOOK SECTIONS:
# {combined_context}

# STUDENT QUESTION: {query}

# Provide a clear, helpful answer based on the handbook information above. If the handbook doesn't contain enough information, say so. Be specific about policies, procedures, and requirements. [/INST]"""

#     # Tokenize and generate
#     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3072)
#     inputs = {k: v.to(model.device) for k, v in inputs.items()}

#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             temperature=0.2,  # Low for factual accuracy
#             do_sample=True,
#             top_p=0.9,
#             repetition_penalty=1.1,
#             pad_token_id=tokenizer.eos_token_id,
#             eos_token_id=tokenizer.eos_token_id
#         )

#     # Extract generated answer
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     answer_start = response.find("[/INST]") + len("[/INST]")
#     answer = response[answer_start:].strip()

#     return answer


def generate_answer(query, context_chunks, max_new_tokens=350):

    if not context_chunks:
        # Handle case where no context is retrieved
        combined_context = "No information found in the handbook."
    else:
        # --- MODIFICATION 1: Re-ranking and XML-style Tags ---
        # ASSUMPTION: context_chunks is a list of dicts, e.g., [{'text': ...}, ...],
        # pre-sorted from MOST relevant [0] to LEAST relevant [-1].

        best_chunk = context_chunks[0]  # The most relevant chunk
        other_chunks = context_chunks[1:] # All other (less relevant) chunks

        # Reverse the 'other' chunks so the absolute least relevant is first
        other_chunks.reverse() 

        context_parts = []
        
        # 1. Add the less relevant chunks first
        for i, chunk in enumerate(other_chunks):
            # Using XML-style tags for better separation
            context_parts.append(f"<HANDBOOK_SECTION_{i+1}>\n{chunk['text']}\n</HANDBOOK_SECTION_{i+1}>")

        # 2. Add the MOST relevant chunk at the very end of the context
        context_parts.append(f"<HANDBOOK_SECTION_MOST_RELEVANT>\n{best_chunk['text']}\n</HANDBOOK_SECTION_MOST_RELEVANT>")
        
        combined_context = "\n\n".join(context_parts)
        # --- End of Modification 1 ---

    # Create optimized prompt (your template is already excellent)
    prompt = f"""<s>[INST] You are Amang Bot (Ambot) a university student advisor. Answer the student's question accurately using only the provided sections below.

HANDBOOK SECTIONS:
{combined_context}

STUDENT QUESTION: {query}

Provide a clear, helpful answer based on the information above. If the handbook doesn't contain enough information, say so. Be specific about policies, procedures, and requirements. [/INST]"""


    untruncated_inputs = tokenizer(prompt)
    original_length = len(untruncated_inputs['input_ids'])
    # --- End Debug ---

    # Tokenize and generate (Your original code)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3072)
    
    # --- MODIFICATION 2: Store input length for better decoding ---
    input_length = inputs['input_ids'].shape[1] 
    
    # --- DEBUG: Print the comparison ---
    print(f"--- TRUNCATION REPORT ---")
    print(f"Original token length: {original_length}")
    print(f"Truncated token length: {input_length} (Max: 3072)")
    if original_length > input_length:
        print("WARNING: The prompt was truncated.")
    else:
        print("INFO: The prompt was not truncated.")
    print("-------------------------\n")
    
    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3072)
    
    # --- MODIFICATION 2: Store input length for better decoding ---
    input_length = inputs['input_ids'].shape[1] 
    # --- End of Modification 2 ---
    
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.2,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # --- MODIFICATION 3: More Robust Decoding ---
    # Decode only the *newly generated tokens* by slicing the output tensor
    # This avoids any string matching for "[/INST]"
    new_tokens = outputs[0][input_length:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    # --- End of Modification 3 ---

    print(answer)
    return answer

# --- EXAMPLE USAGE ---
# (Assuming you have your retrieval logic first)

# 1. Your retrieval system gets the chunks, sorted by relevance (best first)
# retrieved_chunks = [
#     {'text': "The policy for late submission is a 10% penalty per day. (This is the best chunk)"},
#     {'text': "The add/drop period is in the first week. (Less relevant)"},
#     {'text': "General academic regulations. (Least relevant)"}
# ]

# 2. Pass the query and the sorted chunks to the function
# student_query = "What happens if I submit my assignment late?"
# answer = generate_answer(student_query, retrieved_chunks)
# print(answer)

def ask_handbook(question, top_k=5, show_sources=True):
    """Complete RAG pipeline for handbook queries"""
    print(f"\n‚ùì Question: {question}")
    print("=" * 70)

    # Retrieve relevant sections
    relevant_chunks = retrieve_relevant_chunks(question, top_k)

    if not relevant_chunks:
        print("‚ùå No relevant information found in handbook")
        return None

    if show_sources:
        print("üìö Found relevant handbook sections:")
        for i, chunk in enumerate(relevant_chunks):
            print(f"\nüìÑ Section {i+1} (Relevance: {chunk['score']:.3f})")
            preview = chunk['text'][:200].replace('\n', ' ')
            print(f"   {preview}...")

    # Generate answer
    print("\nü§î Generating answer...")
    answer = generate_answer(question, relevant_chunks)

    print(f"\nüí° Answer:")
    print(answer)
    print("\n" + "=" * 70)

    return {
        'question': question,
        'answer': answer,
        'sources': relevant_chunks,
        'num_sources': len(relevant_chunks)
    }

# Test the system
print("\n\nüöÄ RAG System Ready!")
print("Testing with university-specific questions...")

# Sample test questions for university handbook
test_questions = [
    "What are the graduation requirements?",
    "How do I withdraw from a course?",
    "What is the academic probation policy?",
    "What happens if I'm caught cheating?",
    "How do I change my major?"
]

# Run a test query
# test_result = ask_handbook(test_questions[0])

# Interactive query function
def interactive_mode():
    """Interactive mode for asking questions"""
    print("\nüéì Interactive University Handbook Assistant")
    print("Type 'quit' to exit")
    print("-" * 50)

    while True:
        question = input("\n‚ùì Your question: ").strip()

        if question.lower() in ['quit', 'exit', 'q']:
            print("üëã Goodbye!")
            break

        if not question:
            print("Please enter a question!")
            continue

        try:
            ask_handbook(question)
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")

# Uncomment to start interactive mode
# interactive_mode()

print("\nüìã System Summary:")
print(f"   üìö Processed: {len(chunks):,} semantic chunks")
print(f"   üîç Embeddings: {len(all_embeddings):,} vectors")
print(f"   ü§ñ Model: {model_name}")
print(f"   üíæ Ready for queries!")



üöÄ RAG System Ready!
Testing with university-specific questions...

üìã System Summary:
   üìö Processed: 246 semantic chunks
   üîç Embeddings: 246 vectors
   ü§ñ Model: mistralai/Mistral-7B-Instruct-v0.1
   üíæ Ready for queries!


In [12]:
interactive_mode()


üéì Interactive University Handbook Assistant
Type 'quit' to exit
--------------------------------------------------

‚ùì Question: I was sick and missed my midterms. What should I do to get an excused absence?
üìö Found relevant handbook sections:

üìÑ Section 1 (Relevance: 0.455)
   2009)   ## SECTION 5. ATTENDANCE   5.1 Students are required to attend all classes starting with the first meeting of every class. 5.2 Non-attendance in any required class or academic activity constit...

üìÑ Section 2 (Relevance: 0.449)
   For Re-Admission of students who would like to continue their program after taking the leave of absence, the following are needed:   1. Copy of Approved Leave of Absence; 2. Evaluation Record Form fro...

üìÑ Section 3 (Relevance: 0.415)
   the first 20 minutes for a two-hour class; - 5.1.3. the first 15 minutes for a one-hour-and-a-half class; and - 5.1.4. the first 10 minutes for a one-hour class. 5.4 A student is considered late or ta...

üìÑ Section 4 (Relev

# Testing Workflow with Test Dataset

This section will run all questions from Test.json through the RAG pipeline and save results to CSV.

In [12]:
import json
import pandas as pd
from datetime import datetime
import time

# Load test questions
with open('../Test/Test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"üìã Loaded {len(test_data)} test questions")
print(f"üìä Categories: {set(q['category'] for q in test_data)}")
print(f"üìä Difficulty levels: {set(q['difficulty'] for q in test_data)}")

üìã Loaded 100 test questions
üìä Categories: {'scenario', 'direct', 'unanswerable', 'adversarial', 'paraphrased'}
üìä Difficulty levels: {'hard', 'easy', 'medium'}


In [13]:
def test_rag_pipeline(test_data, top_k=5, output_file='rag_test_results.csv'):
    """
    Run RAG pipeline on all test questions and save results to CSV
    
    Args:
        test_data: List of test questions with expected answers
        top_k: Number of chunks to retrieve
        output_file: Output CSV filename
    """
    results = []
    
    print(f"\nüöÄ Starting RAG Pipeline Test")
    print(f"üìù Testing {len(test_data)} questions...")
    print("=" * 80)
    
    for i, test_item in enumerate(test_data):
        question = test_item['question']
        expected_answer = test_item['answer']
        difficulty = test_item['difficulty']
        category = test_item['category']
        
        print(f"\n[{i+1}/{len(test_data)}] Processing: {question[:60]}...")
        
        try:
            # Start timing
            start_time = time.time()
            
            # Retrieve relevant chunks
            relevant_chunks = retrieve_relevant_chunks(question, top_k)
            
            # Generate answer
            if relevant_chunks:
                generated_answer = generate_answer(question, relevant_chunks, max_new_tokens=350)
                
                # Calculate average relevance score
                avg_score = sum(chunk['score'] for chunk in relevant_chunks) / len(relevant_chunks)
                top_score = relevant_chunks[0]['score'] if relevant_chunks else 0
                
                # Get chunk IDs
                chunk_ids = [chunk['chunk_id'] for chunk in relevant_chunks]
            else:
                generated_answer = "NO RELEVANT CHUNKS FOUND"
                avg_score = 0
                top_score = 0
                chunk_ids = []
            
            # End timing
            processing_time = time.time() - start_time
            
            # Store result
            result = {
                'question_number': i + 1,
                'question': question,
                'expected_answer': expected_answer,
                'generated_answer': generated_answer,
                'difficulty': difficulty,
                'category': category,
                'processing_time_sec': round(processing_time, 2),
                'top_relevance_score': round(top_score, 4),
                'avg_relevance_score': round(avg_score, 4),
                'num_chunks_retrieved': len(relevant_chunks),
                'chunk_ids': str(chunk_ids)
            }
            
            results.append(result)
            print(f"‚úÖ Completed in {processing_time:.2f}s (Relevance: {top_score:.3f})")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            result = {
                'question_number': i + 1,
                'question': question,
                'expected_answer': expected_answer,
                'generated_answer': f"ERROR: {str(e)}",
                'difficulty': difficulty,
                'category': category,
                'processing_time_sec': 0,
                'top_relevance_score': 0,
                'avg_relevance_score': 0,
                'num_chunks_retrieved': 0,
                'chunk_ids': '[]'
            }
            results.append(result)
    
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Save to CSV
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"rag_test_results_{timestamp}.csv"
    df.to_csv(output_filename, index=False, encoding='utf-8-sig')
    
    print("\n" + "=" * 80)
    print(f"‚úÖ Testing Complete!")
    print(f"üìä Total questions: {len(results)}")
    print(f"üíæ Results saved to: {output_filename}")
    print(f"‚è±Ô∏è  Total time: {df['processing_time_sec'].sum():.2f}s")
    print(f"üìà Average processing time: {df['processing_time_sec'].mean():.2f}s per question")
    
    # Summary statistics
    print("\nüìä Summary by Category:")
    category_summary = df.groupby('category').agg({
        'question': 'count',
        'processing_time_sec': 'mean',
        'top_relevance_score': 'mean'
    }).round(2)
    print(category_summary)
    
    print("\nüìä Summary by Difficulty:")
    difficulty_summary = df.groupby('difficulty').agg({
        'question': 'count',
        'processing_time_sec': 'mean',
        'top_relevance_score': 'mean'
    }).round(2)
    print(difficulty_summary)
    
    return df, output_filename

In [14]:
# Run the test pipeline
results_df, output_file = test_rag_pipeline(test_data, top_k=5)

print(f"\nüìÑ Preview of results:")
print(results_df[['question_number', 'category', 'difficulty', 'top_relevance_score', 'processing_time_sec']].head(10))


üöÄ Starting RAG Pipeline Test
üìù Testing 100 questions...

[1/100] Processing: What documents do I need to submit to apply as a freshman?...
‚úÖ Completed in 14.51s (Relevance: 0.536)

[2/100] Processing: I'm a transferee from another uni, what papers do I need?...
‚úÖ Completed in 10.50s (Relevance: 0.510)

[3/100] Processing: My EARISTCAT score for BS Civil Engineering was 82%. Did I g...
‚úÖ Completed in 10.38s (Relevance: 0.467)

[4/100] Processing: Admission is first-come-first-served, right? So my EARISTCAT...
‚úÖ Completed in 79.45s (Relevance: 0.600)

[5/100] Processing: What's the deadline for freshman applications for the next s...
‚úÖ Completed in 6.47s (Relevance: 0.489)

[6/100] Processing: I'm applying for BS Architecture. Is there any other test be...
‚úÖ Completed in 11.40s (Relevance: 0.565)

[7/100] Processing: What's the maximum number of units I can enroll in per semes...
‚úÖ Completed in 15.30s (Relevance: 0.629)

[8/100] Processing: I'm a regular student, can

## Optional: View Individual Results

You can examine specific questions and their generated answers:

In [None]:
# View a specific result
question_num = 3  # Change this to view different questions

result = results_df.iloc[question_num - 1]

print(f"Question #{result['question_number']}")
print(f"Category: {result['category']} | Difficulty: {result['difficulty']}")
print(f"Relevance Score: {result['top_relevance_score']}")
print("\n" + "=" * 80)
print(f"\n‚ùì QUESTION:\n{result['question']}")
print("\n" + "=" * 80)
print(f"\nüìñ EXPECTED ANSWER:\n{result['expected_answer']}")
print("\n" + "=" * 80)
print(f"\nü§ñ GENERATED ANSWER:\n{result['generated_answer']}")
print("\n" + "=" * 80)