In [1]:
import json
from types import SimpleNamespace

# Read the pre-processed handbook chunks from disk
with open('Dataset/Chunks.json', 'r', encoding='utf-8') as chunk_file:
    chunks_data = json.load(chunk_file)

# Wrap each record in a SimpleNamespace exposing `page_content` and `metadata`,
# the two attributes the rest of the notebook reads (LangChain Document-like).
chunks = [
    SimpleNamespace(
        page_content=record['content'],
        # The record's own metadata is unpacked last, so its keys win on collision
        metadata={'chunk_id': record['chunk_id'], **record['metadata']},
    )
    for record in chunks_data
]

print(f"üìö Loaded {len(chunks)} pre-processed chunks from 'Chunks.json'")

# Show the first chunk as a quick sanity check of the loaded structure
if chunks:
    first_chunk = chunks[0]
    # Flatten newlines so the 150-character preview stays on one line
    preview = first_chunk.page_content[:150].replace('\n', ' ')
    print("\nüìã Sample Chunk:")
    print(f"   Content: {preview}...")
    print(f"   Metadata: {first_chunk.metadata}")

üìö Loaded 125 pre-processed chunks from 'Chunks.json'

üìã Sample Chunk:
   Content: HISTORY OF EARIST  The Eulogio "Amang" Rodriguez Institute of Science and Technology (EARIST) was established after the liberation of Manila in 1945. ...
   Metadata: {'chunk_id': 'handbook_chunk_001', 'source_document': 'Student Handbook', 'section_hierarchy': 'HISTORY OF EARIST'}


In [2]:
import torch
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings('ignore')

In [3]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login: prompts for an access token inside the notebook.
# NOTE(review): presumably required so the model checkpoints loaded further down
# can be downloaded — confirm which checkpoint actually needs authentication.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [None]:


print("\nüîß Initializing embedding model...")
embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",  # Nomic embedding model
    model_kwargs={
        'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        'trust_remote_code': True
    },
    encode_kwargs={'normalize_embeddings': True}
)
print("‚úÖ Embedding model loaded: nomic-embed-text-v1.5 (768 dimensions)")


üîß Initializing embedding model...


<All keys matched successfully>


‚úÖ Embedding model loaded: nomic-embed-text-v1.5 (768 dimensions)


In [4]:
print("\nüîÑ Generating embeddings for all chunks...")
chunk_texts = [chunk.page_content for chunk in chunks]

# Process embeddings in batches to avoid memory issues
batch_size = 32
all_embeddings = []
for i in range(0, len(chunk_texts), batch_size):
    batch = chunk_texts[i:i+batch_size]
    batch_embeddings = embedding_model.embed_documents(batch)
    all_embeddings.extend(batch_embeddings)
    print(f"   Processed batch {i//batch_size + 1}/{(len(chunk_texts) + batch_size - 1)//batch_size}")

print(f"‚úÖ Generated {len(all_embeddings)} embeddings")




üîÑ Generating embeddings for all chunks...
   Processed batch 1/4
   Processed batch 1/4
   Processed batch 2/4
   Processed batch 2/4
   Processed batch 3/4
   Processed batch 3/4
   Processed batch 4/4
‚úÖ Generated 125 embeddings
   Processed batch 4/4
‚úÖ Generated 125 embeddings


In [5]:

# 5. Build FAISS vector store for fast similarity search
print("\nüóÑÔ∏è Building FAISS vector database...")

# One float32 row per chunk, L2-normalised in place so that the inner product
# computed by the index equals cosine similarity
embeddings_array = np.asarray(all_embeddings, dtype='float32')
faiss.normalize_L2(embeddings_array)

# Exact (brute-force) inner-product index sized to the embedding width
dimension = len(all_embeddings[0])
index = faiss.IndexFlatIP(dimension)
index.add(embeddings_array)

print(f"‚úÖ FAISS index ready: {index.ntotal:,} vectors ({dimension} dimensions)")



üóÑÔ∏è Building FAISS vector database...
‚úÖ FAISS index ready: 125 vectors (768 dimensions)


In [6]:
# 6. Load high-quality language model for generation
print("\nü§ñ Loading language model...")

# 4-bit quantization settings, passed to the model loader in a later cell.
# NOTE(review): the original comment targets a T4 GPU; as far as I know that card
# has no native bfloat16 support — confirm whether torch.float16 is the intended
# compute dtype here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4 bits
    bnb_4bit_quant_type="nf4",              # NF4 quantization data type
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the matmuls
)



ü§ñ Loading language model...


In [7]:
from huggingface_hub import notebook_login

# Second Hugging Face login prompt; the notebook already calls notebook_login()
# in an earlier cell.
# NOTE(review): this repeat looks redundant once the first login has succeeded —
# confirm before removing the cell.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [8]:
# Use Mistral 7B for quality (perfect for T4)
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token when the tokenizer does not define one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# `dtype` replaces the `torch_dtype` keyword: the installed transformers release
# reports `torch_dtype` as deprecated when this cell runs.
# NOTE(review): trust_remote_code=True allows repository-supplied Python to run;
# it is probably unnecessary for a natively supported architecture — confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.bfloat16,
    trust_remote_code=True
)

print(f"‚úÖ Model loaded: {model_name}")
# Reports the total memory of CUDA device 0, not the amount currently in use
print(f"üéØ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Model loaded: mistralai/Mistral-7B-Instruct-v0.1
üéØ GPU Memory: 8.5GB


In [9]:
def retrieve_relevant_chunks(query, top_k=5):
    """Find the chunks most similar to `query`.

    The query is embedded with the module-level `embedding_model`, L2-normalised
    and searched against the module-level FAISS `index`; hits are mapped back to
    the module-level `chunks` list by position.

    Args:
        query: Question text to embed.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with the keys 'text', 'score', 'chunk_id' and
        'start_pos', in the order the index returned them. The list is shorter
        than `top_k` when the index holds fewer vectors, and empty when
        `top_k` is not positive.
    """
    if top_k <= 0:
        return []

    # Embed the query and normalise it the same way the indexed vectors were
    query_embedding = embedding_model.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    faiss.normalize_L2(query_vector)

    # Search FAISS index
    scores, indices = index.search(query_vector, top_k)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        idx = int(idx)
        # FAISS pads missing neighbours with -1. A plain `idx < len(chunks)` test
        # lets -1 through and Python would then silently return the LAST chunk.
        if not 0 <= idx < len(chunks):
            continue

        chunk = chunks[idx]
        metadata = getattr(chunk, 'metadata', None) or {}
        results.append({
            'text': chunk.page_content,
            'score': float(score),
            'chunk_id': metadata.get('chunk_id', idx),
            'start_pos': metadata.get('start_index', 0)
        })

    return results

def _build_handbook_prompt(query, context_chunks):
    """Render the instruction prompt for `query` from `context_chunks` (best chunk first)."""
    if context_chunks:
        best_chunk = context_chunks[0]
        # Lower-ranked chunks are listed first, least relevant on top, so the
        # best chunk ends up closest to the question.
        context_parts = [
            f"<HANDBOOK_SECTION_{i}>\n{chunk['text']}\n</HANDBOOK_SECTION_{i}>"
            for i, chunk in enumerate(reversed(context_chunks[1:]), start=1)
        ]
        context_parts.append(
            f"<HANDBOOK_SECTION_MOST_RELEVANT>\n{best_chunk['text']}\n</HANDBOOK_SECTION_MOST_RELEVANT>"
        )
        combined_context = "\n\n".join(context_parts)
    else:
        combined_context = "No information found in the handbook."

    # NOTE(review): the template spells out `<s>` itself while the tokenizer is
    # called with its default special-token handling, so a second BOS token may
    # be prepended — confirm against the tokenizer configuration.
    return f"""
<s>[INST]
You are **Amang Bot (Ambot)**, a precise and helpful university advisor. 
You must answer the user's question **using only the information provided in the INFORMATION SECTIONS**. 

### Your Answer Must Follow These Rules:
1. **Start with a direct answer** to the user's question.
2. After the direct answer, **provide additional relevant details**, background, or context based strictly on the provided information.
3. If the information sections **do not include the answer**, explicitly say:
   "The provided information does not contain the answer to this question."
4. Do NOT invent or assume facts that are not in the information sections.
5. Maintain a clear, formal, and student-friendly tone.

---

### INFORMATION SECTIONS:
{combined_context}

---

### USER QUESTION:
{query}

Provide your answer now, following the rules above.
[/INST]
"""


def generate_answer(query, context_chunks, max_new_tokens=350, max_total_tokens=4096):
    """Generate an answer to `query` grounded in `context_chunks`.

    Uses the module-level `tokenizer` and `model`. The prompt is shrunk until it
    fits the input budget (`max_total_tokens - max_new_tokens - 32`): the
    lowest-ranked chunks are dropped first, then the text of the last remaining
    chunk is trimmed, and only as a last resort is the prompt hard-truncated.

    Args:
        query: The user's question.
        context_chunks: Retrieved chunks, most relevant first; each is a dict
            with at least the keys 'text' and 'score'. May be empty.
        max_new_tokens: Maximum number of tokens to generate.
        max_total_tokens: Total token window shared by prompt and answer.

    Returns:
        The decoded, stripped text of the newly generated tokens.
    """
    max_input_tokens = max_total_tokens - max_new_tokens - 32  # 32 token safety buffer

    working_chunks = list(context_chunks)  # Copy so the caller's list is never modified
    chunk_trimmed = False

    while True:
        # Building the prompt on every pass (including with zero chunks) guarantees
        # `prompt` is always defined by the time the loop exits.
        prompt = _build_handbook_prompt(query, working_chunks)
        input_length = tokenizer(prompt, return_tensors="pt", truncation=False)['input_ids'].shape[1]

        if input_length <= max_input_tokens:
            if working_chunks:
                print(f"‚úÖ Prompt fits token budget with {len(working_chunks)} chunks.")
            break

        if len(working_chunks) > 1:
            # Too long: drop the least relevant chunk (last in the list) and retry
            removed_chunk = working_chunks.pop()
            print(f"‚ö†Ô∏è Prompt too long ({input_length} > {max_input_tokens}). Removing least relevant chunk (Score: {removed_chunk['score']:.3f})...")
            continue

        if working_chunks and not chunk_trimmed:
            print("‚ö†Ô∏è Prompt still too long, but only one chunk remains. Truncation will occur.")
            # Trim the chunk text itself: truncating the whole prompt from the
            # right would cut off the user question and the closing [/INST].
            overflow = input_length - max_input_tokens
            chunk_token_ids = tokenizer(working_chunks[0]['text'], add_special_tokens=False)['input_ids']
            keep = len(chunk_token_ids) - overflow - 8  # small margin for re-tokenization drift
            if keep > 0:
                trimmed_text = tokenizer.decode(chunk_token_ids[:keep], skip_special_tokens=True)
                # Replace with a new dict so the caller's chunk is left untouched
                working_chunks[0] = {**working_chunks[0], 'text': trimmed_text}
                chunk_trimmed = True
                continue

        # Nothing left to shrink; the hard truncation below is the last resort
        break

    # Tokenize with final truncation (as a safeguard)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens)
    input_length = inputs['input_ids'].shape[1]

    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.2,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt)
    new_tokens = outputs[0][input_length:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    print(f"\nüìù Generated answer ({len(new_tokens)} tokens):\n{answer}\n")
    return answer

def ask_handbook(question, top_k=5, show_sources=True):
    """Run one question through retrieval and generation.

    Args:
        question: The question to answer.
        top_k: Number of chunks requested from `retrieve_relevant_chunks`.
        show_sources: When true, print a 200-character preview of every
            retrieved chunk together with its relevance score.

    Returns:
        A dict with the keys 'question', 'answer', 'sources' and
        'num_sources', or None when retrieval returns nothing.
    """
    divider = "=" * 70
    print(f"\n‚ùì Question: {question}")
    print(divider)

    # Step 1: retrieval
    sources = retrieve_relevant_chunks(question, top_k)
    if not sources:
        print("‚ùå No relevant information found in handbook")
        return None

    # Step 2: optional source listing
    if show_sources:
        print("üìö Found relevant handbook sections:")
        for section_number, source in enumerate(sources, start=1):
            snippet = source['text'][:200].replace('\n', ' ')
            print(f"\nüìÑ Section {section_number} (Relevance: {source['score']:.3f})")
            print(f"   {snippet}...")

    # Step 3: generation
    print("\nü§î Generating answer...")
    answer = generate_answer(question, sources)

    print(f"\nüí° Answer:")
    print(answer)
    print("\n" + divider)

    response = {'question': question, 'answer': answer}
    response['sources'] = sources
    response['num_sources'] = len(sources)
    return response

# Announce that every pipeline function is now defined
print(
    "\n\nüöÄ RAG System Ready!",
    "Testing with university-specific questions...",
    sep="\n",
)

# Sample questions a student might ask about the university handbook
test_questions = [
    "What are the graduation requirements?",
    "How do I withdraw from a course?",
    "What is the academic probation policy?",
    "What happens if I'm caught cheating?",
    "How do I change my major?",
]

# To try a single query, run for example:
# test_result = ask_handbook(test_questions[0])

# Interactive query function
def interactive_mode():
    """Read questions from standard input and answer each with `ask_handbook`.

    The loop ends when the user types 'quit', 'exit' or 'q' (any letter case).
    Blank input is rejected with a reminder, and an exception raised while
    answering is printed instead of ending the session.
    """
    print("\nüéì Interactive University Handbook Assistant")
    print("Type 'quit' to exit")
    print("-" * 50)

    exit_words = ('quit', 'exit', 'q')

    while True:
        question = input("\n‚ùì Your question: ").strip()

        if question.lower() in exit_words:
            print("üëã Goodbye!")
            return

        if question == "":
            print("Please enter a question!")
            continue

        # Best effort: report the failure and keep the session alive
        try:
            ask_handbook(question)
        except Exception as err:
            print(f"‚ùå Error: {err!s}")

# Uncomment to start interactive mode
# interactive_mode()

print("\nüìã System Summary:")
print(f"   üìö Processed: {len(chunks):,} semantic chunks")
print(f"   üîç Embeddings: {len(all_embeddings):,} vectors")
print(f"   ü§ñ Model: {model_name}")
print(f"   üíæ Ready for queries!")



üöÄ RAG System Ready!
Testing with university-specific questions...

üìã System Summary:
   üìö Processed: 125 semantic chunks
   üîç Embeddings: 125 vectors
   ü§ñ Model: mistralai/Mistral-7B-Instruct-v0.1
   üíæ Ready for queries!


In [15]:
# Start the interactive question loop; type 'quit', 'exit' or 'q' to leave it.
interactive_mode()


üéì Interactive University Handbook Assistant
Type 'quit' to exit
--------------------------------------------------

‚ùì Question: Where is the present site of EARIST?
üìö Found relevant handbook sections:

üìÑ Section 1 (Relevance: 0.694)
   HISTORY OF EARIST  The Eulogio "Amang" Rodriguez Institute of Science and Technology (EARIST) was established after the liberation of Manila in 1945. EARIST traces back its development from Vocational...

üìÑ Section 2 (Relevance: 0.625)
   EARIST Hymn  Let the EARIST music fill the air With the echo our spirit rise There's a magic that will stir the hearts When we sing our lively tuneful march What a joy to sing the music gay When it so...

üìÑ Section 3 (Relevance: 0.578)
   - Awarded LEVEL II STATUS in the Fourteen (14) Degrees Program (Accrediting Agency of Chartered Colleges and Universities in the Philippines) - Rated SUC LEVEL II (CHED-DBM-PASUC Leveling Evaluation) ...

üìÑ Section 4 (Relevance: 0.568)
   Through the EARIST Cultura

# Testing Workflow with Test Dataset

This section runs every question from `Test.json` through the RAG pipeline and saves the results to a timestamped CSV file.

In [11]:
import json
import pandas as pd
from datetime import datetime
import time

# Read the evaluation questions from disk
with open('../Test/Test.json', 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Distinct labels present in the test set, for a quick overview
test_categories = {entry['category'] for entry in test_data}
test_difficulties = {entry['difficulty'] for entry in test_data}

print(f"üìã Loaded {len(test_data)} test questions")
print(f"üìä Categories: {test_categories}")
print(f"üìä Difficulty levels: {test_difficulties}")

üìã Loaded 100 test questions
üìä Categories: {'unanswerable', 'direct', 'paraphrased', 'adversarial', 'scenario'}
üìä Difficulty levels: {'easy', 'hard', 'medium'}


In [12]:
def test_rag_pipeline(test_data, top_k=5, output_file='rag_test_results.csv'):
    """
    Run RAG pipeline on all test questions and save results to CSV
    
    Args:
        test_data: List of test questions with expected answers
        top_k: Number of chunks to retrieve
        output_file: Output CSV filename
    """
    results = []
    
    print(f"\nüöÄ Starting RAG Pipeline Test")
    print(f"üìù Testing {len(test_data)} questions...")
    print("=" * 80)
    
    for i, test_item in enumerate(test_data):
        question = test_item['question']
        expected_answer = test_item['answer']
        difficulty = test_item['difficulty']
        category = test_item['category']
        
        print(f"\n[{i+1}/{len(test_data)}] Processing: {question[:60]}...")
        
        try:
            # Start timing
            start_time = time.time()
            
            # Retrieve relevant chunks
            relevant_chunks = retrieve_relevant_chunks(question, top_k)
            
            # Generate answer with updated token limit
            if relevant_chunks:
                generated_answer = generate_answer(question, relevant_chunks, max_new_tokens=512)
                
                # Calculate average relevance score
                avg_score = sum(chunk['score'] for chunk in relevant_chunks) / len(relevant_chunks)
                top_score = relevant_chunks[0]['score'] if relevant_chunks else 0
                
                # Get chunk IDs
                chunk_ids = [chunk['chunk_id'] for chunk in relevant_chunks]
            else:
                generated_answer = "NO RELEVANT CHUNKS FOUND"
                avg_score = 0
                top_score = 0
                chunk_ids = []
            
            # End timing
            processing_time = time.time() - start_time
            
            # Store result
            result = {
                'question_number': i + 1,
                'question': question,
                'expected_answer': expected_answer,
                'generated_answer': generated_answer,
                'difficulty': difficulty,
                'category': category,
                'processing_time_sec': round(processing_time, 2),
                'top_relevance_score': round(top_score, 4),
                'avg_relevance_score': round(avg_score, 4),
                'num_chunks_retrieved': len(relevant_chunks),
                'chunk_ids': str(chunk_ids)
            }
            
            results.append(result)
            print(f"‚úÖ Completed in {processing_time:.2f}s (Relevance: {top_score:.3f})")
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            result = {
                'question_number': i + 1,
                'question': question,
                'expected_answer': expected_answer,
                'generated_answer': f"ERROR: {str(e)}",
                'difficulty': difficulty,
                'category': category,
                'processing_time_sec': 0,
                'top_relevance_score': 0,
                'avg_relevance_score': 0,
                'num_chunks_retrieved': 0,
                'chunk_ids': '[]'
            }
            results.append(result)
    
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Save to CSV
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"rag_test_results_{timestamp}.csv"
    df.to_csv(output_filename, index=False, encoding='utf-8-sig')
    
    print("\n" + "=" * 80)
    print(f"‚úÖ Testing Complete!")
    print(f"üìä Total questions: {len(results)}")
    print(f"üíæ Results saved to: {output_filename}")
    print(f"‚è±Ô∏è  Total time: {df['processing_time_sec'].sum():.2f}s")
    print(f"üìà Average processing time: {df['processing_time_sec'].mean():.2f}s per question")
    
    # Summary statistics
    print("\nüìä Summary by Category:")
    category_summary = df.groupby('category').agg({
        'question': 'count',
        'processing_time_sec': 'mean',
        'top_relevance_score': 'mean'
    }).round(2)
    print(category_summary)
    
    print("\nüìä Summary by Difficulty:")
    difficulty_summary = df.groupby('difficulty').agg({
        'question': 'count',
        'processing_time_sec': 'mean',
        'top_relevance_score': 'mean'
    }).round(2)
    print(difficulty_summary)
    
    return df, output_filename

In [13]:
# Evaluate every loaded test question; keeps the result frame and the CSV path
results_df, output_file = test_rag_pipeline(test_data, top_k=5)

# Columns worth a glance in the first ten rows
preview_columns = [
    'question_number',
    'category',
    'difficulty',
    'top_relevance_score',
    'processing_time_sec',
]

print(f"\nüìÑ Preview of results:")
print(results_df[preview_columns].head(10))


üöÄ Starting RAG Pipeline Test
üìù Testing 100 questions...

[1/100] Processing: What documents do I need to submit to apply as a freshman?...
‚úÖ Prompt fits token budget with 5 chunks.

üìù Generated answer (98 tokens):
As a freshman, you need to submit the following documents to apply to EARIST:

1. Form 138 (Senior and High School Report Card)
2. Certificate of Good Moral Character with school seal
3. Birth Certificate (PSA Authenticated)
4. EARISTCAT Result
5. Satisfactory Result of Dean's Assessment and Evaluation
6. Medical and Health Examination

‚úÖ Completed in 10.19s (Relevance: 0.641)

[2/100] Processing: I'm a transferee from another uni, what papers do I need?...
‚úÖ Prompt fits token budget with 5 chunks.

üìù Generated answer (98 tokens):
As a freshman, you need to submit the following documents to apply to EARIST:

1. Form 138 (Senior and High School Report Card)
2. Certificate of Good Moral Character with school seal
3. Birth Certificate (PSA Authenticated)
4. EA

## Optional: View Individual Results

You can examine specific questions and their generated answers:

In [None]:
# Pick one evaluated question by its 1-based number
question_num = 3  # Change this to view different questions

# Rows are stored in question order, so row N-1 holds question N
result = results_df.iloc[question_num - 1]
divider = "\n" + "=" * 80

print(f"Question #{result['question_number']}")
print(f"Category: {result['category']} | Difficulty: {result['difficulty']}")
print(f"Relevance Score: {result['top_relevance_score']}")
print(divider)
print(f"\n‚ùì QUESTION:\n{result['question']}")
print(divider)
print(f"\nüìñ EXPECTED ANSWER:\n{result['expected_answer']}")
print(divider)
print(f"\nü§ñ GENERATED ANSWER:\n{result['generated_answer']}")
print(divider)
print("\n" + "=" * 80)