In [1]:
import json

# Read the pre-chunked handbook text from disk into `chunks_data`.
with open('Dataset/Chunks.json', mode='r', encoding='utf-8') as chunks_file:
    chunks_data = json.loads(chunks_file.read())

print(f"üìö Loaded {len(chunks_data)} chunks from Chunks.json")

# Use the first entry as a quick sanity check of the chunk schema.
first_chunk = chunks_data[0]
print(f"üìã Sample chunk keys: {list(first_chunk.keys())}")
print(f"\nüìÑ First chunk preview:")
print(f"   ID: {first_chunk['chunk_id']}")
print(f"   Content: {first_chunk['content'][:150]}...")

üìö Loaded 125 chunks from Chunks.json
üìã Sample chunk keys: ['chunk_id', 'content', 'metadata']

üìÑ First chunk preview:
   ID: handbook_chunk_001
   Content: HISTORY OF EARIST

The Eulogio "Amang" Rodriguez Institute of Science and Technology (EARIST) was established after the liberation of Manila in 1945. ...


In [2]:
import torch
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
# Suppress every Python warning so the cell outputs stay readable.
# NOTE(review): this blanket filter also hides deprecation and runtime
# warnings from the libraries imported above — consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
print("\nüîß Initializing embedding model...")

# Run the sentence encoder on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    embedding_device = 'cuda'
else:
    embedding_device = 'cpu'

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    # NOTE(review): trust_remote_code=True lets the hub repository run custom
    # code at load time; confirm this model actually needs it.
    model_kwargs=dict(device=embedding_device, trust_remote_code=True),
    # Ask the encoder for normalized vectors (used with inner-product search later).
    encode_kwargs=dict(normalize_embeddings=True),
)
print("‚úÖ Embedding model loaded: all-mpnet-base-v2 (768 dimensions)")


üîß Initializing embedding model...
‚úÖ Embedding model loaded: all-mpnet-base-v2 (768 dimensions)
‚úÖ Embedding model loaded: all-mpnet-base-v2 (768 dimensions)


In [4]:
# The handbook arrives already chunked, so no semantic splitting happens here.
print("\n‚úÖ Using pre-chunked data from Chunks.json")
print(f"   Total chunks: {len(chunks_data)}")

# Character length of every chunk, used for the summary statistics below.
chunk_sizes = [len(entry['content']) for entry in chunks_data]
print(f"\nüìä Chunk Analysis:")

average_size = sum(chunk_sizes) / len(chunk_sizes)
print(f"   Average size: {average_size:.0f} characters")

smallest_size = min(chunk_sizes)
largest_size = max(chunk_sizes)
print(f"   Size range: {smallest_size} - {largest_size} characters")


‚úÖ Using pre-chunked data from Chunks.json
   Total chunks: 125

üìä Chunk Analysis:
   Average size: 1336 characters
   Size range: 213 - 3785 characters


In [5]:
print("\nüîÑ Generating embeddings for all chunks from Chunks.json...")

# Prefix every chunk with its source document and section so that the
# metadata becomes part of the text that gets embedded.
chunk_texts = []
for chunk in chunks_data:
    metadata = chunk.get('metadata', {})
    section = metadata.get('section_hierarchy', 'N/A')
    source = metadata.get('source_document', 'Student Handbook')
    chunk_texts.append(f"[Source: {source}] [Section: {section}]\n{chunk['content']}")

# Embed in fixed-size batches to keep memory usage bounded.
batch_size = 32
total_batches = (len(chunk_texts) + batch_size - 1) // batch_size
all_embeddings = []
for batch_number in range(1, total_batches + 1):
    batch_start = (batch_number - 1) * batch_size
    batch = chunk_texts[batch_start:batch_start + batch_size]
    all_embeddings += embedding_model.embed_documents(batch)
    print(f"   Processed batch {batch_number}/{total_batches}")

print(f"‚úÖ Generated {len(all_embeddings)} embeddings with embedded metadata")
print(f"üìã Sample enriched text (first 200 chars):\n   {chunk_texts[0][:200]}...")


üîÑ Generating embeddings for all chunks from Chunks.json...
   Processed batch 1/4
   Processed batch 1/4
   Processed batch 2/4
   Processed batch 2/4
   Processed batch 3/4
   Processed batch 3/4
   Processed batch 4/4
‚úÖ Generated 125 embeddings with embedded metadata
üìã Sample enriched text (first 200 chars):
   [Source: Student Handbook] [Section: HISTORY OF EARIST]
HISTORY OF EARIST

The Eulogio "Amang" Rodriguez Institute of Science and Technology (EARIST) was established after the liberation of Manila in ...
   Processed batch 4/4
‚úÖ Generated 125 embeddings with embedded metadata
üìã Sample enriched text (first 200 chars):
   [Source: Student Handbook] [Section: HISTORY OF EARIST]
HISTORY OF EARIST

The Eulogio "Amang" Rodriguez Institute of Science and Technology (EARIST) was established after the liberation of Manila in ...


In [6]:

# 5. Build the FAISS index used for similarity search over the chunk embeddings.
print("\nüóÑÔ∏è Building FAISS vector database...")

# FAISS works on float32 matrices; L2-normalizing the rows makes the
# inner-product index below behave as a cosine-similarity index.
embeddings_array = np.array(all_embeddings, dtype='float32')
faiss.normalize_L2(embeddings_array)

dimension = len(all_embeddings[0])
index = faiss.IndexFlatIP(dimension)
index.add(embeddings_array)

print(f"‚úÖ FAISS index ready: {index.ntotal:,} vectors ({dimension} dimensions)")



üóÑÔ∏è Building FAISS vector database...
‚úÖ FAISS index ready: 125 vectors (768 dimensions)


In [8]:
# 6. Prepare the quantization settings for the generation model.
print("\nü§ñ Loading language model...")

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
# NOTE(review): the notebook targets a T4 GPU — confirm bfloat16 compute is
# the right choice for that card.
quantization_options = {
    'load_in_4bit': True,
    'bnb_4bit_use_double_quant': True,
    'bnb_4bit_quant_type': "nf4",
    'bnb_4bit_compute_dtype': torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)



ü§ñ Loading language model...


In [9]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login: renders a widget in the notebook that
# prompts for your HF access token.
# NOTE(review): presumably required so the model weights loaded in the next
# cell can be downloaded — confirm whether the model is gated.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [9]:
# Generation model: Mistral 7B Instruct, loaded with the 4-bit settings in `bnb_config`.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): the saved cell output reports that `torch_dtype` is deprecated
# in favour of `dtype`; switch once the minimum transformers version allows it.
model_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

print(f"‚úÖ Model loaded: {model_name}")
total_gpu_bytes = torch.cuda.get_device_properties(0).total_memory
print(f"üéØ GPU Memory: {total_gpu_bytes / 1e9:.1f}GB")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Model loaded: mistralai/Mistral-7B-Instruct-v0.1
üéØ GPU Memory: 8.5GB


In [10]:
def retrieve_relevant_chunks(query, top_k=5):
    """Find the chunks whose embeddings are most similar to the query.

    Args:
        query: Natural-language question to embed and search for.
        top_k: Maximum number of chunks to request from the FAISS index.

    Returns:
        A list of dicts, in the order FAISS returned the hits, each with the
        keys ``text``, ``score``, ``chunk_id``, ``metadata``, ``section`` and
        ``source``. The list can be shorter than ``top_k`` when the index has
        fewer matching vectors.
    """
    # Embed the query the same way the chunks were embedded, as a 1 x dim float32 matrix.
    query_embedding = embedding_model.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    faiss.normalize_L2(query_vector)

    # Search FAISS index
    scores, indices = index.search(query_vector, top_k)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        idx = int(idx)
        # FAISS fills missing neighbours with index -1. Without the lower
        # bound, chunks_data[-1] would silently return the LAST chunk as a hit.
        if not 0 <= idx < len(chunks_data):
            continue

        chunk = chunks_data[idx]
        metadata = chunk.get('metadata', {})
        results.append({
            'text': chunk['content'],
            'score': float(score),
            'chunk_id': chunk['chunk_id'],
            'metadata': metadata,
            'section': metadata.get('section_hierarchy', 'N/A'),
            'source': metadata.get('source_document', 'Student Handbook')
        })

    return results

def _build_handbook_prompt(query, context_chunks):
    """Render the instruction prompt for the query and the given context chunks."""
    context_parts = []
    for i, chunk in enumerate(context_chunks):
        section = chunk.get('section', 'N/A')
        source = chunk.get('source', 'Student Handbook')
        context_parts.append(f"[Section {i+1}: {section}]\n[Source: {source}]\n{chunk['text']}")

    combined_context = "\n\n".join(context_parts)

    # NOTE(review): the prompt starts with a literal "<s>" and the tokenizer may
    # also prepend its own BOS token — confirm the model does not see it twice.
    return f"""<s>[INST] You are Amang Bot (Ambot), EARIST's official university AI assistant.
Your task is to provide a helpful and informative answer to the 'SCHOOL QUERY' by using *only* the provided sections from the 'Student Handbook 2021'.

Follow these rules strictly:

1.  **Source of Truth:** Your knowledge is strictly limited to the text provided in the 'Student Handbook 2021' sections below. Do not use any external information or make assumptions.

2.  **Formal and Helpful Tone:** The response must be formal, direct, and helpful.

3.  **Response Structure:**
    * First, provide a clear, direct answer to the 'SCHOOL QUERY' in your own words (based *only* on the handbook).
    * Then, support your answer by *directly quoting* the relevant policy or procedure. You must introduce the quote with a phrase like, "As stated in the Student Handbook 2021:"

4.  **Fallback Response:** If the provided sections do not contain the information to answer the question, you *must* respond with *only* this exact phrase: "I'm sorry, I do not have the information to answer your question based on the provided Student Handbook 2021 sections."

Student Handbook 2021 Sections:
{combined_context}

SCHOOL QUERY:
{query}

Official Response: [/INST]"""


def generate_answer(query, context_chunks, max_new_tokens=350, max_input_tokens=3072):
    """Generate an answer to the query from the retrieved handbook chunks.

    The query and the closing ``[/INST]`` tag sit at the END of the prompt, so
    plain truncation of an over-long prompt would cut off exactly the part the
    model needs. Instead, chunks are dropped from the end of ``context_chunks``
    until the prompt fits into ``max_input_tokens``.

    Args:
        query: The user's question.
        context_chunks: Retrieved chunks as dicts with a ``text`` key and
            optional ``section`` / ``source`` keys. Assumed to be ordered most
            relevant first, so the last ones are dropped first.
        max_new_tokens: Upper bound on the number of generated tokens.
        max_input_tokens: Upper bound on the number of prompt tokens.

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    remaining_chunks = list(context_chunks)
    while True:
        prompt = _build_handbook_prompt(query, remaining_chunks)
        inputs = tokenizer(prompt, return_tensors="pt")
        if inputs["input_ids"].shape[1] <= max_input_tokens:
            break
        if len(remaining_chunks) <= 1:
            # Nothing left to drop: fall back to plain truncation as a last resort.
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                               max_length=max_input_tokens)
            break
        remaining_chunks.pop()

    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.2,  # Low for factual accuracy
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # The output holds the prompt followed by the continuation. Decode only the
    # new tokens instead of searching the decoded text for "[/INST]", which
    # returns garbage whenever the tag is missing from the text.
    generated_tokens = outputs[0][prompt_token_count:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    return answer




üöÄ RAG System Ready!
Testing with university-specific questions...


In [None]:
# DEBUG: walk through retrieval, context building, prompt construction and
# generation step by step for one admission-requirements question.
print("="*80)
print("üîç DEBUGGING PIPELINE - Testing 'What are the admission requirements?'")
print("="*80)

test_query = "What are the admission requirements for new students?"

# Step 1: Test Retrieval
print("\nüìö STEP 1: Testing Retrieval...")
retrieved_chunks = retrieve_relevant_chunks(test_query, top_k=5)

print(f"\n‚úÖ Retrieved {len(retrieved_chunks)} chunks:")
for i, chunk in enumerate(retrieved_chunks, 1):
    print(f"\n--- Chunk {i} ---")
    print(f"Score: {chunk['score']:.4f}")
    print(f"Section: {chunk['section']}")
    print(f"Content (first 300 chars):")
    print(chunk['text'][:300])
    print("...")

# Step 2: Test Context Building (same formatting as generate_answer)
print("\n\nüìù STEP 2: Testing Context Building...")
context_parts = []
for i, chunk in enumerate(retrieved_chunks):
    section = chunk.get('section', 'N/A')
    source = chunk.get('source', 'Student Handbook')
    context_parts.append(f"[Section {i+1}: {section}]\n[Source: {source}]\n{chunk['text']}")

combined_context = "\n\n".join(context_parts)
print(f"Combined context length: {len(combined_context)} characters")
print(f"\nFirst 500 characters of combined context:")
print(combined_context[:500])
print("...")

# Step 3: Test Prompt Construction (same template as generate_answer)
print("\n\nüìã STEP 3: Testing Prompt Construction...")
prompt = f"""<s>[INST] You are Amang Bot (Ambot), EARIST's official university AI assistant.
Your task is to provide a helpful and informative answer to the 'SCHOOL QUERY' by using *only* the provided sections from the 'Student Handbook 2021'.

Follow these rules strictly:

1.  **Source of Truth:** Your knowledge is strictly limited to the text provided in the 'Student Handbook 2021' sections below. Do not use any external information or make assumptions.

2.  **Formal and Helpful Tone:** The response must be formal, direct, and helpful.

3.  **Response Structure:**
    * First, provide a clear, direct answer to the 'SCHOOL QUERY' in your own words (based *only* on the handbook).
    * Then, support your answer by *directly quoting* the relevant policy or procedure. You must introduce the quote with a phrase like, "As stated in the Student Handbook 2021:"

4.  **Fallback Response:** If the provided sections do not contain the information to answer the question, you *must* respond with *only* this exact phrase: "I'm sorry, I do not have the information to answer your question based on the provided Student Handbook 2021 sections."

Student Handbook 2021 Sections:
{combined_context}

SCHOOL QUERY:
{test_query}

Official Response: [/INST]"""

print(f"Full prompt length: {len(prompt)} characters")
print(f"First 800 characters of prompt:")
print(prompt[:800])
print("...")

# Step 4: Test Generation
print("\n\nü§ñ STEP 4: Testing Generation...")
print("Generating answer (this may take a moment)...")

max_prompt_tokens = 3072
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_prompt_tokens)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

input_token_count = inputs['input_ids'].shape[1]
print(f"Input token count: {input_token_count}")
if input_token_count >= max_prompt_tokens:
    # The query and the closing [/INST] tag sit at the end of the prompt, so a
    # truncated prompt may no longer contain them.
    print("WARNING: prompt hit the token limit and was truncated; "
          "the query and closing [/INST] tag may be missing.")

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=350,
        temperature=0.2,
        do_sample=True,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nüìÑ Full model response length: {len(full_response)} characters")

# Extract the answer by decoding only the newly generated tokens. Searching the
# decoded text for "[/INST]" returns garbage when the tag is missing: find()
# gives -1 and the slice then starts at character 6 of the prompt.
answer = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True).strip()

print("\n" + "="*80)
print("üéì FINAL ANSWER:")
print("="*80)
print(answer)
print("\n" + "="*80)

# Step 5: rough check of whether the answer looks on-topic
print("\n\nüîç STEP 5: Answer Analysis...")
keywords = ["admission", "requirement", "enroll", "applicant", "student", "grade", "GPA", "document"]
found_keywords = [kw for kw in keywords if kw.lower() in answer.lower()]
print(f"Keywords found in answer: {found_keywords}")
print(f"Answer length: {len(answer)} characters")

if "I'm sorry" in answer or len(answer) < 50:
    print("‚ö†Ô∏è WARNING: Model returned fallback response or very short answer!")
    print("This suggests either:")
    print("  1. Retrieved chunks don't contain admission requirements")
    print("  2. Model is not extracting information correctly")
    print("  3. Prompt may need adjustment")
else:
    print("‚úÖ Model generated a substantive answer")

üîç DEBUGGING PIPELINE - Testing 'What are the admission requirements?'

üìö STEP 1: Testing Retrieval...

‚úÖ Retrieved 5 chunks:

--- Chunk 1 ---
Score: 0.6299
Section: ARTICLE I ‚Äì ADMISSION POLICY > Admission Qualifications of Local Student/s
Content (first 300 chars):
- Food Technology
    - Drafting Technology


## ARTICLE I
### ADMISSION POLICY

Admission of all students shall be primarily based on academic preparedness such as the result of entrance examinations, personal interviews, academic records, character evaluation, the ability of individual students to
...

--- Chunk 2 ---
Score: 0.6005
Section: ARTICLE I ‚Äì ADMISSION POLICY > A. Admission Requirements
Content (first 300 chars):
- Transferees preferably with no grade/s of "Dropped", "Incomplete", and "Failed" may be admitted depending on the availability of slots by the chosen program. Foreign students who graduated from high schools abroad and who have not enrolled in international HEIs (CHED CMO internalization).


In [None]:
def interactive_mode():
    """
    Interactive Q&A mode for querying the Student Handbook.

    Repeatedly reads a question, retrieves the five most relevant chunks,
    prints a short preview of each, then prints the generated answer.

    Type 'exit', 'quit', or 'q' (any letter case), or submit an empty line,
    to stop. The loop also ends cleanly when input is interrupted or no
    input stream is available (KeyboardInterrupt / EOFError).
    """
    print("\n" + "="*80)
    print("ü§ñ AMANG BOT - Interactive Q&A Mode")
    print("="*80)
    print("Ask me anything about EARIST's Student Handbook!")
    print("Type 'exit', 'quit', or 'q' to stop\n")

    while True:
        # Get user query
        print("-" * 80)
        try:
            query = input("\nüí¨ Your Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupted kernel or closed stdin: leave the loop without a traceback.
            print("\nüëã Thank you for using Amang Bot! Goodbye!")
            break

        # Check for exit commands (an empty line also exits)
        if query.lower() in ['exit', 'quit', 'q', '']:
            print("\nüëã Thank you for using Amang Bot! Goodbye!")
            break

        print("\nüîç Searching relevant sections...")

        # Step 1: Retrieve relevant chunks
        relevant_chunks = retrieve_relevant_chunks(query, top_k=5)

        # Display retrieved chunks with metadata
        print(f"\nüìö Found {len(relevant_chunks)} relevant sections:")
        for i, chunk in enumerate(relevant_chunks, 1):
            print(f"\n   [{i}] Section: {chunk['section']}")
            print(f"       Relevance Score: {chunk['score']:.4f}")
            print(f"       Preview: {chunk['text'][:100]}...")

        # Step 2: Generate answer
        print("\nü§î Generating answer...")
        answer = generate_answer(query, relevant_chunks)

        # Display answer
        print("\n" + "="*80)
        print("üéì AMANG BOT RESPONSE:")
        print("="*80)
        print(answer)
        print("\n" + "="*80)

        # Ask if user wants to continue
        print("\n‚ú® Ask another question or type 'exit' to quit")

# Start interactive mode. This call waits on input() for each question and
# only returns once the user exits the loop.
interactive_mode()


ü§ñ AMANG BOT - Interactive Q&A Mode
Ask me anything about EARIST's Student Handbook!
Type 'exit', 'quit', or 'q' to stop

--------------------------------------------------------------------------------

üîç Searching relevant sections...

üìö Found 5 relevant sections:

   [1] Section: MISSION STATEMENTS
       Relevance Score: 0.3589
       Preview: 6. Finished Phase I (June 14, 1999) of the Multi-Purpose Sports Complex Gymnasium with a 2.5M approp...

   [2] Section: HISTORY OF EARIST > EARIST Cavite Campus
       Relevance Score: 0.3376
       Preview: - Awarded LEVEL II STATUS in the Fourteen (14) Degrees Program (Accrediting Agency of Chartered Coll...

   [3] Section: Institute Officials > Additional Institute Officials
       Relevance Score: 0.3327
       Preview: MS. DANA ROLDAN
Chief, Records Service
DR. PEGGY M. OCHOA
Chief, Medical and Dental Service
MR. FERD...

   [4] Section: Board of Trustees
       Relevance Score: 0.3312
       Preview: Board of Trustees
Republi