In [1]:
with open('Dataset/StudentHandbookDataset.txt', 'r', encoding='utf-8') as f:
    dataset = f.read()

print(f"üìö Dataset loaded: {len(dataset):,} characters")
print(f"üìÑ Estimated pages: ~{len(dataset) // 2000}")

üìö Dataset loaded: 171,284 characters
üìÑ Estimated pages: ~85


In [2]:
import torch
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings('ignore')

In [3]:
print("\nüîß Initializing embedding model...")
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",  # Best quality for academic text
    model_kwargs={
        'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        'trust_remote_code': True
    },
    encode_kwargs={'normalize_embeddings': True}
)
print("‚úÖ Embedding model loaded: all-mpnet-base-v2 (768 dimensions)")


üîß Initializing embedding model...
‚úÖ Embedding model loaded: all-mpnet-base-v2 (768 dimensions)


In [4]:
print("\nüîß Setting up semantic chunker...")
text_splitter = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=80,  # Good balance for policy docs
    buffer_size=1,
    add_start_index=True  # Track position in original text
)


üîß Setting up semantic chunker...


In [5]:
# 3. Split raw text into semantic chunks using create_documents
print("\nüìù Creating semantic chunks from raw text...")
chunks = text_splitter.create_documents([dataset])
print(f"‚úÖ Created {len(chunks)} semantic chunks")

# Analyze chunk quality
chunk_sizes = [len(chunk.page_content) for chunk in chunks]
print(f"\nüìä Chunk Analysis:")
print(f"   Average size: {np.mean(chunk_sizes):.0f} characters")
print(f"   Size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")
print(f"   Total chunks: {len(chunks)}")

# Show sample chunks
print("\nüìã Sample chunks:")
for i in range(min(3, len(chunks))):
    chunk_preview = chunks[i].page_content[:150].replace('\n', ' ')
    print(f"   Chunk {i+1}: {chunk_preview}...")



üìù Creating semantic chunks from raw text...
‚úÖ Created 246 semantic chunks

üìä Chunk Analysis:
   Average size: 650 characters
   Size range: 2 - 10109 characters
   Total chunks: 246

üìã Sample chunks:
   Chunk 1: ÔªøRepublic of the Philippines  Eulogio "Amang" Rodriguez Institute of Science and Technology Office of Student Affairs and Services   EARIST STUDENT HA...
   Chunk 2: ii - HISTORY OF EARIST ..... 1 - MISSION STATEMENTS   - Vision ..... 3   - Mission ..... 3   - Goal ........
   Chunk 3: 3   - Objectives ..... 3 - CURRICULAR OFFERINGS   - Main Campus     - College of Architecture and Fine Arts ..... 4     - College of Arts and Sciences...


In [6]:
print("\nüîÑ Generating embeddings for all chunks...")
chunk_texts = [chunk.page_content for chunk in chunks]

# Process embeddings in batches to avoid memory issues
batch_size = 32
all_embeddings = []
for i in range(0, len(chunk_texts), batch_size):
    batch = chunk_texts[i:i+batch_size]
    batch_embeddings = embedding_model.embed_documents(batch)
    all_embeddings.extend(batch_embeddings)
    print(f"   Processed batch {i//batch_size + 1}/{(len(chunk_texts) + batch_size - 1)//batch_size}")

print(f"‚úÖ Generated {len(all_embeddings)} embeddings")




üîÑ Generating embeddings for all chunks...
   Processed batch 1/8
   Processed batch 2/8
   Processed batch 3/8
   Processed batch 4/8
   Processed batch 5/8
   Processed batch 6/8
   Processed batch 7/8
   Processed batch 8/8
‚úÖ Generated 246 embeddings


In [7]:

# 5. Build FAISS vector store for fast similarity search
print("\nüóÑÔ∏è Building FAISS vector database...")
dimension = len(all_embeddings[0])
index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity

# Normalize embeddings for proper cosine similarity
embeddings_array = np.array(all_embeddings).astype('float32')
faiss.normalize_L2(embeddings_array)
index.add(embeddings_array)

print(f"‚úÖ FAISS index ready: {index.ntotal:,} vectors ({dimension} dimensions)")



üóÑÔ∏è Building FAISS vector database...
‚úÖ FAISS index ready: 246 vectors (768 dimensions)


In [8]:
# 6. Load high-quality language model for generation
print("\nü§ñ Loading language model...")

# Configure 4-bit quantization for T4 GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)



ü§ñ Loading language model...


In [9]:
from huggingface_hub import notebook_login

# This will prompt you to enter your HF token
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [9]:
# Use Mistral 7B for quality (perfect for T4)
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

print(f"‚úÖ Model loaded: {model_name}")
print(f"üéØ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Model loaded: mistralai/Mistral-7B-Instruct-v0.1
üéØ GPU Memory: 8.5GB


In [10]:
def retrieve_relevant_chunks(query, top_k=5):
    """Find most relevant chunks for the query"""
    # Embed the query
    query_embedding = embedding_model.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    faiss.normalize_L2(query_vector)

    # Search FAISS index
    scores, indices = index.search(query_vector, top_k)

    # Return results with metadata
    results = []
    for idx, score in zip(indices[0], scores[0]):
        if idx < len(chunks):  # Safety check
            chunk = chunks[idx]
            results.append({
                'text': chunk.page_content,
                'score': float(score),
                'chunk_id': int(idx),
                'start_pos': chunk.metadata.get('start_index', 0) if hasattr(chunk, 'metadata') else 0
            })

    return results

def generate_answer(query, context_chunks, max_new_tokens=350):
    """Generate answer using retrieved context"""
    # Combine context from relevant chunks
    context_parts = []
    for i, chunk in enumerate(context_chunks):
        context_parts.append(f"[Section {i+1}]\n{chunk['text']}")

    combined_context = "\n\n".join(context_parts)

    # Create optimized prompt for university handbook
    prompt = f"""<s>[INST] You are a university student advisor with access to the official student handbook. Answer the student's question accurately using only the provided handbook sections.

HANDBOOK SECTIONS:
{combined_context}

STUDENT QUESTION: {query}

Provide a clear, helpful answer based on the handbook information above. If the handbook doesn't contain enough information, say so. Be specific about policies, procedures, and requirements. [/INST]"""

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3072)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.2,  # Low for factual accuracy
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Extract generated answer
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = response.find("[/INST]") + len("[/INST]")
    answer = response[answer_start:].strip()

    return answer

def ask_handbook(question, top_k=5, show_sources=True):
    """Complete RAG pipeline for handbook queries"""
    print(f"\n‚ùì Question: {question}")
    print("=" * 70)

    # Retrieve relevant sections
    relevant_chunks = retrieve_relevant_chunks(question, top_k)

    if not relevant_chunks:
        print("‚ùå No relevant information found in handbook")
        return None

    if show_sources:
        print("üìö Found relevant handbook sections:")
        for i, chunk in enumerate(relevant_chunks):
            print(f"\nüìÑ Section {i+1} (Relevance: {chunk['score']:.3f})")
            preview = chunk['text'][:200].replace('\n', ' ')
            print(f"   {preview}...")

    # Generate answer
    print("\nü§î Generating answer...")
    answer = generate_answer(question, relevant_chunks)

    print(f"\nüí° Answer:")
    print(answer)
    print("\n" + "=" * 70)

    return {
        'question': question,
        'answer': answer,
        'sources': relevant_chunks,
        'num_sources': len(relevant_chunks)
    }

# Test the system
print("\n\nüöÄ RAG System Ready!")
print("Testing with university-specific questions...")

# Sample test questions for university handbook
test_questions = [
    "What are the graduation requirements?",
    "How do I withdraw from a course?",
    "What is the academic probation policy?",
    "What happens if I'm caught cheating?",
    "How do I change my major?"
]

# Run a test query
test_result = ask_handbook(test_questions[0])

# Interactive query function
def interactive_mode():
    """Interactive mode for asking questions"""
    print("\nüéì Interactive University Handbook Assistant")
    print("Type 'quit' to exit")
    print("-" * 50)

    while True:
        question = input("\n‚ùì Your question: ").strip()

        if question.lower() in ['quit', 'exit', 'q']:
            print("üëã Goodbye!")
            break

        if not question:
            print("Please enter a question!")
            continue

        try:
            ask_handbook(question)
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")

# Uncomment to start interactive mode
# interactive_mode()

print("\nüìã System Summary:")
print(f"   üìö Processed: {len(chunks):,} semantic chunks")
print(f"   üîç Embeddings: {len(all_embeddings):,} vectors")
print(f"   ü§ñ Model: {model_name}")
print(f"   üíæ Ready for queries!")



üöÄ RAG System Ready!
Testing with university-specific questions...

‚ùì Question: What are the graduation requirements?
üìö Found relevant handbook sections:

üìÑ Section 1 (Relevance: 0.676)
   GRADUATION REQUIREMENTS   ### 9.1 A candidate for graduation shall file an application for graduation to the Registrar's Office. ### 9.2 A student shall be recommended for graduation when he/she has s...

üìÑ Section 2 (Relevance: 0.610)
   2. Must carry a minimum academic load of 18 units (except for graduating students). 3....

üìÑ Section 3 (Relevance: 0.586)
   Must have a passing grade in all subjects including P.E. and NSTP (MTSLTS/CWTS) enrolled in order to qualify for continuance for the following semester. ##### b....

üìÑ Section 4 (Relevance: 0.574)
   TERTIARY EDUCATION PROGRAM   ##### a. Qualifications 1....

üìÑ Section 5 (Relevance: 0.553)
   ### 10.2.2 Every candidate for graduation with honors must:   #### 10.2.4.1 Have carried the normal load prescribed in his/her cu

In [13]:
interactive_mode()


üéì Interactive University Handbook Assistant
Type 'quit' to exit
--------------------------------------------------

‚ùì Question: What are the grounds for thesis failure?
üìö Found relevant handbook sections:

üìÑ Section 1 (Relevance: 0.426)
   Obtain a failing grade......

üìÑ Section 2 (Relevance: 0.383)
   * b. Obtain a failing grade during term... * c....

üìÑ Section 3 (Relevance: 0.374)
   * d. Failure to enroll... * e....

üìÑ Section 4 (Relevance: 0.265)
   Such proponents/sources shall be held answerable in case of complaints....

üìÑ Section 5 (Relevance: 0.255)
   Expulsion ‚Äì the act of forcing someone to leave with justifiable cause or reason. Failed Grade ‚Äì a grade given to a student who does not qualify for passing a subject....

ü§î Generating answer...

üí° Answer:
Based on the provided handbook sections, it appears that there is no specific section regarding the grounds for thesis failure. However, Section 5 mentions expulsion as the act of forcing som