In [1]:
with open('../Dataset/StudentHandbookDataset.txt', 'r', encoding='utf-8') as f:
    dataset = f.read()

print(f"Dataset loaded: {len(dataset):,} characters")
print(f"Estimated pages: ~{len(dataset) // 2000}")

Dataset loaded: 171,284 characters
Estimated pages: ~85


In [4]:
import torch
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [5]:
def check_chunks_exist():
    """Check if chunks have been saved"""
    return os.path.exists("saved_chunks/chunks.pkl")

def save_chunks(chunks, filename="saved_chunks/chunks.pkl"):
    """Save chunks to disk"""
    os.makedirs("saved_chunks", exist_ok=True)
    with open(filename, 'wb') as f:
        pickle.dump(chunks, f)
    print(f"[SUCCESS] Saved {len(chunks)} chunks to {filename}")

def load_chunks(filename="saved_chunks/chunks.pkl"):
    """Load chunks from disk"""
    try:
        with open(filename, 'rb') as f:
            chunks = pickle.load(f)
        print(f"[SUCCESS] Loaded {len(chunks)} chunks from {filename}")
        return chunks
    except FileNotFoundError:
        print(f"[ERROR] Chunks file not found: {filename}")
        return None

# Check if chunks already exist
if check_chunks_exist():
    print("[SUCCESS] Chunks file found! You can skip to Step 2 (Embedding Experiments)")
    print("[INFO] To reload chunks, run: chunks = load_chunks()")
else:
    print("[INFO] No saved chunks found. Run the chunking cells below.")

[INFO] No saved chunks found. Run the chunking cells below.


In [None]:
# ========================================
# EXPERIMENT PARAMETER: CHUNKING STRATEGY
# ========================================
# Change these values to experiment with different chunking!

CHUNKING_STRATEGY = "semantic"  # Options: "semantic", "fixed" (future)
PERCENTILE_THRESHOLD = 45       # Try: 70, 75, 80, 85, 90 (higher = fewer, larger chunks)
BUFFER_SIZE = 1                 # Sentences to merge around breakpoints

# ========================================

# ONLY RUN THIS IF CHUNKS DON'T EXIST OR YOU WANT TO RE-CHUNK
print(f"Loading embedding model for {CHUNKING_STRATEGY} chunking...")
print("   Using CPU for stable performance")

chunking_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

print(f"Setting up semantic chunker with PERCENTILE={PERCENTILE_THRESHOLD}...")
text_splitter = SemanticChunker(
    embeddings=chunking_embed_model,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=PERCENTILE_THRESHOLD,
    buffer_size=BUFFER_SIZE,
    add_start_index=True
)
print(f"[SUCCESS] Chunker ready (Percentile: {PERCENTILE_THRESHOLD}, Buffer: {BUFFER_SIZE})!")

# ONLY RUN THIS IF CHUNKS DON'T EXIST OR YOU WANT TO RE-CHUNK
print("Creating semantic chunks from raw text...")
print(f"Configuration: Percentile={PERCENTILE_THRESHOLD}, Buffer={BUFFER_SIZE}")
chunks = text_splitter.create_documents([dataset])
print(f"[SUCCESS] Created {len(chunks)} semantic chunks")

# Analyze chunk quality
chunk_sizes = [len(chunk.page_content) for chunk in chunks]
print(f"\nChunk Analysis:")
print(f"   Average size: {np.mean(chunk_sizes):.0f} characters")
print(f"   Size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")
print(f"   Total chunks: {len(chunks)}")

# Show sample chunks
print("\nSample chunks:")
for i in range(min(3, len(chunks))):
    chunk_preview = chunks[i].page_content[:150].replace('\n', ' ')
    print(f"   Chunk {i+1}: {chunk_preview}...")

# SAVE THE CHUNKS with configuration in filename
chunk_filename = f"saved_chunks/chunks_p{PERCENTILE_THRESHOLD}_b{BUFFER_SIZE}.pkl"
save_chunks(chunks, filename=chunk_filename)
print(f"\n[SUCCESS] Chunking complete and saved to {chunk_filename}!")
print(f"[INFO] Configuration: Percentile={PERCENTILE_THRESHOLD}, Buffer={BUFFER_SIZE}")

# Load saved chunks - automatically detect the correct file based on parameters
chunk_filename = f"saved_chunks/chunks_p{PERCENTILE_THRESHOLD}_b{BUFFER_SIZE}.pkl"

print(f"Looking for chunks: {chunk_filename}")
chunks = load_chunks(filename=chunk_filename)

if chunks is None:
    print(f"[ERROR] Chunks not found for P={PERCENTILE_THRESHOLD}, B={BUFFER_SIZE}")
    print("[INFO] Please run Step 1 (Chunking) cells first with these parameters!")
else:
    print(f"[SUCCESS] Ready to experiment with {len(chunks)} chunks!")
    print(f"[INFO] Loaded configuration: Percentile={PERCENTILE_THRESHOLD}, Buffer={BUFFER_SIZE}")



Loading embedding model for semantic chunking...
   Using CPU for stable performance
Setting up semantic chunker with PERCENTILE=45...
[SUCCESS] Chunker ready (Percentile: 45, Buffer: 3)!
Creating semantic chunks from raw text...
Configuration: Percentile=45, Buffer=3
[SUCCESS] Created 675 semantic chunks

Chunk Analysis:
   Average size: 236 characters
   Size range: 2 - 4352 characters
   Total chunks: 675

Sample chunks:
   Chunk 1: ﻿Republic of the Philippines  Eulogio "Amang" Rodriguez Institute of Science and Technology Office of Student Affairs and Services   EARIST STUDENT HA...
   Chunk 2: 3   - Mission ........
   Chunk 3: 3   - Goal ........
[SUCCESS] Saved 675 chunks to saved_chunks/chunks_p45_b3.pkl

[SUCCESS] Chunking complete and saved to saved_chunks/chunks_p45_b3.pkl!
[INFO] Configuration: Percentile=45, Buffer=3
Looking for chunks: saved_chunks/chunks_p45_b3.pkl
[SUCCESS] Loaded 675 chunks from saved_chunks/chunks_p45_b3.pkl
[SUCCESS] Ready to experiment with 675 chun

In [13]:
# ========================================
# EXPERIMENT PARAMETER: EMBEDDING MODEL
# ========================================
# Change MODEL_NAME to experiment with different embeddings!

# Current options (uncomment one):
# MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # Best quality, 768 dim
# MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Fast, 384 dim
MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"  # Q&A optimized
# MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"  # Multilingual

# ========================================

print(f"\n{'='*70}")
print(f"EMBEDDING MODEL CONFIGURATION")
print(f"{'='*70}")
print(f"Model: {MODEL_NAME}")
print("   Using CPU for stable performance")

embedding_model = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs={
        'device': 'cuda',
        'trust_remote_code': True
    },
    encode_kwargs={'normalize_embeddings': True}
)

# Get embedding dimension
test_embed = embedding_model.embed_query("test")

print(f"[SUCCESS] Embedding model loaded!")
print(f"   Model: {MODEL_NAME.split('/')[-1]}")
print(f"   Embedding dimension: {len(test_embed)}")
print(f"   Device: CPU")
print(f"{'='*70}\n")

print("\nGenerating embeddings for all chunks with current model...")
chunk_texts = [chunk.page_content for chunk in chunks]

# Process embeddings in batches to avoid memory issues
batch_size = 32
all_embeddings = []

import time
start_time = time.time()

for i in range(0, len(chunk_texts), batch_size):
    batch = chunk_texts[i:i+batch_size]
    batch_embeddings = embedding_model.embed_documents(batch)
    all_embeddings.extend(batch_embeddings)
    print(f"   Processed batch {i//batch_size + 1}/{(len(chunk_texts) + batch_size - 1)//batch_size}")

elapsed_time = time.time() - start_time
print(f"\n[SUCCESS] Generated {len(all_embeddings)} embeddings in {elapsed_time:.2f}s")
print(f"Average: {elapsed_time/len(all_embeddings):.3f}s per chunk")


# Build FAISS vector store for fast similarity search
print("\nBuilding FAISS vector database...")
dimension = len(all_embeddings[0])
index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity

# Normalize embeddings for proper cosine similarity
embeddings_array = np.array(all_embeddings).astype('float32')
faiss.normalize_L2(embeddings_array)
index.add(embeddings_array)

print(f"[SUCCESS] FAISS index ready: {index.ntotal:,} vectors ({dimension} dimensions)")
print(f"Model: {MODEL_NAME.split('/')[-1]}")




EMBEDDING MODEL CONFIGURATION
Model: sentence-transformers/multi-qa-mpnet-base-dot-v1
   Using CPU for stable performance
[SUCCESS] Embedding model loaded!
   Model: multi-qa-mpnet-base-dot-v1
   Embedding dimension: 768
   Device: CPU


Generating embeddings for all chunks with current model...
   Processed batch 1/22
   Processed batch 2/22
   Processed batch 3/22
   Processed batch 4/22
   Processed batch 5/22
   Processed batch 6/22
   Processed batch 7/22
   Processed batch 8/22
   Processed batch 9/22
   Processed batch 10/22
   Processed batch 11/22
   Processed batch 12/22
   Processed batch 13/22
   Processed batch 14/22
   Processed batch 15/22
   Processed batch 16/22
   Processed batch 17/22
   Processed batch 18/22
   Processed batch 19/22
   Processed batch 20/22
   Processed batch 21/22
   Processed batch 22/22

[SUCCESS] Generated 675 embeddings in 6.01s
Average: 0.009s per chunk

Building FAISS vector database...
[SUCCESS] FAISS index ready: 675 vectors (768 dimension

In [14]:
def retrieve_relevant_chunks(query, top_k=5):
    """Find most relevant chunks for the query"""
    print(f"\nSearching for: '{query}'")
    print(f"Model: {MODEL_NAME.split('/')[-1]}")
    print("=" * 50)

    # Embed the query
    query_embedding = embedding_model.embed_query(query)
    query_vector = np.array([query_embedding]).astype('float32')
    faiss.normalize_L2(query_vector)

    # Search FAISS index
    scores, indices = index.search(query_vector, top_k)

    # Return results with metadata
    results = []
    for idx, score in zip(indices[0], scores[0]):
        if idx < len(chunks):  # Safety check
            chunk = chunks[idx]
            results.append({
                'text': chunk.page_content,
                'score': float(score),
                'chunk_id': int(idx),
                'start_pos': chunk.metadata.get('start_index', 0) if hasattr(chunk, 'metadata') else 0
            })

    return results

def display_retrieval_results(query, results):
    """Display comprehensive retrieval results"""
    print(f"Found {len(results)} relevant chunks for: '{query}'")
    print("=" * 70)

    for i, result in enumerate(results, 1):
        print(f"\nChunk {i} (ID: {result['chunk_id']})")
        print(f"Relevance Score: {result['score']:.4f}")
        print(f"Position in Document: Character {result['start_pos']:,}")
        print(f"Length: {len(result['text'])} characters")
        print(f"Content Preview (first 200 chars):")
        print(f"   {result['text'][:200].replace(chr(10), ' ').replace(chr(13), '')}...")

        # Show full content if it's short enough
        if len(result['text']) <= 500:
            print(f"Full Content:")
            print(f"   {result['text']}")

        print("-" * 40)

    return results

print("[SUCCESS] Retrieval functions ready!")

[SUCCESS] Retrieval functions ready!


In [9]:
import json
# Load questions from Questions.json
with open('Questions.json', 'r', encoding='utf-8') as f:
    questions_data = json.load(f)

# Count total questions
total_questions = sum(len(questions) for questions in questions_data.values())
print(f"[SUCCESS] Loaded {total_questions} questions across {len(questions_data)} categories")
print("\nCategories:")
for category, questions in questions_data.items():
    print(f"   • {category}: {len(questions)} questions")

[SUCCESS] Loaded 50 questions across 8 categories

Categories:
   • Admissions: 8 questions
   • Enrollment and Registration: 8 questions
   • Fees, Payments and Scholarships: 6 questions
   • Academic Policies and Grading: 8 questions
   • Attendance and Conduct: 6 questions
   • Student Services and Organizations: 6 questions
   • Graduation and Academic Completion: 5 questions
   • General and Miscellaneous Information: 3 questions


In [10]:
def test_category_questions(category_name, top_k=3, show_full_results=False):
    """Test all questions from a specific category"""
    if category_name not in questions_data:
        print(f"[ERROR] Category '{category_name}' not found!")
        print(f"Available categories: {', '.join(questions_data.keys())}")
        return

    questions = questions_data[category_name]
    print(f"\n{'='*70}")
    print(f"Testing Category: {category_name}")
    print(f"Total Questions: {len(questions)}")
    print(f"Model: {MODEL_NAME.split('/')[-1]}")
    print(f"Retrieving top {top_k} chunks per question")
    print(f"{'='*70}")

    all_results = []

    for i, q_item in enumerate(questions, 1):
        question = q_item['question']
        expected_ref = q_item['expected_reference']

        print(f"\n{'─'*70}")
        print(f"Question {i}/{len(questions)}")
        print(f"Q: {question}")
        print(f"Expected Reference: {expected_ref}")
        print(f"{'─'*70}")

        # Retrieve relevant chunks
        results = retrieve_relevant_chunks(question, top_k=top_k)

        # Store results for analysis
        all_results.append({
            'question': question,
            'expected_reference': expected_ref,
            'results': results,
            'top_score': results[0]['score'] if results else 0
        })

        if show_full_results:
            display_retrieval_results(question, results)
        else:
            # Show compact summary
            print(f"\nTop {min(top_k, len(results))} Results:")
            for j, result in enumerate(results[:top_k], 1):
                preview = result['text'][:150].replace('\n', ' ').replace('\r', '')
                print(f"   {j}. Score: {result['score']:.4f} | {preview}...")

    # Summary statistics
    print(f"\n{'='*70}")
    print(f"CATEGORY SUMMARY: {category_name}")
    print(f"{'='*70}")
    avg_top_score = np.mean([r['top_score'] for r in all_results])
    print(f"Average Top Score: {avg_top_score:.4f}")
    print(f"Score Range: {min([r['top_score'] for r in all_results]):.4f} - {max([r['top_score'] for r in all_results]):.4f}")

    # Score distribution
    high_confidence = sum(1 for r in all_results if r['top_score'] > 0.7)
    medium_confidence = sum(1 for r in all_results if 0.5 <= r['top_score'] <= 0.7)
    low_confidence = sum(1 for r in all_results if r['top_score'] < 0.5)

    print(f"\nConfidence Distribution:")
    print(f"   High (>0.7):   {high_confidence}/{len(all_results)} questions ({high_confidence/len(all_results)*100:.1f}%)")
    print(f"   Medium (0.5-0.7): {medium_confidence}/{len(all_results)} questions ({medium_confidence/len(all_results)*100:.1f}%)")
    print(f"   Low (<0.5):    {low_confidence}/{len(all_results)} questions ({low_confidence/len(all_results)*100:.1f}%)")

    return all_results

print("[SUCCESS] Category testing function ready!")


def test_all_questions(top_k=3, show_detailed=False):
    """Test all questions from all categories"""
    print(f"\n{'='*70}")
    print(f"COMPREHENSIVE RETRIEVAL TEST")
    print(f"{'='*70}")
    print(f"Model: {MODEL_NAME.split('/')[-1]}")
    print(f"Total Categories: {len(questions_data)}")
    print(f"Total Questions: {sum(len(q) for q in questions_data.values())}")
    print(f"Top-K per question: {top_k}")
    print(f"{'='*70}")

    all_category_results = {}

    for category_name in questions_data.keys():
        print(f"\n\n{'█'*70}")
        print(f"CATEGORY: {category_name}")
        print(f"{'█'*70}")

        category_results = test_category_questions(
            category_name,
            top_k=top_k,
            show_full_results=show_detailed
        )
        all_category_results[category_name] = category_results

    # Overall summary
    print(f"\n\n{'='*70}")
    print(f"OVERALL PERFORMANCE SUMMARY")
    print(f"{'='*70}")

    all_scores = []
    for category, results in all_category_results.items():
        scores = [r['top_score'] for r in results]
        all_scores.extend(scores)
        avg = np.mean(scores)
        print(f"\n{category}:")
        print(f"   Questions: {len(results)}")
        print(f"   Avg Score: {avg:.4f}")
        print(f"   Range: {min(scores):.4f} - {max(scores):.4f}")

    # Global statistics
    print(f"\n{'─'*70}")
    print(f"GLOBAL STATISTICS:")
    print(f"   Total Questions Tested: {len(all_scores)}")
    print(f"   Overall Average Score: {np.mean(all_scores):.4f}")
    print(f"   Median Score: {np.median(all_scores):.4f}")
    print(f"   Std Deviation: {np.std(all_scores):.4f}")

    # Overall confidence distribution
    high = sum(1 for s in all_scores if s > 0.7)
    medium = sum(1 for s in all_scores if 0.5 <= s <= 0.7)
    low = sum(1 for s in all_scores if s < 0.5)

    print(f"\n   Overall Confidence Distribution:")
    print(f"      High (>0.7):   {high}/{len(all_scores)} ({high/len(all_scores)*100:.1f}%)")
    print(f"      Medium (0.5-0.7): {medium}/{len(all_scores)} ({medium/len(all_scores)*100:.1f}%)")
    print(f"      Low (<0.5):    {low}/{len(all_scores)} ({low/len(all_scores)*100:.1f}%)")

    return all_category_results

print("[SUCCESS] Comprehensive testing function ready!")




# Create experiment tracking system
experiment_results = []

def save_experiment_results(filename="experiment_results.json"):
    """Save all experiment results to file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(experiment_results, f, indent=2)
    print(f"[SUCCESS] Saved {len(experiment_results)} experiments to {filename}")

def load_experiment_results(filename="experiment_results.json"):
    """Load previous experiment results"""
    global experiment_results
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            experiment_results = json.load(f)
        print(f"[SUCCESS] Loaded {len(experiment_results)} previous experiments")
        return experiment_results
    except FileNotFoundError:
        print(f"[INFO] No previous results found. Starting fresh!")
        return []

# Try to load previous results
load_experiment_results()

print("[SUCCESS] Experiment tracking system ready!")
print(f"Current experiments tracked: {len(experiment_results)}")


def run_full_experiment(experiment_name, chunking_config, embedding_model_name, top_k=3):
    """
    Run a complete experiment with specific chunking and embedding configuration

    Args:
        experiment_name: Descriptive name for this experiment
        chunking_config: Dict with keys 'strategy', 'params' (e.g., percentile, overlap)
        embedding_model_name: HuggingFace model name
        top_k: Number of chunks to retrieve per query
    """
    start_time = time.time()

    print(f"\n{'█'*70}")
    print(f"RUNNING EXPERIMENT: {experiment_name}")
    print(f"{'█'*70}")
    print(f"Chunking: {chunking_config}")
    print(f"Embedding: {embedding_model_name}")
    print(f"Top-K: {top_k}")
    print(f"{'█'*70}\n")

    # Test with all questions
    category_results = test_all_questions(top_k=top_k, show_detailed=False)

    # Calculate metrics
    all_scores = []
    category_scores = {}

    for category, results in category_results.items():
        scores = [r['top_score'] for r in results]
        all_scores.extend(scores)
        category_scores[category] = {
            'avg_score': float(np.mean(scores)),
            'min_score': float(min(scores)),
            'max_score': float(max(scores)),
            'num_questions': len(scores)
        }

    # Compute overall metrics
    elapsed_time = time.time() - start_time

    experiment_data = {
        'experiment_name': experiment_name,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'chunking_config': chunking_config,
        'embedding_model': embedding_model_name,
        'top_k': top_k,
        'metrics': {
            'overall_avg_score': float(np.mean(all_scores)),
            'overall_median_score': float(np.median(all_scores)),
            'overall_std': float(np.std(all_scores)),
            'min_score': float(min(all_scores)),
            'max_score': float(max(all_scores)),
            'high_confidence_pct': float(sum(1 for s in all_scores if s > 0.7) / len(all_scores) * 100),
            'medium_confidence_pct': float(sum(1 for s in all_scores if 0.5 <= s <= 0.7) / len(all_scores) * 100),
            'low_confidence_pct': float(sum(1 for s in all_scores if s < 0.5) / len(all_scores) * 100),
            'total_questions': len(all_scores),
            'execution_time_seconds': elapsed_time
        },
        'category_metrics': category_scores
    }

    # Save to tracking
    experiment_results.append(experiment_data)
    save_experiment_results()

    print(f"\n{'='*70}")
    print(f"EXPERIMENT COMPLETE: {experiment_name}")
    print(f"{'='*70}")
    print(f"Overall Avg Score: {experiment_data['metrics']['overall_avg_score']:.4f}")
    print(f"High Confidence: {experiment_data['metrics']['high_confidence_pct']:.1f}%")
    print(f"Execution Time: {elapsed_time:.1f}s")
    print(f"{'='*70}\n")

    return experiment_data

print("[SUCCESS] Experiment runner ready!")


def compare_experiments():
    """Compare all tracked experiments and show rankings"""
    if not experiment_results:
        print("[INFO] No experiments to compare yet. Run some experiments first!")
        return

    print(f"\n{'='*80}")
    print(f"EXPERIMENT COMPARISON DASHBOARD")
    print(f"{'='*80}")
    print(f"Total Experiments: {len(experiment_results)}\n")

    # Sort by overall average score
    sorted_experiments = sorted(
        experiment_results,
        key=lambda x: x['metrics']['overall_avg_score'],
        reverse=True
    )

    # Display ranking table
    print(f"{'Rank':<6} {'Experiment Name':<35} {'Avg Score':<12} {'High Conf %':<12} {'Time (s)':<10}")
    print(f"{'─'*80}")

    for i, exp in enumerate(sorted_experiments, 1):
        name = exp['experiment_name'][:33]
        avg_score = exp['metrics']['overall_avg_score']
        high_conf = exp['metrics']['high_confidence_pct']
        exec_time = exp['metrics']['execution_time_seconds']

        print(f"{i:<6} {name:<35} {avg_score:<12.4f} {high_conf:<12.1f} {exec_time:<10.1f}")

    # Show best experiment details
    print(f"\n{'='*80}")
    print(f"🏆 BEST EXPERIMENT: {sorted_experiments[0]['experiment_name']}")
    print(f"{'='*80}")
    best = sorted_experiments[0]

    print(f"\nConfiguration:")
    print(f"   Chunking: {best['chunking_config']}")
    print(f"   Embedding Model: {best['embedding_model']}")
    print(f"   Top-K: {best['top_k']}")

    print(f"\nPerformance Metrics:")
    print(f"   Average Score: {best['metrics']['overall_avg_score']:.4f}")
    print(f"   Median Score: {best['metrics']['overall_median_score']:.4f}")
    print(f"   Std Deviation: {best['metrics']['overall_std']:.4f}")
    print(f"   Score Range: {best['metrics']['min_score']:.4f} - {best['metrics']['max_score']:.4f}")

    print(f"\nConfidence Distribution:")
    print(f"   High (>0.7):   {best['metrics']['high_confidence_pct']:.1f}%")
    print(f"   Medium (0.5-0.7): {best['metrics']['medium_confidence_pct']:.1f}%")
    print(f"   Low (<0.5):    {best['metrics']['low_confidence_pct']:.1f}%")

    print(f"\nCategory Performance:")
    for category, metrics in best['category_metrics'].items():
        print(f"   {category}: {metrics['avg_score']:.4f}")

    # Show comparison with worst
    if len(sorted_experiments) > 1:
        worst = sorted_experiments[-1]
        improvement = ((best['metrics']['overall_avg_score'] - worst['metrics']['overall_avg_score'])
                      / worst['metrics']['overall_avg_score'] * 100)
        print(f"\n📈 Improvement over worst: {improvement:.1f}%")

    print(f"\n{'='*80}\n")

    return sorted_experiments

print("[SUCCESS] Comparison function ready!")


# Quick visualization of experiment results
def plot_experiment_comparison():
    """Create a simple text-based comparison chart"""
    if not experiment_results:
        print("[INFO] No experiments to visualize yet.")
        return

    sorted_exp = sorted(experiment_results, key=lambda x: x['metrics']['overall_avg_score'], reverse=True)

    print(f"\n{'='*70}")
    print(f"EXPERIMENT PERFORMANCE VISUALIZATION")
    print(f"{'='*70}\n")

    # Bar chart of average scores
    print("Average Score Comparison:")
    print("-" * 70)
    max_score = max(e['metrics']['overall_avg_score'] for e in sorted_exp)

    for exp in sorted_exp:
        name = exp['experiment_name'][:30]
        score = exp['metrics']['overall_avg_score']
        bar_length = int((score / max_score) * 40)
        bar = '█' * bar_length
        print(f"{name:<30} {score:.4f} {bar}")

    print()

    # High confidence percentage comparison
    print("High Confidence (>0.7) Percentage:")
    print("-" * 70)
    max_conf = max(e['metrics']['high_confidence_pct'] for e in sorted_exp)

    for exp in sorted_exp:
        name = exp['experiment_name'][:30]
        conf = exp['metrics']['high_confidence_pct']
        bar_length = int((conf / max_conf) * 40) if max_conf > 0 else 0
        bar = '█' * bar_length
        print(f"{name:<30} {conf:>6.1f}% {bar}")

    print(f"\n{'='*70}\n")

print("[SUCCESS] Visualization function ready!")

[SUCCESS] Category testing function ready!
[SUCCESS] Comprehensive testing function ready!
[SUCCESS] Loaded 11 previous experiments
[SUCCESS] Experiment tracking system ready!
Current experiments tracked: 11
[SUCCESS] Experiment runner ready!
[SUCCESS] Comparison function ready!
[SUCCESS] Visualization function ready!


In [15]:
# ========================================
# RUN CURRENT EXPERIMENT
# ========================================
# This will test your current chunking + embedding configuration
# against all questions from Questions.json

# Automatically detect current configuration
current_chunking = {
    "strategy": "semantic",
    "percentile": PERCENTILE_THRESHOLD if 'PERCENTILE_THRESHOLD' in dir() else 80,
    "buffer_size": BUFFER_SIZE if 'BUFFER_SIZE' in dir() else 1
}

current_model = MODEL_NAME.split('/')[-1] if 'MODEL_NAME' in dir() else "unknown"

# Generate experiment name
experiment_name = f"P{current_chunking['percentile']}-{current_model}"

print(f"Ready to run experiment: {experiment_name}")
print(f"Chunking: {current_chunking}")
print(f"Embedding: {MODEL_NAME if 'MODEL_NAME' in dir() else 'Not set'}")
print("\n🚀 Uncomment the line below to run the experiment:")
print("# run_full_experiment(experiment_name, current_chunking, MODEL_NAME, top_k=5)")

# Uncomment to run:
run_full_experiment(experiment_name, current_chunking, MODEL_NAME, top_k=5)

Ready to run experiment: P45-multi-qa-mpnet-base-dot-v1
Chunking: {'strategy': 'semantic', 'percentile': 45, 'buffer_size': 3}
Embedding: sentence-transformers/multi-qa-mpnet-base-dot-v1

🚀 Uncomment the line below to run the experiment:
# run_full_experiment(experiment_name, current_chunking, MODEL_NAME, top_k=5)

██████████████████████████████████████████████████████████████████████
RUNNING EXPERIMENT: P45-multi-qa-mpnet-base-dot-v1
██████████████████████████████████████████████████████████████████████
Chunking: {'strategy': 'semantic', 'percentile': 45, 'buffer_size': 3}
Embedding: sentence-transformers/multi-qa-mpnet-base-dot-v1
Top-K: 5
██████████████████████████████████████████████████████████████████████


COMPREHENSIVE RETRIEVAL TEST
Model: multi-qa-mpnet-base-dot-v1
Total Categories: 8
Total Questions: 50
Top-K per question: 5


██████████████████████████████████████████████████████████████████████
CATEGORY: Admissions
██████████████████████████████████████████████████████████

{'experiment_name': 'P45-multi-qa-mpnet-base-dot-v1',
 'timestamp': '2025-10-15 20:39:18',
 'chunking_config': {'strategy': 'semantic',
  'percentile': 45,
  'buffer_size': 3},
 'embedding_model': 'sentence-transformers/multi-qa-mpnet-base-dot-v1',
 'top_k': 5,
 'metrics': {'overall_avg_score': 0.6317782586812973,
  'overall_median_score': 0.6329613626003265,
  'overall_std': 0.0646537192425565,
  'min_score': 0.47181645035743713,
  'max_score': 0.7992461323738098,
  'high_confidence_pct': 12.0,
  'medium_confidence_pct': 84.0,
  'low_confidence_pct': 4.0,
  'total_questions': 50,
  'execution_time_seconds': 0.575904369354248},
 'category_metrics': {'Admissions': {'avg_score': 0.6308294460177422,
   'min_score': 0.5961345434188843,
   'max_score': 0.6921945214271545,
   'num_questions': 8},
  'Enrollment and Registration': {'avg_score': 0.6379680596292019,
   'min_score': 0.47181645035743713,
   'max_score': 0.7936431169509888,
   'num_questions': 8},
  'Fees, Payments and Scholarships

In [16]:
# ========================================
# VIEW RESULTS & COMPARISON
# ========================================

# Show comparison dashboard
compare_experiments()

# Show visual comparison
plot_experiment_comparison()


EXPERIMENT COMPARISON DASHBOARD
Total Experiments: 13

Rank   Experiment Name                     Avg Score    High Conf %  Time (s)  
────────────────────────────────────────────────────────────────────────────────
1      P50-multi-qa-mpnet-base-dot-v1      0.6408       22.0         1.6       
2      P50-multi-qa-mpnet-base-dot-v1      0.6341       12.0         1.6       
3      P45-multi-qa-mpnet-base-dot-v1      0.6318       12.0         0.6       
4      P45-multi-qa-mpnet-base-dot-v1      0.6318       12.0         0.6       
5      P60-multi-qa-mpnet-base-dot-v1      0.6259       16.0         1.6       
6      P65-multi-qa-mpnet-base-dot-v1      0.6230       16.0         1.6       
7      P50-multi-qa-mpnet-base-dot-v1      0.6226       8.0          1.7       
8      P70-multi-qa-mpnet-base-dot-v1      0.6213       14.0         1.6       
9      P75-multi-qa-mpnet-base-dot-v1      0.6122       14.0         1.6       
10     P65-all-mpnet-base-v2               0.5826       8.0    