In [1]:
# Install dependencies

!pip install -q \
    faiss-cpu \
    sentence-transformers \
    datasets \
    evaluate \
    pandas \
    numpy

In [2]:
# Imports

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
from typing import List

In [3]:
# ============================================================================
# 1. Parameter Experiments
# ============================================================================

print("\n" + "="*60)
print("STEP 4: PARAMETER EXPERIMENTS")
print("="*60)

# Embedding configurations
embedding_configs = [
    {'name': 'MiniLM-384D', 'model': 'all-MiniLM-L6-v2', 'dim': 384},
    {'name': 'MPNet-512D', 'model': 'all-mpnet-base-v2', 'dim': 512}
]

# Retrieval configurations
retrieval_configs = [
    {'top_k': 3, 'strategy': 'concatenate', 'description': 'Concatenate top-3 passages'},
    {'top_k': 5, 'strategy': 'concatenate', 'description': 'Concatenate top-5 passages'},
    {'top_k': 10, 'strategy': 'concatenate', 'description': 'Concatenate top-10 passages'},
    {'top_k': 3, 'strategy': 'mmr', 'description': 'MMR top-3 passages'},
    {'top_k': 5, 'strategy': 'mmr', 'description': 'MMR top-5 passages'}
]

print("\nEmbedding models:")
for config in embedding_configs:
    print(f"  - {config['name']}: {config['dim']} dimensions")

print("\nRetrieval configurations:")
for i, config in enumerate(retrieval_configs, 1):
    print(f"  {i}. Top-{config['top_k']} ({config['strategy']}): {config['description']}")


STEP 4: PARAMETER EXPERIMENTS

Embedding models:
  - MiniLM-384D: 384 dimensions
  - MPNet-512D: 512 dimensions

Retrieval configurations:
  1. Top-3 (concatenate): Concatenate top-3 passages
  2. Top-5 (concatenate): Concatenate top-5 passages
  3. Top-10 (concatenate): Concatenate top-10 passages
  4. Top-3 (mmr): MMR top-3 passages
  5. Top-5 (mmr): MMR top-5 passages


In [4]:
# ============================================================================
# 2. Load Dataset & Chunk
# ============================================================================

# Load Wikipedia corpus
wiki_corpus_dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")
wiki_passages = wiki_corpus_dataset["passages"]

# Load question-answer pairs
qa_dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")
test_questions = qa_dataset["test"]

print(f"\nDataset loaded: {len(wiki_passages)} passages, {len(test_questions)} questions")

# Chunk text into fixed-size segments
def chunk_text_by_chars(content: str, chunk_size: int) -> List[str]:
    if not content:
        return []
    return [content[offset:offset + chunk_size]
            for offset in range(0, len(content), chunk_size)]

# Create chunked document collection
def create_chunked_documents(passage_collection, chunk_size: int):
    chunked_documents = []

    for passage_idx, passage_data in enumerate(passage_collection):
        passage_text = passage_data.get("text") or passage_data.get("passage") or ""
        text_chunks = chunk_text_by_chars(passage_text, chunk_size)

        for chunk_idx, chunk_content in enumerate(text_chunks):
            chunked_documents.append({
                "id": f"{passage_idx}-{chunk_idx}",
                "text": chunk_content
            })

    return chunked_documents

CHUNK_SIZE = 550
document_chunks = create_chunked_documents(wiki_passages, CHUNK_SIZE)
print(f"Created {len(document_chunks)} chunks")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Dataset loaded: 3200 passages, 918 questions
Created 4233 chunks


In [9]:
# ============================================================================
# 3. Load FLAN-T5
# ============================================================================

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model_flan = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
generator = pipeline(
    "text2text-generation",
    model=model_flan,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

def generate_answer_from_context(prompt_text):
    generation_output = generator(
        prompt_text,
        max_new_tokens=128,
        temperature=0.2
    )
    return generation_output[0]['generated_text'].strip()

print("FLAN-T5 loaded")

Device set to use cuda:0


FLAN-T5 loaded


In [6]:
# ============================================================================
# 4. Build FAISS Indexes
# ============================================================================

def create_faiss_index(embed_config):
    print(f"\nSetting up: {embed_config['name']}")

    # Load the embedding moadel
    embedding_model = SentenceTransformer(embed_config['model'])
    chunk_texts = [chunk["text"] for chunk in document_chunks]

    # Generate embeddings
    chunk_embeddings = embedding_model.encode(
        chunk_texts,
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True
    ).astype('float32')

    embedding_dim = chunk_embeddings.shape[1]
    faiss_index = faiss.IndexFlatIP(embedding_dim)
    faiss_index.add(chunk_embeddings)

    print(f"  Indexed {faiss_index.ntotal} chunks (dim={embedding_dim})")

    return {
        'model': embedding_model,
        'index': faiss_index,
        'embeddings': chunk_embeddings
    }

# Build FAISS indexes for all embedding configs
embedding_indexes = {}
for config in embedding_configs:
    embedding_indexes[config['name']] = create_faiss_index(config)


Setting up: MiniLM-384D


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

  Indexed 4233 chunks (dim=384)

Setting up: MPNet-512D


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

  Indexed 4233 chunks (dim=768)


In [7]:
# ============================================================================
# 5. Passage selection strategies
# ============================================================================

def retrieve_top_chunks(embedding_bundle, query, top_k):
    query_vector = embedding_bundle['model'].encode(
        [query],
        normalize_embeddings=True
    ).astype('float32')

    similarity_scores, chunk_indices = embedding_bundle['index'].search(query_vector, top_k)

    return [(int(idx), float(score))
            for idx, score in zip(chunk_indices[0], similarity_scores[0])]

def select_passages(embedding_bundle, query, strategy, top_k):
    if strategy == 'concatenate':
        retrieved_results = retrieve_top_chunks(embedding_bundle, query, top_k)
        selected_passages = [document_chunks[idx]["text"] for idx, _ in retrieved_results]
        return " ".join(selected_passages)[:2500]

    elif strategy == 'mmr':
        # Retrieve more candidates for diversity selection
        candidate_pool_size = min(top_k * 3, len(document_chunks))
        candidate_results = retrieve_top_chunks(embedding_bundle, query, candidate_pool_size)

        selected_passages = []
        selected_indices = []
        candidate_indices = [idx for idx, _ in candidate_results]

        query_vector = embedding_bundle['model'].encode(
            [query],
            normalize_embeddings=True
        ).astype('float32')[0]

        while len(selected_passages) < top_k and candidate_indices:
            if not selected_indices:
                # First selection: highest relevance
                best_candidate_idx = candidate_indices[0]
            else:
                # MMR selection: balance relevance and diversity
                mmr_scores = []
                for candidate_idx in candidate_indices:
                    # Relevance to query
                    relevance_score = float(
                        np.dot(embedding_bundle['embeddings'][candidate_idx], query_vector)
                    )

                    # Maximum similarity to already selected passages
                    similarities_to_selected = [
                        float(np.dot(
                            embedding_bundle['embeddings'][candidate_idx],
                            embedding_bundle['embeddings'][selected_idx]
                        ))
                        for selected_idx in selected_indices
                    ]
                    max_similarity = max(similarities_to_selected)

                    # MMR score: balance relevance and diversity
                    mmr_score = 0.5 * relevance_score - 0.5 * max_similarity
                    mmr_scores.append(mmr_score)

                best_candidate_idx = candidate_indices[np.argmax(mmr_scores)]

            selected_passages.append(document_chunks[best_candidate_idx]["text"])
            selected_indices.append(best_candidate_idx)
            candidate_indices.remove(best_candidate_idx)

        return " ".join(selected_passages)[:2500]

In [8]:
# ============================================================================
# 6. Execute Parameter Experiments
# ============================================================================

# Define the best prompting strategy from Step 3
def create_persona_prompt(context, question):
    """Persona-based: Subject matter expert (best from Step 3)"""
    return f"""You are a subject matter expert. Use only the context.
If the answer is not in the context, say 'I don't know'. Be direct.

Context: {context}
Question: {question}
Answer:"""

print("\n" + "="*60)
print("RUNNING ALL PARAMETER COMBINATIONS")
print("="*60)

# Use only first 150 questions for testing
NUM_TEST_QUESTIONS = 150
evaluation_questions = test_questions.select(range(NUM_TEST_QUESTIONS))
print(f"Evaluating on {len(evaluation_questions)} questions (out of {len(test_questions)} total)")

experiment_results = []
total_combinations = len(embedding_configs) * len(retrieval_configs)
current_experiment = 0

for embed_config in embedding_configs:
    embedding_bundle = embedding_indexes[embed_config['name']]

    for retrieval_config in retrieval_configs:
        current_experiment += 1
        print(f"\n[Experiment {current_experiment}/{total_combinations}]")
        print(f"  Embedding: {embed_config['name']}")
        print(f"  Retrieval: Top-{retrieval_config['top_k']} ({retrieval_config['strategy']})")

        model_predictions = []
        ground_truth_answers = []

        for question_idx in range(len(evaluation_questions)):
            query_text = evaluation_questions[question_idx]["question"]

            # Extract ground truth answer
            answer_field = evaluation_questions[question_idx].get("answer") or \
                          evaluation_questions[question_idx].get("answers")
            ground_truth = answer_field if isinstance(answer_field, str) else answer_field[0]

            # Retrieve and select relevant context
            retrieved_context = select_passages(
                embedding_bundle,
                query_text,
                retrieval_config['strategy'],
                retrieval_config['top_k']
            )

            # Create prompt using best strategy from Step 3
            qa_prompt = create_persona_prompt(retrieved_context, query_text)
            predicted_answer = generate_answer_from_context(qa_prompt)

            # Store results for evaluation
            model_predictions.append({
                "id": str(question_idx),
                "prediction_text": predicted_answer
            })
            ground_truth_answers.append({
                "id": str(question_idx),
                "answers": {"text": [ground_truth], "answer_start": [0]}
            })

        # Calculate metrics
        squad_evaluator = evaluate.load("squad")
        evaluation_metrics = squad_evaluator.compute(
            predictions=model_predictions,
            references=ground_truth_answers
        )

        # Record experiment results
        experiment_results.append({
            'embedding_model': embed_config['name'],
            'embedding_dim': embed_config['dim'],
            'top_k': retrieval_config['top_k'],
            'selection_strategy': retrieval_config['strategy'],
            'exact_match': evaluation_metrics['exact_match'],
            'f1_score': evaluation_metrics['f1']
        })

        print(f"  Results: EM={evaluation_metrics['exact_match']:.3f}, "
              f"F1={evaluation_metrics['f1']:.3f}")

# Save results
results_dataframe = pd.DataFrame(experiment_results).sort_values(
    'f1_score',
    ascending=False
)
print("\n" + results_dataframe.to_string(index=False))
results_dataframe.to_csv('step4_results.csv', index=False)
print("\n✓ Saved to step4_results.csv")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



RUNNING ALL PARAMETER COMBINATIONS
Evaluating on 150 questions (out of 918 total)

[Experiment 1/10]
  Embedding: MiniLM-384D
  Retrieval: Top-3 (concatenate)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

  Results: EM=30.000, F1=36.542

[Experiment 2/10]
  Embedding: MiniLM-384D
  Retrieval: Top-5 (concatenate)


Token indices sequence length is longer than the specified maximum sequence length for this model (628 > 512). Running this sequence through the model will result in indexing errors


  Results: EM=32.000, F1=39.974

[Experiment 3/10]
  Embedding: MiniLM-384D
  Retrieval: Top-10 (concatenate)
  Results: EM=32.000, F1=38.606

[Experiment 4/10]
  Embedding: MiniLM-384D
  Retrieval: Top-3 (mmr)
  Results: EM=31.333, F1=37.465

[Experiment 5/10]
  Embedding: MiniLM-384D
  Retrieval: Top-5 (mmr)
  Results: EM=30.000, F1=36.251

[Experiment 6/10]
  Embedding: MPNet-512D
  Retrieval: Top-3 (concatenate)
  Results: EM=31.333, F1=38.577

[Experiment 7/10]
  Embedding: MPNet-512D
  Retrieval: Top-5 (concatenate)
  Results: EM=34.667, F1=42.295

[Experiment 8/10]
  Embedding: MPNet-512D
  Retrieval: Top-10 (concatenate)
  Results: EM=30.000, F1=36.653

[Experiment 9/10]
  Embedding: MPNet-512D
  Retrieval: Top-3 (mmr)
  Results: EM=32.667, F1=38.800

[Experiment 10/10]
  Embedding: MPNet-512D
  Retrieval: Top-5 (mmr)
  Results: EM=34.667, F1=41.409

embedding_model  embedding_dim  top_k selection_strategy  exact_match  f1_score
     MPNet-512D            512      5        conc