# Hybrid Dense-Sparse Retrieval Experiments

This notebook implements the experimental setup for evaluating our hybrid dense-sparse retrieval approach on the HotpotQA dataset. We'll evaluate the system's performance with different configurations and analyze the results.

## Table of Contents:
1. Project Setup and Dependencies
2. Configuration and Parameters
3. Data Loading and Preprocessing
4. Model Implementation
5. Experimental Evaluation
6. Results Analysis and Visualization

In [None]:
# Import required libraries
import os
import json
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Import project modules
import sys
sys.path.append('../src')
from multihop_dense_retrieval.dense.model import DenseRetriever
from multihop_dense_retrieval.sparse.retriever import SparseRetriever
from multihop_dense_retrieval.hybrid.retriever import HybridRetriever

# Configure plotting
plt.style.use('seaborn')
%matplotlib inline

## Project Configuration

First, we'll set up the project configuration including paths and experiment parameters. We'll create a configuration class to manage all experiment settings.

In [None]:
class ExperimentConfig:
    """Bundle of filesystem paths, model settings and sweep settings.

    Creating an instance also creates the data, results and models
    directories (all located under ``../``) when they do not exist yet.
    """

    def __init__(self):
        # Project paths, all derived from the parent of the working directory
        root = Path('../')
        self.base_path = root
        self.data_path = root / 'data/hotpotqa'
        self.results_path = root / 'experiments/results'
        self.models_path = root / 'models'

        # Make sure every directory is present before anything reads or writes
        for directory in (self.data_path, self.results_path, self.models_path):
            directory.mkdir(parents=True, exist_ok=True)

        # Model configuration: dense encoder settings and BM25-style k1/b
        # values for the sparse retriever
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        dense_settings = {
            'model_name': 'roberta-base',
            'max_length': 512,
            'batch_size': 32,
            'device': device,
        }
        sparse_settings = {'k1': 1.5, 'b': 0.75}
        self.model_config = {'dense': dense_settings, 'sparse': sparse_settings}

        # Experiment settings
        self.experiment_config = {
            'alpha_values': [0.0, 0.3, 0.5, 0.7, 0.8, 0.9, 1.0],
            'top_k_values': [1, 5, 10, 50],
            'batch_size': 32,
            'num_samples': None,  # None for full dataset
        }

    def save_config(self, filename: str):
        """Write the model and experiment settings to ``results_path / filename`` as JSON."""
        payload = {
            'model_config': self.model_config,
            'experiment_config': self.experiment_config,
        }
        target = self.results_path / filename
        with open(target, 'w') as handle:
            json.dump(payload, handle, indent=2)

    @staticmethod
    def load_config(filepath: str):
        """Read a JSON configuration file and return its parsed content."""
        with open(filepath) as handle:
            return json.load(handle)

# Initialize configuration
# Instantiating ExperimentConfig also creates the data, results and models
# directories if they are missing (done inside its __init__).
config = ExperimentConfig()
# Report which device ('cuda' or 'cpu') was selected for the dense retriever.
print("Configuration initialized with device:", config.model_config['dense']['device'])

## Data Loading and Preprocessing

Next, we'll implement functions to load and preprocess the HotpotQA dataset. This includes loading the dev set and the Wikipedia corpus, as well as implementing evaluation utilities.

In [None]:
class DataLoader:
    """Load the HotpotQA dev set and the Wikipedia corpus from ``config.data_path``.

    The dev set is cached on the instance and exposed through the
    ``dev_data`` property, so components that only hold a reference to the
    loader (such as the experiment runner) can read the examples from it.
    """

    def __init__(self, config: "ExperimentConfig"):
        self.config = config
        # Cache filled by load_hotpotqa_dev(); None means "not loaded yet".
        self._dev_data = None

    @property
    def dev_data(self) -> List[Dict]:
        """Dev-set examples; read from disk on first access if not loaded yet.

        Raises:
            FileNotFoundError: if ``dev.json`` is missing and a load is needed.
        """
        if self._dev_data is None:
            self.load_hotpotqa_dev()
        return self._dev_data

    def load_hotpotqa_dev(self) -> List[Dict]:
        """Load HotpotQA dev set.

        Reads ``dev.json`` from the data directory, truncates it to
        ``experiment_config['num_samples']`` examples when that value is set,
        caches the result for ``dev_data`` and returns it.

        Raises:
            FileNotFoundError: if ``dev.json`` does not exist.
        """
        filepath = self.config.data_path / 'dev.json'
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(filepath, encoding='utf-8') as f:
            data = json.load(f)

        num_samples = self.config.experiment_config['num_samples']
        if num_samples:
            data = data[:num_samples]

        self._dev_data = data
        print(f"Loaded {len(data)} examples from dev set")
        return data

    def load_wiki_corpus(self) -> Dict[str, str]:
        """Load Wikipedia corpus.

        Reads ``wiki.json`` (a mapping from document id to an object with
        ``title`` and ``text`` keys) and returns a mapping from document id
        to ``"<title> <text>"``.

        Raises:
            FileNotFoundError: if ``wiki.json`` does not exist.
        """
        filepath = self.config.data_path / 'wiki.json'
        with open(filepath, encoding='utf-8') as f:
            wiki_data = json.load(f)

        # Concatenate title and text for each document
        corpus = {
            doc_id: f"{content['title']} {content['text']}"
            for doc_id, content in wiki_data.items()
        }

        print(f"Loaded {len(corpus)} documents from Wikipedia")
        return corpus

    @staticmethod
    def get_gold_docs(example: Dict) -> List[str]:
        """Extract gold supporting document IDs from example.

        Returns the first element of every entry in
        ``example['supporting_facts']``, in order; repeated entries are kept.
        NOTE(review): in HotpotQA these first elements are presumably article
        titles - confirm they match the identifiers the retrievers return.
        """
        return [doc[0] for doc in example['supporting_facts']]
    
# Initialize data loader
data_loader = DataLoader(config)

# Load development set and Wikipedia corpus
# Best-effort: a missing file is reported instead of raised so the notebook
# cell completes; later cells that need the data will still fail.
try:
    dev_data = data_loader.load_hotpotqa_dev()
    wiki_corpus = data_loader.load_wiki_corpus()
except FileNotFoundError as err:
    print("Error: Dataset files not found. Please run the download script first.")
    # Tell the user exactly what is missing and where it is expected.
    print(f"  Missing file: {err.filename}")
    print(f"  Expected data directory: {config.data_path.resolve()}")

## Model Setup and Evaluation

Now we'll set up the evaluation framework: an `Evaluator` class that computes the retrieval metrics (top-k accuracy and MRR), and an `ExperimentRunner` class that evaluates the hybrid retriever across the configured alpha values.

In [None]:
class Evaluator:
    """Compute top-k accuracy and MRR for ranked retrieval results."""

    def __init__(self, config: "ExperimentConfig"):
        self.config = config

    def compute_metrics(
        self,
        predictions: List[Tuple[List[str], List[float]]],
        gold_docs: List[List[str]]
    ) -> Dict[str, float]:
        """Compute retrieval metrics.

        Args:
            predictions: one ``(ranked_doc_ids, scores)`` pair per query;
                only the ranked ids are used.
            gold_docs: one list of gold document ids per query, aligned with
                ``predictions``.

        Returns:
            A dict with ``top_{k}_accuracy`` for every k in
            ``experiment_config['top_k_values']`` (fraction of queries with at
            least one gold document among the first k results) followed by
            ``mrr`` (mean reciprocal rank of the first gold document, counting
            0 for queries without a hit). All values are 0.0 when
            ``predictions`` is empty.

        Raises:
            ValueError: if ``predictions`` and ``gold_docs`` differ in length.
        """
        if len(predictions) != len(gold_docs):
            # zip() would silently drop the surplus entries and skew the scores.
            raise ValueError(
                f"predictions ({len(predictions)}) and gold_docs "
                f"({len(gold_docs)}) must have the same length"
            )

        top_k_values = self.config.experiment_config['top_k_values']
        num_queries = len(predictions)
        if num_queries == 0:
            # Nothing to score; avoid dividing by zero.
            metrics = {f'top_{k}_accuracy': 0.0 for k in top_k_values}
            metrics['mrr'] = 0.0
            return metrics

        # 1-based rank of the first gold document per query (None = no hit).
        # Both metrics are derived from it, so it is computed only once.
        first_hit_ranks = []
        for (pred_docs, _), gold in zip(predictions, gold_docs):
            gold_set = set(gold)
            first_hit_ranks.append(
                next(
                    (rank for rank, doc in enumerate(pred_docs, 1) if doc in gold_set),
                    None,
                )
            )

        metrics = {}

        # Compute top-k accuracy
        for k in top_k_values:
            hits = sum(
                1 for rank in first_hit_ranks if rank is not None and rank <= k
            )
            metrics[f'top_{k}_accuracy'] = hits / num_queries

        # Compute MRR
        reciprocal_sum = sum(1 / rank for rank in first_hit_ranks if rank is not None)
        metrics['mrr'] = reciprocal_sum / num_queries

        return metrics

class ExperimentRunner:
    """Evaluate the hybrid retriever on the dev set for every configured alpha."""

    def __init__(self, config: ExperimentConfig, data_loader: DataLoader):
        """Store the collaborators and build the dense and sparse retrievers.

        Args:
            config: experiment configuration (model and sweep settings).
            data_loader: source of the dev-set examples and gold documents.
        """
        self.config = config
        self.data_loader = data_loader
        self.evaluator = Evaluator(config)

        # Initialize models
        # NOTE(review): nothing visible here passes the Wikipedia corpus to
        # either retriever - confirm that indexing happens inside the
        # retriever classes or elsewhere before relying on the results.
        self.dense_retriever = DenseRetriever(
            model_name=config.model_config['dense']['model_name'],
            device=config.model_config['dense']['device']
        )
        self.sparse_retriever = SparseRetriever(
            k1=config.model_config['sparse']['k1'],
            b=config.model_config['sparse']['b']
        )

    def _get_dev_data(self) -> List[Dict]:
        """Return the dev-set examples, loading them if the loader has none."""
        # The loader may not expose a ``dev_data`` attribute; fall back to
        # loading from disk instead of failing with AttributeError.
        dev_data = getattr(self.data_loader, 'dev_data', None)
        if dev_data is None:
            dev_data = self.data_loader.load_hotpotqa_dev()
        return dev_data

    def run_experiment(self) -> pd.DataFrame:
        """Run experiments for different alpha values.

        For each alpha in ``experiment_config['alpha_values']`` a
        ``HybridRetriever`` is built from the shared dense and sparse
        retrievers, every dev question is retrieved with
        ``top_k = max(top_k_values)``, and the predictions are scored.

        Returns:
            A DataFrame with one row per alpha: the ``alpha`` column plus one
            column per metric returned by the evaluator.
        """
        dev_data = self._get_dev_data()
        # Loop-invariant values: the gold documents and the retrieval depth do
        # not depend on alpha, so compute them once.
        gold_docs = [self.data_loader.get_gold_docs(ex) for ex in dev_data]
        max_top_k = max(self.config.experiment_config['top_k_values'])

        results = []

        for alpha in tqdm(self.config.experiment_config['alpha_values']):
            # Initialize hybrid retriever
            hybrid_retriever = HybridRetriever(
                dense_retriever=self.dense_retriever,
                sparse_retriever=self.sparse_retriever,
                alpha=alpha
            )

            # Get predictions for dev set
            predictions = [
                hybrid_retriever.retrieve(example['question'], top_k=max_top_k)
                for example in tqdm(dev_data)
            ]

            # Compute metrics
            metrics = self.evaluator.compute_metrics(predictions, gold_docs)

            # Record results
            results.append({
                'alpha': alpha,
                **metrics
            })

        return pd.DataFrame(results)

# Initialize experiment runner
# Constructing the runner also builds the dense and sparse retrievers.
runner = ExperimentRunner(config, data_loader)

# Run experiments
# results_df holds one row per alpha value with the metric columns next to it.
print("Starting experiments...")
results_df = runner.run_experiment()
print("\nExperiment Results:")
# Print the full table without the DataFrame's row index.
print(results_df.to_string(index=False))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_metrics_vs_alpha(results_df: pd.DataFrame):
    """Draw one line per metric column against the ``alpha`` column.

    Every column of ``results_df`` other than ``alpha`` is treated as a
    metric and plotted on a shared axis; the figure is shown immediately.
    """
    plt.figure(figsize=(12, 6))

    alphas = results_df['alpha']
    for column in results_df.columns:
        if column == 'alpha':
            continue  # the x-axis itself is not a metric
        plt.plot(alphas, results_df[column], marker='o', label=column)

    plt.xlabel('Alpha (Dense Retriever Weight)')
    plt.ylabel('Score')
    plt.title('Retrieval Metrics vs Alpha')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_metric_heatmap(results_df: pd.DataFrame):
    """Show an annotated heatmap of the pairwise column correlations.

    The correlation matrix is taken over all columns of ``results_df``, so
    the ``alpha`` column is part of it alongside the metrics.
    """
    plt.figure(figsize=(8, 6))
    correlations = results_df.corr()
    heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'center': 0}
    sns.heatmap(correlations, **heatmap_options)
    plt.title('Metric Correlations')
    plt.show()

# Plot results
print("Plotting results...")
plot_metrics_vs_alpha(results_df)
plot_metric_heatmap(results_df)

# Find best configurations
# For every metric column, report the alpha of the row with the highest score.
print("\nBest configurations for each metric:")
# Every column except 'alpha' is treated as a metric.
metric_cols = [col for col in results_df.columns if col != 'alpha']
for metric in metric_cols:
    # idxmax gives the index label of the first row holding the maximum.
    best_idx = results_df[metric].idxmax()
    print(f"\nBest {metric}:")
    print(f"Alpha: {results_df.loc[best_idx, 'alpha']:.3f}")
    print(f"Score: {results_df.loc[best_idx, metric]:.3f}")