In [1]:
# Fetch the NLTK 'punkt' tokenizer data before anything tokenizes text.
# NOTE(review): presumably this is what the later `word_tokenize` calls rely on;
# confirm the resource name required by the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/adrian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [37]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from openai import OpenAI
from dotenv import load_dotenv
import os
import tiktoken
from typing import List, Dict

In [None]:
class TokenCounter:
    """Counts tokens in text using the tiktoken ``cl100k_base`` encoding.

    Instance attributes (set in ``__init__``):
        encoding: the tiktoken encoding object used by ``count_tokens``.
        total_tokens: dict with "embedding" and "retrieval" counters, both
            initialised to zero.
    """

    def __init__(self):
        # Per-category counters; both start at zero.
        self.total_tokens = dict(embedding=0, retrieval=0)
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return how many tokens ``text`` encodes to."""
        token_ids = self.encoding.encode(text)
        return len(token_ids)

In [28]:
class DocumentProcessor:
    """Splits dataset articles into text chunks and indexes them for hybrid search."""

    def __init__(self, chunk_size=500, chunk_overlap=50):
        """Create the text splitter and load the embedding model.

        Args:
            chunk_size: ``chunk_size`` forwarded to the text splitter.
            chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        self.model = SentenceTransformer("BAAI/bge-small-en-v1.5")

    def process_documents(self, dataset, num_docs=10):
        """Chunk the first ``num_docs`` training articles and build both indexes.

        Args:
            dataset: Mapping with a ``'train'`` split whose items expose
                ``'title'`` and ``'text'`` fields.
            num_docs: Number of articles to take from the start of the split.
                Values larger than the split are clamped to its length.

        Returns:
            A ``(documents, embeddings, bm25)`` tuple: the list of text chunks,
            their normalised embeddings, and a ``BM25Okapi`` index built over
            the lower-cased, word-tokenized chunks.

        Raises:
            ValueError: If no chunks were produced (empty split or
                ``num_docs <= 0``), since neither index can be built.
        """
        train_split = dataset['train']
        # Clamp so requesting more articles than exist does not raise IndexError.
        doc_count = max(0, min(num_docs, len(train_split)))

        documents = []
        for i in range(doc_count):
            doc = train_split[i]
            full_text = f"Title: {doc['title']}\n\nContent: {doc['text']}"
            documents.extend(self.text_splitter.split_text(full_text))

        if not documents:
            # Fail here with a clear message instead of deep inside BM25 construction.
            raise ValueError("No document chunks were produced; cannot build retrieval indexes.")

        # Create embeddings and BM25 index
        embeddings = self.model.encode(documents, normalize_embeddings=True)
        tokenized_corpus = [word_tokenize(doc.lower()) for doc in documents]
        bm25 = BM25Okapi(tokenized_corpus)

        return documents, embeddings, bm25

In [47]:
class HybridRetriever:
    """Retrieves chunks by fusing dense-embedding and BM25 rankings.

    Each query is scored against the chunk embeddings and against the BM25
    index; the two top-k lists are merged with reciprocal rank fusion.
    Token usage per query is tracked in ``token_stats``.
    """

    def __init__(self, documents, embeddings, bm25, model):
        """Store the corpus, its indexes and the query encoder.

        Args:
            documents: Sequence of text chunks, indexable by position.
            embeddings: Array of chunk embeddings; row ``i`` is assumed to
                belong to ``documents[i]``.
            bm25: BM25 index exposing ``get_scores(tokenized_query)``.
            model: Encoder exposing ``encode(text, normalize_embeddings=...)``.
        """
        self.documents = documents
        self.embeddings = embeddings
        self.bm25 = bm25
        self.model = model

        self.token_counter = TokenCounter()
        self.token_stats = {
            "total_query_tokens": 0,
            "total_retrieved_tokens": 0,
            "queries_processed": 0
        }

    def retrieve(self, query, top_k=4):
        """Return up to ``top_k`` chunks for ``query`` and print token statistics.

        Args:
            query: The search string.
            top_k: Maximum number of chunks to return; also the number of
                candidates taken from each individual ranking.

        Returns:
            List of chunk strings ordered by fused score, best first.
        """
        query_tokens = self.token_counter.count_tokens(query)
        self.token_stats["total_query_tokens"] += query_tokens

        semantic_results = self._semantic_search(query, top_k)
        bm25_results = self._bm25_search(query, top_k)
        final_results = self._rank_fusion(semantic_results, bm25_results)

        retrieved_docs = [self.documents[doc_id] for doc_id, _ in final_results[:top_k]]
        retrieved_tokens = sum(self.token_counter.count_tokens(doc) for doc in retrieved_docs)
        self.token_stats["total_retrieved_tokens"] += retrieved_tokens

        self.token_stats["queries_processed"] += 1

        # Avoid ZeroDivisionError when nothing was retrieved (e.g. top_k == 0).
        average_tokens = retrieved_tokens / len(retrieved_docs) if retrieved_docs else 0.0

        print(f"\nRetrieval Operation Stats (Query #{self.token_stats['queries_processed']}):")
        print(f"Query tokens: {query_tokens}")
        print(f"Retrieved document tokens: {retrieved_tokens}")
        print(f"Average tokens per retrieved document: {average_tokens:.1f}")

        return retrieved_docs

    def get_token_stats(self):
        """Return a copy of the running totals plus per-query averages.

        The ``average_query_tokens`` and ``average_retrieved_tokens`` keys are
        always present; they are ``0.0`` until at least one query has run.
        """
        stats = self.token_stats.copy()
        processed = stats["queries_processed"]
        if processed > 0:
            stats["average_query_tokens"] = stats["total_query_tokens"] / processed
            stats["average_retrieved_tokens"] = stats["total_retrieved_tokens"] / processed
        else:
            stats["average_query_tokens"] = 0.0
            stats["average_retrieved_tokens"] = 0.0
        return stats

    @staticmethod
    def _top_k_indices(scores, top_k):
        """Return the (unordered) indices of the ``top_k`` largest scores.

        ``top_k`` is clamped to ``len(scores)`` because ``np.argpartition``
        raises when asked for more elements than exist, and a non-positive
        ``top_k`` yields no indices instead of the whole array.
        """
        k = min(top_k, len(scores))
        if k <= 0:
            return []
        return [int(idx) for idx in np.argpartition(scores, -k)[-k:]]

    def _semantic_search(self, query, top_k):
        """Return ``(doc_id, similarity)`` pairs for the best embedding matches."""
        query_embedding = self.model.encode(
            f"Represent this sentence for searching relevant passages: {query}",
            normalize_embeddings=True
        )
        # Dot product of the query embedding with every chunk embedding.
        similarities = query_embedding @ self.embeddings.T
        return [(idx, similarities[idx]) for idx in self._top_k_indices(similarities, top_k)]

    def _bm25_search(self, query, top_k):
        """Return ``(doc_id, bm25_score)`` pairs for the best lexical matches."""
        tokenized_query = word_tokenize(query.lower())
        scores = self.bm25.get_scores(tokenized_query)
        return [(idx, scores[idx]) for idx in self._top_k_indices(scores, top_k)]

    def _rank_fusion(self, semantic_results, bm25_results, k=60):
        """Merge two result lists with reciprocal rank fusion.

        Each list is sorted by its own score (descending); a document at
        zero-based position ``rank`` contributes ``1 / (rank + k)``.

        Returns:
            List of ``(doc_id, fused_score)`` sorted by fused score, best first.
        """
        scores = {}
        for results in [semantic_results, bm25_results]:
            ranked = sorted(results, key=lambda x: x[1], reverse=True)
            for rank, (doc_id, _score) in enumerate(ranked):
                scores[doc_id] = scores.get(doc_id, 0) + 1.0 / (rank + k)
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [45]:
class ResponseGenerator:
    """Generates answers from retrieved context through the OpenRouter chat API."""

    def __init__(self, model="deepseek/deepseek-chat"):
        """Load environment variables and create the API client.

        Args:
            model: Model identifier sent with every chat completion request.

        Raises:
            ValueError: If the ``OPENROUTER_KEY`` environment variable is
                missing or empty.
        """
        load_dotenv()
        api_key = os.getenv("OPENROUTER_KEY")
        if not api_key:
            # Passing api_key=None lets the OpenAI client fall back to the
            # OPENAI_API_KEY environment variable (per the openai-python docs),
            # which would send an unrelated credential to OpenRouter.
            raise ValueError("OPENROUTER_KEY is not set; add it to the environment or a .env file.")

        self.model = model
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

    def generate(self, query, context):
        """Ask the model to answer ``query`` using only ``context``.

        Args:
            query: The user question.
            context: Retrieved text the answer must be based on.

        Returns:
            The content of the first choice in the completion response.
        """
        prompt = f"""Please provide a direct answer to the question using only the information from the provided context. If the information is not available in the context, please state that.
        Question: {query}
        Context: {context}
        Ensure your response is clear and concise. Do not suggest search queries or discuss how to search for information."""

        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}]
        )
        return completion.choices[0].message.content

In [7]:
# Load the dataset that the document processor reads its articles from
DATASET_NAME = "pszemraj/simple_wikipedia"
dataset = load_dataset(DATASET_NAME)
print("Dataset loaded")

Dataset loaded
Processing documents
Documents processed


Unnamed: 0,Query,Response
0,What is the significance of Baghdad Internatio...,The significance of Baghdad International Airp...
1,How does natural selection explain the develop...,Natural selection explains the development of ...
2,What are some key milestones in Halsey's music...,Key milestones in Halsey's music career includ...


In [32]:
# Process documents
processor = DocumentProcessor()
print("Processing documents")
# Keep the returned chunks, embeddings and BM25 index: the retriever is built
# from `documents`, `embeddings` and `bm25`, so discarding the return value
# leaves those names undefined on a fresh kernel.
documents, embeddings, bm25 = processor.process_documents(dataset)
print("Documents processed")

Processing documents
Documents processed


In [48]:
# Build the hybrid retriever over the processed corpus (re-using the processor's
# embedding model for query encoding) and the answer generator
retriever = HybridRetriever(
    documents=documents,
    embeddings=embeddings,
    bm25=bm25,
    model=processor.model,
)
generator = ResponseGenerator()

NameError: name 'TokenCounter' is not defined

In [41]:
# Queries
# Evaluation questions sent through the retrieve-then-generate loop.
# Only the "Basic Factual" group is active; every other category below is
# commented out and can be re-enabled by uncommenting its entries.
queries = [
    # Basic Factual
    "Who was Albert Einstein and what is he known for?",
    "What is the capital of France and what landmarks is it famous for?",
    "When did World War II begin and end?",
    "What is photosynthesis and how does it work?",
    "Who wrote Romeo and Juliet and what is the play about?",
    
    # Multi-hop
    # "How did the Industrial Revolution affect both urban development and working conditions?",
    # "What connection exists between the Renaissance and the Scientific Revolution?",
    # "How did ancient Greek democracy influence modern governmental systems?",
    
    # # Comparative
    # "What are the main differences between DNA and RNA?",
    # "How do classical and quantum physics differ?",
    # "Compare the American and French Revolutions: what were their causes and outcomes?",
    
    # # Time-Based
    # "What major events occurred during the 1960s Space Race?",
    # "How did transportation evolve from the 19th to the 20th century?",
    # "What were the key developments in computer technology during the 1990s?",
    
    # # Scientific
    # "How does gravity affect planetary motion?",
    # "What role do enzymes play in digestion?",
    # "How does climate change affect global weather patterns?",
    
    # # Historical
    # "What caused the fall of the Roman Empire?",
    # "How did the Black Death impact medieval Europe?",
    # "What were the major achievements of ancient Egypt?",
    
    # # Edge Cases
    # "What is the Revolution?", 
    # "What happened in Azerbaijan in 1832?",  
    # "What are the outcomes of the 2024 Olympics?",
    # "How did mathematics influence both art and music?",
    
    # # Lists and Specifics
    # "What are the main types of renewable energy?",
    # "How many planets are in our solar system and what are their sizes?",
    # "When were the major pyramids of Egypt built?",
    # "Who were the key leaders during World War II?",
    
    # # Complex Reasoning
    # "How did various inventions during the Industrial Revolution connect to create modern factories?",
    # "What relationship exists between climate, geography, and the development of ancient civilizations?",
    # "How did different philosophical movements influence political changes throughout history?"
]
    
# Answer every query: retrieve supporting chunks, join them into one context
# string, generate a response, and record the query/response pair
results = []
for query in queries:
    retrieved_docs = retriever.retrieve(query)
    context = "\n\n".join(retrieved_docs)
    response = generator.generate(query, context)
    results.append(dict(Query=query, Response=response))

In [49]:
# Tabulate the collected query/response records and render them
results_df = pd.DataFrame(data=results)
display(results_df)

Unnamed: 0,Query,Response
0,Who was Albert Einstein and what is he known for?,The provided context does not mention Albert E...
1,What is the capital of France and what landmar...,The capital of France is Paris. Paris is famou...
2,When did World War II begin and end?,The provided context does not explicitly state...
3,What is photosynthesis and how does it work?,The context provided does not contain any info...
4,Who wrote Romeo and Juliet and what is the pla...,The context does not provide information about...


In [50]:
# Get final token statistics
stats = retriever.get_token_stats()
print("\nFinal Token Statistics:")
print(f"Total queries processed: {stats['queries_processed']}")
print(f"Total query tokens: {stats['total_query_tokens']}")
print(f"Total retrieved tokens: {stats['total_retrieved_tokens']}")
# The average keys may be absent when no query has been processed yet;
# default to 0.0 so this cell does not raise KeyError in that case.
print(f"Average tokens per query: {stats.get('average_query_tokens', 0.0):.1f}")
print(f"Average tokens per retrieval: {stats.get('average_retrieved_tokens', 0.0):.1f}")

AttributeError: 'HybridRetriever' object has no attribute 'get_token_stats'