In [1]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [13]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from openai import OpenAI
from dotenv import load_dotenv
import os
import tiktoken
from typing import List, Dict

In [14]:
class TokenCounter:
    def __init__(self):
        self.encoding = tiktoken.get_encoding("cl100k_base")
        self.total_tokens = {
            "embedding": 0,
            "retrieval": 0
        }

    def count_tokens(self, text: str) -> int:
        return len(self.encoding.encode(text))

In [69]:
class DocumentProcessor:
    def __init__(self, chunk_size=1000, chunk_overlap=100):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        self.model = SentenceTransformer("BAAI/bge-small-en-v1.5")

    def create_context(self, chunk):
        prompt = f"""Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else.

        Chunk: {chunk}"""

        context_creator = ResponseGenerator()
        completion = context_creator.client.chat.completions.create(
            model="deepseek/deepseek-chat",
            messages=[{"role": "user", "content": prompt }]
        )
        return completion.choices[0].message.content

    def process_documents(self, dataset, num_docs=5):
        documents = []
        for i in range(num_docs):
            doc = dataset['train'][i]
            full_text = f"{doc['title']} {doc['text']}"
            chunks = self.text_splitter.split_text(full_text)

            for chunk in chunks:
                    context = self.create_context(chunk)
                    enriched_chunk = f"{context}; {chunk}"
                    documents.append(enriched_chunk)

        # Create embeddings and BM25 index
        embeddings = self.model.encode(documents, normalize_embeddings=True)
        tokenized_corpus = [word_tokenize(doc.lower()) for doc in documents]
        bm25 = BM25Okapi(tokenized_corpus)

        return documents, embeddings, bm25

In [70]:
class HybridRetriever:
    def __init__(self, documents, embeddings, bm25, model):
        self.documents = documents
        self.embeddings = embeddings
        self.bm25 = bm25
        self.model = model

        self.token_counter = TokenCounter()
        self.token_stats = {
            "total_query_tokens": 0,
            "total_retrieved_tokens": 0,
            "queries_processed": 0
        }

    def retrieve(self, query, top_k=4):
        query_tokens = self.token_counter.count_tokens(query)
        self.token_stats["total_query_tokens"] += query_tokens

        semantic_results = self._semantic_search(query, top_k)
        bm25_results = self._bm25_search(query, top_k)
        final_results = self._rank_fusion(semantic_results, bm25_results)

        retrieved_docs = [self.documents[doc_id] for doc_id, _ in final_results[:top_k]]
        retrieved_tokens = sum(self.token_counter.count_tokens(doc) for doc in retrieved_docs)
        self.token_stats["total_retrieved_tokens"] += retrieved_tokens

        self.token_stats["queries_processed"] += 1


        print(f"\nRetrieval Operation Stats (Query #{self.token_stats['queries_processed']}):")
        print(f"Query tokens: {query_tokens}")
        print(f"Retrieved document tokens: {retrieved_tokens}")
        print(f"Average tokens per retrieved document: {retrieved_tokens / len(retrieved_docs):.1f}")

        return retrieved_docs

    def get_token_stats(self):
        stats = self.token_stats.copy()
        if stats["queries_processed"] > 0:
            stats["average_query_tokens"] = stats["total_query_tokens"] / stats["queries_processed"]
            stats["average_retrieved_tokens"] = stats["total_retrieved_tokens"] / stats["queries_processed"]
        return stats

    def _semantic_search(self, query, top_k):
        query_embedding = self.model.encode(
            f"Represent this sentence for searching relevant passages: {query}",
            normalize_embeddings=True
        )
        similarities = query_embedding @ self.embeddings.T
        top_indices = np.argpartition(similarities, -top_k)[-top_k:]
        return [(idx, similarities[idx]) for idx in top_indices]

    def _bm25_search(self, query, top_k):
        tokenized_query = word_tokenize(query.lower())
        scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argpartition(scores, -top_k)[-top_k:]
        return [(idx, scores[idx]) for idx in top_indices]

    def _rank_fusion(self, semantic_results, bm25_results, k=60):
        scores = {}
        for results in [semantic_results, bm25_results]:
            for rank, (doc_id, score) in enumerate(sorted(results, key=lambda x: x[1], reverse=True)):
                scores[doc_id] = scores.get(doc_id, 0) + 1.0 / (rank + k)
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [68]:
class ResponseGenerator:
    def __init__(self):
        load_dotenv()
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=os.getenv("OPENROUTER_KEY"),
        )

    def generate(self, query, context):
        prompt = f"""Please provide a direct answer to the question using only the information from the provided context. If the information is not available in the context, please state that.
        Question: {query}
        Context: {context}
        Ensure your response is clear and concise. Do not suggest search queries or discuss how to search for information."""

        completion = self.client.chat.completions.create(
            model="deepseek/deepseek-chat",
            messages=[{"role": "user", "content": prompt}]
        )
        return completion.choices[0].message.content

In [30]:
# Load dataset
dataset = load_dataset("pszemraj/simple_wikipedia")
print("Dataset loaded")

Dataset loaded


In [61]:
# Process documents
processor = DocumentProcessor()
print("Processing documents")
documents, embeddings, bm25 = processor.process_documents(dataset)
print("Documents processed")

Processing documents
Documents processed


In [62]:
# Initialize retriever and generator
retriever = HybridRetriever(documents, embeddings, bm25, processor.model)
generator = ResponseGenerator()

In [63]:
# Queries
queries = [
# Basic Factual
"Who was Bob Steele and what roles was he known for?",
"What was Pope Shenouda III's role and when did he serve?",
"When was salt first formed in Cheshire?",
"What is Vitória F.C. and where is it located?",

# Multi-hop
"How did Bob Steele's acting career evolve from movies to television?",
"What was the connection between Pope Shenouda III and Pope Paul VI?",
"How did geological processes lead to salt formation in Cheshire?",

# Time-Based
"What happened to salt deposits in Cheshire during the Triassic period?",
"What were the key events in Bob Steele's life between 1931 and 1939?",
"How long did Pope Shenouda III serve as Pope of Alexandria?",

# Edge Cases
"What sports teams existed in Portugal in 1910?",
"What was happening in Cheshire in 1915?",
"Who was married to Virginia Nash Tatem?",

# Complex Reasoning
"How did different historical periods (Romans, Anglo Saxons, Normans) affect salt manufacturing in Cheshire?",
"What relationship existed between Pope Shenouda III's health conditions and his death?",

# Document Coverage Test
"What do these documents tell us about the entertainment industry in the 20th century?",
"How do these documents represent different aspects of European history?",
"What information about religious leadership is contained in these sources?",

# Missing Information Queries
"What was Bob Steele's childhood like in Hollywood?",
"What were Pope Shenouda III's major theological writings?",
"What is the current status of Vitória F.C.?",

# Cross-Document Connections
"How do these documents represent different institutional structures (religious, entertainment, sports, industry)?",
"What geographical connections exist between these documents?",
"What timeline can we construct from all events mentioned in these documents?"
]

# Process queries and display results
results = []
for query in queries:
    retrieved_docs = retriever.retrieve(query)
    context = "\n\n".join(retrieved_docs)
    response = generator.generate(query, context)
    results.append({'Query': query, 'Response': response})


Retrieval Operation Stats (Query #1):
Query tokens: 12
Retrieved document tokens: 700
Average tokens per retrieved document: 175.0

Retrieval Operation Stats (Query #2):
Query tokens: 15
Retrieved document tokens: 768
Average tokens per retrieved document: 192.0

Retrieval Operation Stats (Query #3):
Query tokens: 9
Retrieved document tokens: 546
Average tokens per retrieved document: 136.5

Retrieval Operation Stats (Query #4):
Query tokens: 13
Retrieved document tokens: 770
Average tokens per retrieved document: 192.5

Retrieval Operation Stats (Query #5):
Query tokens: 13
Retrieved document tokens: 697
Average tokens per retrieved document: 174.2

Retrieval Operation Stats (Query #6):
Query tokens: 15
Retrieved document tokens: 768
Average tokens per retrieved document: 192.0

Retrieval Operation Stats (Query #7):
Query tokens: 12
Retrieved document tokens: 697
Average tokens per retrieved document: 174.2

Retrieval Operation Stats (Query #8):
Query tokens: 14
Retrieved document to

In [64]:
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Query,Response
0,Who was Bob Steele and what roles was he known...,Bob Steele was an American actor known for his...
1,What was Pope Shenouda III's role and when did...,Pope Shenouda III served as the 117th Pope of ...
2,When was salt first formed in Cheshire?,Salt in Cheshire was first formed about 220 mi...
3,What is Vitória F.C. and where is it located?,Vitória F.C. is a Portuguese sports club locat...
4,How did Bob Steele's acting career evolve from...,The context does not provide specific details ...
5,What was the connection between Pope Shenouda ...,The connection between Pope Shenouda III and P...
6,How did geological processes lead to salt form...,Geological processes led to salt formation in ...
7,What happened to salt deposits in Cheshire dur...,"During the Triassic period, water from the oce..."
8,What were the key events in Bob Steele's life ...,The key events in Bob Steele's life between 19...
9,How long did Pope Shenouda III serve as Pope o...,Pope Shenouda III served as Pope of Alexandria...


In [65]:
for q, a in zip(results_df["Query"], results_df["Response"]):
  print(f"Query: {q} \nResponse:{a}")
  print("\n-------------------------------\n")

Query: Who was Bob Steele and what roles was he known for? 
Response:Bob Steele was an American actor known for his roles in films such as *Carson City Kid*, *Island in the Sky*, *Rio Bravo*, *Hang 'Em High*, *Rio Lobo*, and in the television sitcom *F Troop*.

-------------------------------

Query: What was Pope Shenouda III's role and when did he serve? 
Response:Pope Shenouda III served as the 117th Pope of Alexandria and Patriarch of the See of St. Mark. His papacy lasted from 14 November 1971 until his death on 17 March 2012.

-------------------------------

Query: When was salt first formed in Cheshire? 
Response:Salt in Cheshire was first formed about 220 million years ago during the Triassic period.

-------------------------------

Query: What is Vitória F.C. and where is it located? 
Response:Vitória F.C. is a Portuguese sports club located in the city of Setúbal.

-------------------------------

Query: How did Bob Steele's acting career evolve from movies to television? 


In [66]:
# Get final token statistics
stats = retriever.get_token_stats()
print(f"\nFinal Token Statistics:")
print(f"Total queries processed: {stats['queries_processed']}")
print(f"Total query tokens: {stats['total_query_tokens']}")
print(f"Total retrieved tokens: {stats['total_retrieved_tokens']}")
print(f"Average tokens per query: {stats['average_query_tokens']:.1f}")
print(f"Average tokens per retrieval: {stats['average_retrieved_tokens']:.1f}")


Final Token Statistics:
Total queries processed: 24
Total query tokens: 316
Total retrieved tokens: 17491
Average tokens per query: 13.2
Average tokens per retrieval: 728.8


In [57]:
# Evaluation Scores for Contextual Model Responses
contextual_evaluation_scores = {
    # Simple Factual Queries
    "Who was Bob Steele?": {
        "score": 3,
        "notes": "Perfect accuracy, includes all key roles"
    },
    "What was Pope Shenouda III's role?": {
        "score": 3,
        "notes": "Accurate role and time period"
    },
    "When was salt first formed in Cheshire?": {
        "score": 3,
        "notes": "Correct time period and geological era"
    },
    "What is Vitória F.C.?": {
        "score": 3,
        "notes": "Accurate location and establishment date"
    },

    # Multi-hop Queries
    "How did Bob Steele's acting career evolve?": {
        "score": 3,
        "notes": "Appropriate acknowledgment of limited information"
    },
    "What was the connection between Pope Shenouda III and Pope Paul VI?": {
        "score": 3,
        "notes": "Accurate mention of common declaration, notes limits"
    },
    "How did geological processes lead to salt formation?": {
        "score": 3,
        "notes": "Excellent explanation of process"
    },

    # Time-Based Queries
    "What happened to salt deposits?": {
        "score": 3,
        "notes": "Clear explanation of process"
    },
    "What were key events in Bob Steele's life 1931-1939?": {
        "score": 3,
        "notes": "Comprehensive list of marriages/divorces"
    },
    "How long did Pope Shenouda III serve?": {
        "score": 3,
        "notes": "Precise duration and dates"
    },

    # Edge Cases
    "What sports teams existed in Portugal in 1910?": {
        "score": 3,
        "notes": "Correctly identifies only mentioned team"
    },
    "What was happening in Cheshire in 1915?": {
        "score": 3,
        "notes": "Appropriate handling of limited information"
    },
    "Who was married to Virginia Nash Tatem?": {
        "score": 3,
        "notes": "Simple, accurate answer"
    },

    # Complex Reasoning
    "How did different historical periods affect salt manufacturing?": {
        "score": 3,
        "notes": "Appropriate acknowledgment of limited information"
    },
    "What relationship existed between health conditions and death?": {
        "score": 3,
        "notes": "Accurate cause of death and connection"
    },

    # Document Analysis
    "What do these documents tell us about entertainment industry?": {
        "score": 2,
        "notes": "Missed potential relevance of Bob Steele's career"
    },
    "How do documents represent European history?": {
        "score": 3,
        "notes": "Comprehensive coverage of all historical aspects"
    },
    "What information about religious leadership?": {
        "score": 3,
        "notes": "Complete information about Pope Shenouda III"
    },

    # Missing Information Queries
    "What was Bob Steele's childhood like?": {
        "score": 3,
        "notes": "Appropriate acknowledgment of missing information"
    },
    "What were Pope Shenouda III's theological writings?": {
        "score": 3,
        "notes": "Clear statement about lack of information"
    },
    "What is the current status of Vitória F.C.?": {
        "score": 3,
        "notes": "Appropriate acknowledgment of missing information"
    },

    # Cross-Document Analysis
    "How do documents represent different institutional structures?": {
        "score": 3,
        "notes": "Good categorization of institutions"
    },
    "What geographical connections exist?": {
        "score": 2,
        "notes": "Could have included Norway/Scotland connection"
    },
    "What timeline can we construct?": {
        "score": 3,
        "notes": "Comprehensive timeline from all documents"
    }
}

# Calculate average score
contextual_average_score = sum(item["score"] for item in contextual_evaluation_scores.values()) / len(contextual_evaluation_scores)
print(f"Average score: {contextual_average_score:.2f}/3.00")

# Count scores by category
conextual_score_distribution = {
    3: sum(1 for item in contextual_evaluation_scores.values() if item["score"] == 3),
    2: sum(1 for item in contextual_evaluation_scores.values() if item["score"] == 2),
    1: sum(1 for item in contextual_evaluation_scores.values() if item["score"] == 1),
    0: sum(1 for item in contextual_evaluation_scores.values() if item["score"] == 0)
}
print("\nScore distribution:")
for score, count in conextual_score_distribution.items():
    print(f"Score {score}: {count} responses")

Average score: 2.92/3.00

Score distribution:
Score 3: 22 responses
Score 2: 2 responses
Score 1: 0 responses
Score 0: 0 responses


In [67]:
# Evaluation Scores for Model Responses (Without Pre-embedding Context)
evaluation_scores_no_preembed = {
    # Simple Factual Queries
    "Who was Bob Steele?": {
        "score": 2,  # Less complete than previous response, missing "Western films" specification
        "notes": "Missing genre specification, otherwise accurate"
    },
    "What was Pope Shenouda III's role?": {
        "score": 3,
        "notes": "Matches first response in accuracy and completeness"
    },
    "When was salt first formed in Cheshire?": {
        "score": 3,
        "notes": "Identical to first response"
    },
    "What is Vitória F.C.?": {
        "score": 2,  # Missing establishment date, less specific
        "notes": "Missing founding date and some details"
    },

    # Multi-hop Queries
    "How did Bob Steele's acting career evolve?": {
        "score": 3,
        "notes": "Appropriate acknowledgment of limited information"
    },
    "What was the connection between Popes?": {
        "score": 2,  # More concise but less contextual
        "notes": "More direct but loses some context about limited information"
    },
    "How did geological processes lead to salt formation?": {
        "score": 3,
        "notes": "Clear explanation of process"
    },

    # Time-Based Queries
    "What happened to salt deposits?": {
        "score": 3,
        "notes": "Clear and accurate explanation"
    },
    "What were key events in Bob Steele's life 1931-1939?": {
        "score": 3,
        "notes": "Complete timeline of marriages"
    },
    "How long did Pope Shenouda III serve?": {
        "score": 3,
        "notes": "Precise duration given"
    },

    # Edge Cases & Complex Reasoning
    "What sports teams existed in Portugal in 1910?": {
        "score": 3,
        "notes": "Appropriate scope and limitation noted"
    },
    "What was happening in Cheshire in 1915?": {
        "score": 3,
        "notes": "Good handling of limited information"
    },
    "Who was married to Virginia Nash Tatem?": {
        "score": 3,
        "notes": "Simple, accurate answer"
    },

    # Document Analysis
    "What do these documents tell us about entertainment industry?": {
        "score": 0,  # Incorrect assessment
        "notes": "Failed to recognize Bob Steele's career information"
    },
    "How do documents represent European history?": {
        "score": 3,
        "notes": "Comprehensive coverage of historical aspects"
    },
    "What information about religious leadership?": {
        "score": 3,
        "notes": "Complete information with good detail"
    },

    # Missing Information Queries
    "What was Bob Steele's childhood like?": {
        "score": 3,
        "notes": "Appropriate acknowledgment of missing information"
    },
    "What were Pope Shenouda III's theological writings?": {
        "score": 3,
        "notes": "Clear about lack of information"
    },
    "What is the current status of Vitória F.C.?": {
        "score": 3,
        "notes": "Appropriate acknowledgment of missing information"
    },

    # Cross-Document Analysis
    "How do documents represent different institutional structures?": {
        "score": 3,
        "notes": "Better categorization than first version"
    },
    "What geographical connections exist?": {
        "score": 3,
        "notes": "More comprehensive geographical connections"
    },
    "What timeline can we construct?": {
        "score": 3,
        "notes": "Excellent chronological organization"
    }
}

# Calculate average score
average_score = sum(item["score"] for item in evaluation_scores_no_preembed.values()) / len(evaluation_scores_no_preembed)
print(f"Average score: {average_score:.2f}/3.00")

# Count scores by category
score_distribution = {
    3: sum(1 for item in evaluation_scores_no_preembed.values() if item["score"] == 3),
    2: sum(1 for item in evaluation_scores_no_preembed.values() if item["score"] == 2),
    1: sum(1 for item in evaluation_scores_no_preembed.values() if item["score"] == 1),
    0: sum(1 for item in evaluation_scores_no_preembed.values() if item["score"] == 0)
}
print("\nScore distribution:")
for score, count in score_distribution.items():
    print(f"Score {score}: {count} responses")

Average score: 2.73/3.00

Score distribution:
Score 3: 18 responses
Score 2: 3 responses
Score 1: 0 responses
Score 0: 1 responses
