In [1]:
import requests
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re

In [2]:
def fetch_and_clean_wikipedia(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        # Simple text extraction (you might want to use BeautifulSoup for better parsing)
        text = response.text
        
        # Extract content between <p> tags (basic approach)
        paragraphs = re.findall(r'<p[^>]*>(.*?)</p>', text, re.DOTALL)
        
        # Clean HTML tags and get meaningful content
        clean_paragraphs = []
        for p in paragraphs:
            # Remove HTML tags
            clean_text = re.sub(r'<[^>]+>', '', p)
            # Remove citations like [1], [2], etc.
            clean_text = re.sub(r'\[\d+\]', '', clean_text)
            # Remove extra whitespace
            clean_text = ' '.join(clean_text.split())
            
            # Only keep paragraphs with substantial content
            if len(clean_text) > 50:
                clean_paragraphs.append(clean_text)
        
        return clean_paragraphs[:20]  # Limit to first 20 paragraphs for efficiency
    except Exception as e:
        print(f"Error fetching Wikipedia: {e}")

In [3]:
def chunk_text(paragraphs, max_length=200):
    chunks = []
    for paragraph in paragraphs:
        words = paragraph.split()
        for i in range(0, len(words), max_length):
            chunk = ' '.join(words[i:i + max_length])
            if len(chunk.strip()) > 20:  # Only keep meaningful chunks
                chunks.append(chunk.strip())
    return chunks

In [5]:
class BiEncoderRetriever:
    def __init__(self, model_name="all-mpnet-base-v2"):
        print(f"Loading bi-encoder: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.chunks = []
        self.chunk_embeddings = None
    
    def index_documents(self, chunks):
        self.chunks = chunks
        print(f"Creating embeddings for {len(chunks)} chunks...")
        self.chunk_embeddings = self.model.encode(chunks)
        print("Bi-encoder indexing complete!")
    
    def search(self, query, top_k=5):
        query_embedding = self.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.chunk_embeddings)[0]
        
        # Get top-k results
        top_indices = np.argsort(similarities)[::-1][:top_k]
        results = []
        
        for idx in top_indices:
            results.append({
                'text': self.chunks[idx],
                'score': similarities[idx],
                'rank': len(results) + 1
            })
        
        return results

In [6]:
class CrossEncoderRetriever:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        print(f"Loading cross-encoder: {model_name}")
        # Note: jina-colbert-v2 might not be available, using a common alternative
        try:
            self.model = CrossEncoder(model_name)
        except:
            # Fallback to a commonly available cross-encoder
            print("Falling back to ms-marco cross-encoder")
            self.model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        
        self.chunks = []
    
    def index_documents(self, chunks):
        self.chunks = chunks
        print(f"Cross-encoder ready with {len(chunks)} chunks!")
    
    def search(self, query, top_k=5):
        print(f"Cross-encoder processing {len(self.chunks)} pairs...")
        
        # Create query-document pairs
        pairs = [[query, chunk] for chunk in self.chunks]
        
        # Score all pairs
        scores = self.model.predict(pairs)
        
        # Get top-k results
        scored_chunks = list(zip(self.chunks, scores))
        scored_chunks.sort(key=lambda x: x[1], reverse=True)
        
        results = []
        for i, (chunk, score) in enumerate(scored_chunks[:top_k]):
            results.append({
                'text': chunk,
                'score': float(score),
                'rank': i + 1
            })
        
        return results

In [7]:
def compare_results(bi_results, cross_results, query):
    print(f"\n{'='*80}")
    print(f"QUERY: {query}")
    print(f"{'='*80}")
    
    print(f"\n🔍 BI-ENCODER RESULTS:")
    print("-" * 50)
    for result in bi_results:
        print(f"Rank {result['rank']} (Score: {result['score']:.4f})")
        print(f"Text: {result['text'][:150]}...")
        print()
    
    print(f"\n🎯 CROSS-ENCODER RESULTS:")
    print("-" * 50)
    for result in cross_results:
        print(f"Rank {result['rank']} (Score: {result['score']:.4f})")
        print(f"Text: {result['text'][:150]}...")
        print()


In [8]:
def main():
    # Fetch iPhone Wikipedia content
    print("Fetching iPhone Wikipedia page...")
    url = "https://en.wikipedia.org/wiki/IPhone"
    paragraphs = fetch_and_clean_wikipedia(url)
    chunks = chunk_text(paragraphs)
    
    print(f"Processed {len(chunks)} text chunks from Wikipedia")
    
    # Initialize both retrievers
    bi_encoder = BiEncoderRetriever()
    cross_encoder = CrossEncoderRetriever()
    
    # Index documents
    bi_encoder.index_documents(chunks)
    cross_encoder.index_documents(chunks)
    
    # Test queries - different types to see where cross-encoders excel
    test_queries = [
        # Factual queries (bi-encoders usually good)
        "What year was the iPhone announced?",
        "Who designed the iPhone?",
        
        # Semantic/conceptual queries (cross-encoders often better)
        "How did the iPhone change mobile phones?",
        "What makes iPhone cameras special?",
        
        # Complex/nuanced queries (cross-encoders typically excel)
        "Why is the iPhone considered revolutionary?",
        "What are the main advantages of iPhone over other phones?",
        
        # Specific technical queries
        "What storage options are available?",
        "How does Face ID work?",
    ]
    
    # Compare results for each query
    for query in test_queries:
        bi_results = bi_encoder.search(query, top_k=3)
        cross_results = cross_encoder.search(query, top_k=3)
        compare_results(bi_results, cross_results, query)
        
        # Brief analysis
        print(f"\n💡 ANALYSIS:")
        print(f"Bi-encoder top score: {bi_results[0]['score']:.4f}")
        print(f"Cross-encoder top score: {cross_results[0]['score']:.4f}")
        
        if cross_results[0]['score'] > bi_results[0]['score'] * 1.2:  # 20% better
            print("✅ Cross-encoder shows significant improvement!")
        elif bi_results[0]['score'] > cross_results[0]['score'] * 1.1:  # 10% better
            print("✅ Bi-encoder performs better here!")
        else:
            print("➖ Results are similar")
        
        input("\nPress Enter to continue to next query...")

In [9]:
if __name__ == "__main__":
    print("🚀 Starting Bi-encoder vs Cross-encoder Comparison")
    print("📱 Using iPhone Wikipedia page as test data")
    print("-" * 60)
    
    main()

🚀 Starting Bi-encoder vs Cross-encoder Comparison
📱 Using iPhone Wikipedia page as test data
------------------------------------------------------------
Fetching iPhone Wikipedia page...
Processed 20 text chunks from Wikipedia
Loading bi-encoder: all-mpnet-base-v2
Loading cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Creating embeddings for 20 chunks...
Bi-encoder indexing complete!
Cross-encoder ready with 20 chunks!
Cross-encoder processing 20 pairs...

QUERY: What year was the iPhone announced?

🔍 BI-ENCODER RESULTS:
--------------------------------------------------
Rank 1 (Score: 0.7056)
Text: The iPhone is a line of smartphones developed and marketed by Apple that run iOS, the company's own mobile operating system. The first-generation iPho...

Rank 2 (Score: 0.6955)
Text: Jobs unveiled the first-generation iPhone to the public on January 9, 2007, at the Macworld 2007 convention at the Moscone Center in San Francisco.&#9...

Rank 3 (Score: 0.6805)
Text: Development of an Apple smartphone began in 2004, when the company started to gather a team of 1,000 employees led by hardware engineer Tony Fadell, s...


🎯 CROSS-ENCODER RESULTS:
--------------------------------------------------
Rank 1 (Score: 8.0910)
Text: The iPhone is a line of smartphones developed and marketed by Apple that run iOS, th