In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import json
import os
from pathlib import Path

# Path to chunked data - Load at the beginning
chunked_data_path = Path("../../data-ingestion/processed/chunked/hybrid/")

# Load a few chunked documents
def load_chunked_documents(path, max_files=5):
    """Load chunked documents from JSON files"""
    documents = []
    json_files = list(path.glob("*.json"))[:max_files]
    
    for json_file in json_files:
        with open(json_file, 'r') as f:
            chunks = json.load(f)
            for chunk in chunks[:3]:  # Take first 3 chunks from each file
                documents.append({
                    'content': chunk['content'],
                    'metadata': chunk['metadata']
                })
    
    return documents

# Load sample documents at the start
chunked_docs = load_chunked_documents(chunked_data_path, max_files=3)
print(f"Loaded {len(chunked_docs)} chunks from ArXiv papers")
print(f"\ Sample Chunk:")
print(f"   Title: {chunked_docs[0]['metadata']['title']}")
print(f"   Section: {chunked_docs[0]['metadata']['section']}")
print(f"   Content preview: {chunked_docs[0]['content'][:200]}...")

Loaded 9 chunks from ArXiv papers
\ Sample Chunk:
   Title: Agentic Test-Time Scaling for WebAgents
   Section: Abstract
   Content preview: Test-time scaling has become a standard way
to improve performance and boost reliability of
neural network models. However, its behavior
on agentic, multi-step tasks remains less well-
understood: sma...


  print(f"\ Sample Chunk:")


In [3]:
# Show examples of the data we're working with
print("Data Overview:\n")
print(f"Total chunks loaded: {len(chunked_docs)}")
print(f"Chunking strategy: {chunked_docs[0]['metadata']['chunk_type']}")
print(f"\nSample Papers:")

unique_papers = {}
for doc in chunked_docs:
    paper_id = doc['metadata']['arxiv_id']
    if paper_id not in unique_papers:
        unique_papers[paper_id] = doc['metadata']['title']

for i, (arxiv_id, title) in enumerate(list(unique_papers.items())[:3], 1):
    print(f"   {i}. [{arxiv_id}] {title[:70]}...")

print(f"\nThese chunks will be used in all examples below!")

Data Overview:

Total chunks loaded: 9
Chunking strategy: hybrid

Sample Papers:
   1. [2602.12276v1] Agentic Test-Time Scaling for WebAgents...
   2. [2602.12251v1] A technical curriculum on language-oriented artificial intelligence in...
   3. [2602.11322v1] Predictive Associative Memory: Retrieval Beyond Similarity Through Tem...

These chunks will be used in all examples below!


In [4]:
### Huggingface And OpenAI Models

from langchain_huggingface import HuggingFaceEmbeddings

## Initialize a simple Embedding model(no API Key needed!)
embeddings=HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

embeddings

  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [5]:
## Create embeddings from real research paper chunk
text = chunked_docs[0]['content']

embedding = embeddings.embed_query(text)
print(f"Paper: {chunked_docs[0]['metadata']['title'][:60]}...")
print(f"Section: {chunked_docs[0]['metadata']['section']}")
print(f"\nText preview: {text[:150]}...")
print(f"\nEmbedding length: {len(embedding)}")
print(f"Sample values: {embedding[:5]}")

Paper: Agentic Test-Time Scaling for WebAgents...
Section: Abstract

Text preview: Test-time scaling has become a standard way
to improve performance and boost reliability of
neural network models. However, its behavior
on agentic, m...

Embedding length: 384
Sample values: [-0.0693819522857666, -0.11218364536762238, -0.06601215898990631, 0.047143902629613876, 0.020564280450344086]


In [None]:
# Use actual research paper chunks instead of simple sentences
paper_texts = [doc['content'] for doc in chunked_docs[:5]]

embedding_sentence = embeddings.embed_documents(paper_texts)

print(f"Created {len(embedding_sentence)} embeddings from research paper chunks")
print(f"Each embedding has {len(embedding_sentence[0])} dimensions")
print(f"\nFirst chunk from: {chunked_docs[0]['metadata']['title'][:50]}...")
print(f"Second chunk from: {chunked_docs[1]['metadata']['title'][:50]}...")

# Show that identical texts have identical embeddings
print(f"\nEmbedding shape: ({len(embedding_sentence)}, {len(embedding_sentence[0])})")

Created 5 embeddings from research paper chunks
Each embedding has 384 dimensions

First chunk from: Agentic Test-Time Scaling for WebAgents...
Second chunk from: Agentic Test-Time Scaling for WebAgents...

‚úÖ Embedding shape: (5, 384)


In [7]:
## Measuring Similarity
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity measures the angle between two vectors.
    - Result close to 1: Very similar
    - Result close to 0: Not related
    - Result close to -1: Opposite meanings
    """
    dot_product = np.dot(vec1, vec2)
    norm_a = np.linalg.norm(vec1)
    norm_b = np.linalg.norm(vec2)
    
    # Handle edge case: zero vectors
    if norm_a == 0 or norm_b == 0:
        return 0.0
    
    return dot_product / (norm_a * norm_b)

In [None]:
# Example: Compare similarity between actual research paper chunks
# Use embeddings from real chunks
chunk1_embedding = embedding_sentence[0]
chunk2_embedding = embedding_sentence[1]
chunk3_embedding = embedding_sentence[2]

similarity_1_2 = cosine_similarity(chunk1_embedding, chunk2_embedding)
similarity_1_3 = cosine_similarity(chunk1_embedding, chunk3_embedding)

print(f"Chunk 1: {chunked_docs[0]['metadata']['section']} from {chunked_docs[0]['metadata']['title'][:40]}...")
print(f"Chunk 2: {chunked_docs[1]['metadata']['section']} from {chunked_docs[1]['metadata']['title'][:40]}...")
print(f"Chunk 3: {chunked_docs[2]['metadata']['section']} from {chunked_docs[2]['metadata']['title'][:40]}...")
print(f"\nSimilarity (Chunk 1 vs Chunk 2): {similarity_1_2:.4f}")
print(f"Similarity (Chunk 1 vs Chunk 3): {similarity_1_3:.4f}")

üìÑ Chunk 1: Abstract from Agentic Test-Time Scaling for WebAgents...
üìÑ Chunk 2: 1. Introduction from Agentic Test-Time Scaling for WebAgents...
üìÑ Chunk 3: 1. Introduction from Agentic Test-Time Scaling for WebAgents...

üî¢ Similarity (Chunk 1 vs Chunk 2): 0.4222
üî¢ Similarity (Chunk 1 vs Chunk 3): 0.5202


In [None]:
# Compare chunks from different papers
similarity_2_3 = cosine_similarity(chunk2_embedding, chunk3_embedding)
print(f"Similarity (Chunk 2 vs Chunk 3): {similarity_2_3:.4f}")


üî¢ Similarity (Chunk 2 vs Chunk 3): 0.4739

üí° Higher scores = more semantically similar content
üí° Chunks from same paper/topic typically have higher similarity


In [None]:
# Compare similarity between different paper chunks
import pandas as pd

print("Similarity Matrix between Paper Chunks:\n")

# Ensure required variables are available
if "paper_chunks" not in globals():
    paper_chunks = [doc["content"] for doc in chunked_docs]
if "paper_embeddings" not in globals():
    paper_embeddings = embeddings.embed_documents(paper_chunks)

# Create a small similarity matrix for first 5 chunks
n_chunks = min(5, len(paper_chunks))
similarity_matrix = []

for i in range(n_chunks):
    row = []
    for j in range(n_chunks):
        sim = cosine_similarity(paper_embeddings[i], paper_embeddings[j])
        row.append(f"{sim:.3f}")
    similarity_matrix.append(row)

# Create DataFrame for better visualization
df = pd.DataFrame(
    similarity_matrix, 
    columns=[f"Chunk {i+1}" for i in range(n_chunks)],
    index=[f"Chunk {i+1}" for i in range(n_chunks)]
)

print(df)


üìä Similarity Matrix between Paper Chunks:

        Chunk 1 Chunk 2 Chunk 3 Chunk 4 Chunk 5
Chunk 1   1.000   0.422   0.520   0.233   0.226
Chunk 2   0.422   1.000   0.474   0.379   0.456
Chunk 3   0.520   0.474   1.000   0.220   0.254
Chunk 4   0.233   0.379   0.220   1.000   0.533
Chunk 5   0.226   0.456   0.254   0.533   1.000

üí° Diagonal values are 1.0 (each chunk is identical to itself)
üí° Higher values indicate more semantic similarity between chunks


In [11]:
# Semantic search on research papers
def semantic_search_papers(query, documents, doc_metadata, embeddings_model, top_k=3):
    """Search through research paper chunks"""
    query_embedding = embeddings_model.embed_query(query)
    doc_embeddings = embeddings_model.embed_documents(documents)
    
    similarities = []
    for i, doc_emb in enumerate(doc_embeddings):
        similarity = cosine_similarity(query_embedding, doc_emb)
        similarities.append((similarity, documents[i], doc_metadata[i]))
    
    similarities.sort(reverse=True)
    return similarities[:top_k]

# Test with a query
query = "What are neural networks and machine learning?"
results = semantic_search_papers(
    query, 
    paper_chunks, 
    [doc['metadata'] for doc in chunked_docs],
    embeddings
)

print(f"üîç Search Query: '{query}'\n")
print("=" * 80)
for i, (score, content, metadata) in enumerate(results, 1):
    print(f"\n{i}. Score: {score:.4f}")
    print(f"   Paper: {metadata['title']}")
    print(f"   Section: {metadata['section']}")
    print(f"   ArXiv ID: {metadata['arxiv_id']}")
    print(f"   Content: {content[:200]}...")
    print("-" * 80)

üîç Search Query: 'What are neural networks and machine learning?'


1. Score: 0.3288
   Paper: A technical curriculum on language-oriented artificial intelligence in translation and specialised communication
   Section: Abstract
   ArXiv ID: 2602.12251v1
   Content: This paper presents a technical curricu-
lum on language-oriented artificial intel-
ligence (AI) in the language and transla-
tion (L&T) industry. The curriculum aims
to foster domain-specific technic...
--------------------------------------------------------------------------------

2. Score: 0.2492
   Paper: A technical curriculum on language-oriented artificial intelligence in translation and specialised communication
   Section: 1
Introduction
   ArXiv ID: 2602.12251v1
   Content: The recent emergence of general-purpose AI
(GPAI) technologies in the form of large language
¬© 2026 The author. This article is licensed under a Creative
Commons 4.0 licence, no derivative works, attr...
-----------------------------------

In [12]:
# Extract just the content from our chunked documents
paper_chunks = [doc['content'] for doc in chunked_docs]

# Create embeddings for the research paper chunks
paper_embeddings = embeddings.embed_documents(paper_chunks)

print(f"Created embeddings for {len(paper_chunks)} paper chunks")
print(f"Each embedding has {len(paper_embeddings[0])} dimensions")
print(f"\nFirst chunk preview:")
print(paper_chunks[0][:300] + "...")

Created embeddings for 9 paper chunks
Each embedding has 384 dimensions

First chunk preview:
Test-time scaling has become a standard way
to improve performance and boost reliability of
neural network models. However, its behavior
on agentic, multi-step tasks remains less well-
understood: small per-step errors can compound
over long horizons; and we find that naive policies
that uniformly i...
