In [36]:
import json
from pathlib import Path
import numpy as np

# Setup path to chunked data
chunked_data_path = Path("../../data-ingestion/processed/chunked/hybrid/")

# Load chunked documents from the data-ingestion pipeline
def load_arxiv_chunks(path, max_files=5, chunks_per_file=5):
    """Load chunked ArXiv papers"""
    documents = []
    json_files = list(path.glob("*.json"))[:max_files]
    
    for json_file in json_files:
        with open(json_file, 'r') as f:
            chunks = json.load(f)
            for chunk in chunks[:chunks_per_file]:
                documents.append({
                    'content': chunk['content'],
                    'metadata': chunk['metadata']
                })
    
    return documents

# Load papers early for use throughout the notebook
arxiv_chunks = load_arxiv_chunks(chunked_data_path, max_files=5)

print(f"Loaded {len(arxiv_chunks)} chunks from ArXiv papers")
print(f"\nSample Paper:")
print(f"   Title: {arxiv_chunks[0]['metadata']['title']}")
print(f"   Category: {arxiv_chunks[0]['metadata']['primary_category']}")
print(f"   Section: {arxiv_chunks[0]['metadata']['section']}")
print(f"   Chunk Type: {arxiv_chunks[0]['metadata']['chunk_type']}")
print(f"\n   Content Preview:")
print(f"   {arxiv_chunks[0]['content'][:250]}...")

Loaded 25 chunks from ArXiv papers

Sample Paper:
   Title: Agentic Test-Time Scaling for WebAgents
   Category: cs.AI
   Section: Abstract
   Chunk Type: hybrid

   Content Preview:
   Test-time scaling has become a standard way
to improve performance and boost reliability of
neural network models. However, its behavior
on agentic, multi-step tasks remains less well-
understood: small per-step errors can compound
over long horizons...


In [37]:
# Display data overview
print("\n" + "=" * 90)
print("Data Overview for This Notebook:\n")

unique_papers = {}
for doc in arxiv_chunks:
    paper_id = doc['metadata']['arxiv_id']
    if paper_id not in unique_papers:
        unique_papers[paper_id] = {
            'title': doc['metadata']['title'],
            'category': doc['metadata']['primary_category']
        }

print(f"{len(unique_papers)} research papers:")
for i, (arxiv_id, info) in enumerate(list(unique_papers.items())[:5], 1):
    print(f"\n   {i}. [{arxiv_id}] {info['category']}")
    print(f"      {info['title'][:75]}...")

print("=" * 90)


Data Overview for This Notebook:

5 research papers:

   1. [2602.12276v1] cs.AI
      Agentic Test-Time Scaling for WebAgents...

   2. [2602.12251v1] cs.CL
      A technical curriculum on language-oriented artificial intelligence in tran...

   3. [2602.11322v1] cs.LG
      Predictive Associative Memory: Retrieval Beyond Similarity Through Temporal...

   4. [2602.12236v1] cs.NE
      Energy-Aware Spike Budgeting for Continual Learning in Spiking Neural Netwo...

   5. [2602.11947v1] math.OC
      Mixed-Integer Programming for Change-point Detection...


In [38]:
## Import HuggingFace Embeddings (No API Key Required!)
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize embeddings model - runs locally without API keys
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

print("HuggingFace Embeddings Model Loaded!")
print(f"Model: sentence-transformers/all-MiniLM-L6-v2")
print(f"Embedding Dimension: 384")

HuggingFace Embeddings Model Loaded!
Model: sentence-transformers/all-MiniLM-L6-v2
Embedding Dimension: 384


In [39]:
## Single text embeddings - using real research paper chunk
single_text = arxiv_chunks[0]['content']
single_embeddings = embeddings.embed_query(single_text)

print("Single Text Embedding from ArXiv Paper:")
print(f"Paper: {arxiv_chunks[0]['metadata']['title'][:60]}...")
print(f"Section: {arxiv_chunks[0]['metadata']['section']}")
print(f"\nInput preview: {single_text[:150]}...")
print(f"\n Output: Vector of {len(single_embeddings)} dimensions")
print(f"Sample values: {single_embeddings[:5]}")

Single Text Embedding from ArXiv Paper:
Paper: Agentic Test-Time Scaling for WebAgents...
Section: Abstract

Input preview: Test-time scaling has become a standard way
to improve performance and boost reliability of
neural network models. However, its behavior
on agentic, m...

 Output: Vector of 384 dimensions
Sample values: [-0.0693819522857666, -0.11218364536762238, -0.06601215898990631, 0.047143902629613876, 0.020564280450344086]


In [40]:
# Multiple texts at once - using real research paper chunks
multiple_texts = [doc['content'] for doc in arxiv_chunks[:4]]

print("Using real research paper chunks:")
for i, doc in enumerate(arxiv_chunks[:4], 1):
    print(f"{i}. {doc['metadata']['title'][:50]}... (Section: {doc['metadata']['section']})")

Using real research paper chunks:
1. Agentic Test-Time Scaling for WebAgents... (Section: Abstract)
2. Agentic Test-Time Scaling for WebAgents... (Section: 1. Introduction)
3. Agentic Test-Time Scaling for WebAgents... (Section: 1. Introduction)
4. Agentic Test-Time Scaling for WebAgents... (Section: 1. Introduction)


In [41]:
multiple_embeddings = embeddings.embed_documents(multiple_texts)

print("\nMultiple Text Embeddings:")
print(f"Number of chunks: {len(multiple_texts)}")
print(f"Number of embeddings: {len(multiple_embeddings)}")
print(f"Each embedding size: {len(multiple_embeddings[0])} dimensions")
print(f"\nFirst embedding sample: {multiple_embeddings[0][:5]}")
print(f"\nEach research paper chunk is now represented as a {len(multiple_embeddings[0])}-dimensional vector")


Multiple Text Embeddings:
Number of chunks: 4
Number of embeddings: 4
Each embedding size: 384 dimensions

First embedding sample: [-0.0693819671869278, -0.11218366026878357, -0.06601212918758392, 0.04714391380548477, 0.020564256235957146]

Each research paper chunk is now represented as a 384-dimensional vector


In [42]:
### Cosine Similarity With OpenAI Embeddings

import numpy as np
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity measures the angle between two vectors.
    - Result close to 1: Very similar
    - Result close to 0: Not related
    - Result close to -1: Opposite meanings
    """

    dot_product=np.dot(vec1,vec2)
    norm_a=np.linalg.norm(vec1)
    norm_b=np.linalg.norm(vec2)
    return dot_product/(norm_a * norm_b)

In [43]:
# Finding similar chunks - using real research paper chunks
sentences = [doc['content'] for doc in arxiv_chunks[:5]]

print("Analyzing similarity between these research paper chunks:\n")
for i, doc in enumerate(arxiv_chunks[:5], 1):
    print(f"{i}. Paper: {doc['metadata']['title'][:45]}...")
    print(f"   Section: {doc['metadata']['section']}")
    print()

Analyzing similarity between these research paper chunks:

1. Paper: Agentic Test-Time Scaling for WebAgents...
   Section: Abstract

2. Paper: Agentic Test-Time Scaling for WebAgents...
   Section: 1. Introduction

3. Paper: Agentic Test-Time Scaling for WebAgents...
   Section: 1. Introduction

4. Paper: Agentic Test-Time Scaling for WebAgents...
   Section: 1. Introduction

5. Paper: Agentic Test-Time Scaling for WebAgents...
   Section: 1. Introduction



In [44]:
sentence_embeddings=embeddings.embed_documents(sentences)

In [45]:
## Calculate similarity between all pairs of research chunks

print("Pairwise Similarity Scores:\n")
print("=" * 80)

for i in range(min(3, len(sentences))):  # Limit to first 3 for readability
    for j in range(i+1, min(4, len(sentences))):
        similarity = cosine_similarity(sentence_embeddings[i], sentence_embeddings[j])
        
        print(f"\nChunk {i+1} ({arxiv_chunks[i]['metadata']['section']}) vs Chunk {j+1} ({arxiv_chunks[j]['metadata']['section']})")
        print(f"   Paper 1: {arxiv_chunks[i]['metadata']['title'][:40]}...")
        print(f"   Paper 2: {arxiv_chunks[j]['metadata']['title'][:40]}...")
        print(f"   Similarity Score: {similarity:.4f}")



Pairwise Similarity Scores:


Chunk 1 (Abstract) vs Chunk 2 (1. Introduction)
   Paper 1: Agentic Test-Time Scaling for WebAgents...
   Paper 2: Agentic Test-Time Scaling for WebAgents...
   Similarity Score: 0.4222

Chunk 1 (Abstract) vs Chunk 3 (1. Introduction)
   Paper 1: Agentic Test-Time Scaling for WebAgents...
   Paper 2: Agentic Test-Time Scaling for WebAgents...
   Similarity Score: 0.5202

Chunk 1 (Abstract) vs Chunk 4 (1. Introduction)
   Paper 1: Agentic Test-Time Scaling for WebAgents...
   Paper 2: Agentic Test-Time Scaling for WebAgents...
   Similarity Score: 0.5384

Chunk 2 (1. Introduction) vs Chunk 3 (1. Introduction)
   Paper 1: Agentic Test-Time Scaling for WebAgents...
   Paper 2: Agentic Test-Time Scaling for WebAgents...
   Similarity Score: 0.4739

Chunk 2 (1. Introduction) vs Chunk 4 (1. Introduction)
   Paper 1: Agentic Test-Time Scaling for WebAgents...
   Paper 2: Agentic Test-Time Scaling for WebAgents...
   Similarity Score: 0.4944

Chunk 3 (1. Introduct

In [46]:
### Example- Semantic Search- Retireve the similar sentence
def semantic_search(query,documents,embeddings_models,top_k=3):
    """Simple semantic search implementation"""

    ## embed query and doument

    query_embedding=embeddings_models.embed_query(query)
    doc_embeddings = embeddings_models.embed_documents(documents)

    ## Calculate the similarity score

    similarties=[]

    for i,doc_emb in enumerate(doc_embeddings):
        similarity=cosine_similarity(query_embedding,doc_emb)
        similarties.append((similarity,documents[i]))

    ## Sort by similarity
    similarties.sort(reverse=True)
    return similarties[:top_k]

In [47]:
# Semantic search with real research papers
documents = [doc['content'] for doc in arxiv_chunks[:10]]

query = "What are neural networks and machine learning?"

print(f"Searching through {len(documents)} research paper chunks")
print(f" Query: '{query}'")
print(f"\nSearching in papers:")
for i, doc in enumerate(arxiv_chunks[:10], 1):
    print(f"   {i}. {doc['metadata']['title'][:60]}...")

Searching through 10 research paper chunks
 Query: 'What are neural networks and machine learning?'

Searching in papers:
   1. Agentic Test-Time Scaling for WebAgents...
   2. Agentic Test-Time Scaling for WebAgents...
   3. Agentic Test-Time Scaling for WebAgents...
   4. Agentic Test-Time Scaling for WebAgents...
   5. Agentic Test-Time Scaling for WebAgents...
   6. A technical curriculum on language-oriented artificial intel...
   7. A technical curriculum on language-oriented artificial intel...
   8. A technical curriculum on language-oriented artificial intel...
   9. A technical curriculum on language-oriented artificial intel...
   10. A technical curriculum on language-oriented artificial intel...


In [48]:
results=semantic_search(query,documents,embeddings)

In [49]:
print(f"\nSemantic Search Results for: '{query}'\n")
print("=" * 90)

for i, (score, doc) in enumerate(results, 1):
    # Find matching metadata
    idx = documents.index(doc)
    metadata = arxiv_chunks[idx]['metadata']
    
    print(f"\n{i}. Similarity: {score:.4f}")
    print(f"   Paper: {metadata['title'][:65]}...")
    print(f"   Section: {metadata['section']}")
    print(f"   ArXiv ID: {metadata['arxiv_id']}")
    print(f"   Content: {doc[:120]}...")

print("\n" + "=" * 90)


Semantic Search Results for: 'What are neural networks and machine learning?'


1. Similarity: 0.3308
   Paper: A technical curriculum on language-oriented artificial intelligen...
   Section: approach
   ArXiv ID: 2602.12251v1
   Content: in various ways.
The curriculum proposed in this paper focuses on
developing technical AI literacy, which involves
knowl...

2. Similarity: 0.3288
   Paper: A technical curriculum on language-oriented artificial intelligen...
   Section: Abstract
   ArXiv ID: 2602.12251v1
   Content: This paper presents a technical curricu-
lum on language-oriented artificial intel-
ligence (AI) in the language and tra...

3. Similarity: 0.2492
   Paper: A technical curriculum on language-oriented artificial intelligen...
   Section: 1
Introduction
   ArXiv ID: 2602.12251v1
   Content: The recent emergence of general-purpose AI
(GPAI) technologies in the form of large language
¬© 2026 The author. This art...



In [50]:
# Load chunks from different strategies
strategies = ['hybrid', 'semantic', 'recursive', 'token_based']
strategy_chunks = {}

for strategy in strategies:
    strategy_path = Path(f"../../data-ingestion/processed/chunked/{strategy}/")
    if strategy_path.exists():
        chunks = load_arxiv_chunks(strategy_path, max_files=2, chunks_per_file=3)
        strategy_chunks[strategy] = chunks
        print(f"Loaded {len(chunks)} chunks from {strategy} strategy")

print(f"\nTotal strategies loaded: {len(strategy_chunks)}")

Loaded 6 chunks from hybrid strategy
Loaded 0 chunks from semantic strategy
Loaded 6 chunks from recursive strategy
Loaded 6 chunks from token_based strategy

Total strategies loaded: 4


In [51]:
# Analyze characteristics of different chunking strategies
import pandas as pd

strategy_stats = []

for strategy, chunks in strategy_chunks.items():
    if not chunks:  # Skip empty chunk lists
        print(f"Skipping {strategy} - no chunks available")
        continue
    
    contents = [c['content'] for c in chunks]
    if not contents:  # Skip if no valid content
        print(f"Skipping {strategy} - no content available")
        continue
    
    avg_length = np.mean([len(c) for c in contents])
    max_length = max([len(c) for c in contents])
    min_length = min([len(c) for c in contents])
    
    strategy_stats.append({
        'Strategy': strategy,
        'Num Chunks': len(chunks),
        'Avg Length': int(avg_length),
        'Min Length': int(min_length),
        'Max Length': int(max_length)
    })

if strategy_stats:
    stats_df = pd.DataFrame(strategy_stats)
    print("Chunking Strategy Comparison:\n")
    print(stats_df.to_string(index=False))
    
    print("\nDifferent strategies create chunks of varying sizes")
    print("This affects the granularity and context of embeddings")
else:
    print("No strategy statistics available - all strategies had empty chunks")

Skipping semantic - no chunks available
Chunking Strategy Comparison:

   Strategy  Num Chunks  Avg Length  Min Length  Max Length
     hybrid           6         949         752        1416
  recursive           6         987         980         996
token_based           6        2060        2008        2123

Different strategies create chunks of varying sizes
This affects the granularity and context of embeddings


## Comparing Different Chunking Strategies

Let's compare embeddings from different chunking strategies (hybrid, semantic, recursive, etc.)

In [52]:
# Create embeddings for all loaded research papers
paper_contents = [doc['content'] for doc in arxiv_chunks]
paper_embeddings = embeddings.embed_documents(paper_contents)

print(f"Created embeddings for {len(paper_embeddings)} research paper chunks")
print(f"Each embedding: {len(paper_embeddings[0])} dimensions")
print(f"Total size: ~{len(paper_embeddings) * len(paper_embeddings[0]) * 4 / 1024:.2f} KB")
print(f"\nPapers include:")
unique_titles = set([doc['metadata']['title'] for doc in arxiv_chunks])
for i, title in enumerate(list(unique_titles)[:5], 1):
    print(f"   {i}. {title[:70]}...")

Created embeddings for 25 research paper chunks
Each embedding: 384 dimensions
Total size: ~37.50 KB

Papers include:
   1. Mixed-Integer Programming for Change-point Detection...
   2. Agentic Test-Time Scaling for WebAgents...
   3. Energy-Aware Spike Budgeting for Continual Learning in Spiking Neural ...
   4. Predictive Associative Memory: Retrieval Beyond Similarity Through Tem...
   5. A technical curriculum on language-oriented artificial intelligence in...


In [53]:
# Advanced semantic search with research papers
def semantic_search_arxiv(query, documents, doc_metadata, embeddings_model, top_k=5):
    """Search through ArXiv paper chunks with detailed results"""
    
    print(f"Searching for: '{query}'")
    print(f"Searching through {len(documents)} chunks...")
    
    # Create embeddings
    query_embedding = embeddings_model.embed_query(query)
    doc_embeddings = embeddings_model.embed_documents(documents)
    
    # Calculate similarities
    def cosine_similarity(vec1, vec2):
        dot_product = np.dot(vec1, vec2)
        norm_a = np.linalg.norm(vec1)
        norm_b = np.linalg.norm(vec2)
        return dot_product / (norm_a * norm_b)
    
    similarities = []
    for i, doc_emb in enumerate(doc_embeddings):
        similarity = cosine_similarity(query_embedding, doc_emb)
        similarities.append((similarity, documents[i], doc_metadata[i]))
    
    # Sort and return top results
    similarities.sort(reverse=True)
    return similarities[:top_k]

# Test different queries
test_queries = [
    "neural networks and deep learning",
    "machine learning algorithms",
    "data processing and analysis"
]

for query in test_queries:
    print("\n" + "="*100)
    results = semantic_search_arxiv(
        query, 
        paper_contents, 
        [doc['metadata'] for doc in arxiv_chunks],
        embeddings,
        top_k=3
    )
    
    print(f"\nüèÜ Top 3 Results:\n")
    for i, (score, content, metadata) in enumerate(results, 1):
        print(f"{i}. Similarity: {score:.4f}")
        print(f"   Paper: {metadata['title'][:70]}...")
        print(f"   Section: {metadata['section']}")
        print(f"   ArXiv ID: {metadata['arxiv_id']}")
        print(f"   Preview: {content[:150]}...")
        print()


Searching for: 'neural networks and deep learning'
Searching through 25 chunks...



üèÜ Top 3 Results:

1. Similarity: 0.3478
   Paper: Energy-Aware Spike Budgeting for Continual Learning in Spiking Neural ...
   Section: 1
Introduction
   ArXiv ID: 2602.12236v1
   Preview: Neuromorphic computing has emerged as a paradigm shift in artificial intelligence, drawing
inspiration from the energy efficiency and temporal dynamic...

2. Similarity: 0.3459
   Paper: A technical curriculum on language-oriented artificial intelligence in...
   Section: approach
   ArXiv ID: 2602.12251v1
   Preview: in various ways.
The curriculum proposed in this paper focuses on
developing technical AI literacy, which involves
knowledge of the basic operating pr...

3. Similarity: 0.3377
   Paper: A technical curriculum on language-oriented artificial intelligence in...
   Section: Abstract
   ArXiv ID: 2602.12251v1
   Preview: This paper presents a technical curricu-
lum on language-oriented artificial intel-
ligence (AI) in the language and transla-
tion (L&T) industry. The...


Searching f

In [54]:
# Another query on research papers
query = "What is deep learning and neural network architecture?"
results = semantic_search(query, documents, embeddings)

print(f"\nSemantic Search Results for: '{query}'\n")
print("=" * 90)

for i, (score, doc) in enumerate(results, 1):
    idx = documents.index(doc)
    metadata = arxiv_chunks[idx]['metadata']
    
    print(f"\n{i}. Similarity: {score:.4f}")
    print(f"   Paper: {metadata['title'][:65]}...")
    print(f"   Section: {metadata['section']}")
    print(f"   Content: {doc[:120]}...")

print("\n" + "=" * 90)


Semantic Search Results for: 'What is deep learning and neural network architecture?'


1. Similarity: 0.3148
   Paper: A technical curriculum on language-oriented artificial intelligen...
   Section: Abstract
   Content: This paper presents a technical curricu-
lum on language-oriented artificial intel-
ligence (AI) in the language and tra...

2. Similarity: 0.2969
   Paper: A technical curriculum on language-oriented artificial intelligen...
   Section: approach
   Content: in various ways.
The curriculum proposed in this paper focuses on
developing technical AI literacy, which involves
knowl...

3. Similarity: 0.2521
   Paper: A technical curriculum on language-oriented artificial intelligen...
   Section: 1
Introduction
   Content: The recent emergence of general-purpose AI
(GPAI) technologies in the form of large language
¬© 2026 The author. This art...

