In [1]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("Deepseek-r1.pdf")
pages = loader.load()
len(pages)

  from .autonotebook import tqdm as notebook_tqdm


22

In [2]:
import re

def clean_text(text):
    text = text.replace("\xa0", " ")  
    text = re.sub(r"\s+\n", "\n", text)  
    text = re.sub(r"\n{2,}", "\n\n", text)  
    text = re.sub(r"-\n", "", text)  
    return text

In [3]:
for p in pages:
    p.page_content = clean_text(p.page_content)

# Chunking
## Fixed-size chunking

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", " ", ""]
)

fixed_chunks = splitter.split_documents(pages)
len(fixed_chunks)

92

In [5]:
# Convert to JSON serializable format
chunks_json = [
    {"id": i, "text": chunk.page_content}
    for i, chunk in enumerate(fixed_chunks)
]

# Save
with open("chunks_fixed.json", "w") as f:
    json.dump(chunks_json, f, indent=2)

print(f"Saved {len(chunks_json)} chunks ‚Üí chunks_fixed.json")

Saved 92 chunks ‚Üí chunks_fixed.json


In [6]:
for i, c in enumerate(fixed_chunks[:3]):
    print(f"===== Fixed Chunk #{i} =====")
    print(c.page_content[:500]) 
    print("\n")

===== Fixed Chunk #0 =====
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via
Reinforcement Learning
DeepSeek-AI
research@deepseek.com
Abstract
We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.
DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.
Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing
reasoning behavio


===== Fixed Chunk #1 =====
mixing. To address these issues and further enhance reasoning performance, we introduce
DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeekR1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the
research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models
(1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.
AIME 2

## Content chunking

In [7]:
import re
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

import re

def split_by_headings(text):
    # ÂåπÈÖç ‚Äú1 Introduction‚Äù / ‚Äú2.3 DeepSeek-R1-Zero‚Äù Á≠âÊ†ºÂºè
    pattern = r"\n(?=\d[\d\.]*\s[A-Z])"
    parts = re.split(pattern, text)
    
    docs = []
    for part in parts:
        clean_part = part.strip()
        if clean_part:
            docs.append(Document(page_content=clean_part))
    return docs


In [8]:
full_text = "\n".join([p.page_content for p in pages])
heading_chunks = split_by_headings(full_text)
len(heading_chunks)

63

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150
)

final_chunks = []

for doc in heading_chunks:
    small_chunks = splitter.split_text(doc.page_content)
    for chunk in small_chunks:
        final_chunks.append(Document(page_content=chunk))
        
len(final_chunks)

In [9]:
for i, c in enumerate(heading_chunks[30:40]):
    print(f"===== Heading Chunk #{i} =====")
    print(c.page_content[:500])  # Âè™ÊâìÂç∞ÂâçÈù¢‰∏ÄÈÉ®ÂàÜ
    print("\n")

===== Heading Chunk #0 =====
2.2. DeepSeek-R1-Zero: Reinforcement Learning on the Base Model
Reinforcement learning has demonstrated significant effectiveness in reasoning tasks, as evidenced by our previous works (Shao et al., 2024; Wang et al., 2023). However, these works
heavily depended on supervised data, which are time-intensive to gather. In this section, we
explore the potential of LLMs to develop reasoning capabilities without any supervised data,
focusing on their self-evolution through a pure reinforcement learni


===== Heading Chunk #1 =====
2.2.1. Reinforcement Learning Algorithm
Group Relative Policy OptimizationIn order to save the training costs of RL, we adopt Group
Relative Policy Optimization (GRPO) (Shao et al., 2024), which foregoes the critic model that is
typically the same size as the policy model, and estimates the baseline from group scores instead.
Specifically, for each question ùëû, GRPO samples a group of outputs {ùëú1, ùëú2, ¬∑¬∑¬∑ , ùëúùê∫}from the

In [10]:
print(heading_chunks[0].page_content)

DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via
Reinforcement Learning
DeepSeek-AI
research@deepseek.com
Abstract
We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.
DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.
Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing
reasoning behaviors. However, it encounters challenges such as poor readability, and language
mixing. To address these issues and further enhance reasoning performance, we introduce
DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeekR1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the
research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models
(1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qw

# Retrieval
## Dense Search (Semantice embedding)

In [11]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding, SparseTextEmbedding
from langchain_core.documents import Document

In [12]:
client = QdrantClient(path=":memory:")

In [13]:
TextEmbedding.list_supported_models()

[{'model': 'BAAI/bge-base-en',
  'sources': {'hf': 'Qdrant/fast-bge-base-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.42,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model': 'BAAI/bge-base-en-v1.5',
  'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.21,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model':

In [14]:
import json

EMBEDDING_DIMENSIONALITY = 512

for model in TextEmbedding.list_supported_models():
    if model["dim"] == EMBEDDING_DIMENSIONALITY:
        print(json.dumps(model, indent=2))

{
  "model": "BAAI/bge-small-zh-v1.5",
  "sources": {
    "hf": "Qdrant/bge-small-zh-v1.5",
    "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz",
    "_deprecated_tar_struct": true
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.09,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "Qdrant/clip-ViT-B-32-text",
  "sources": {
    "hf": "Qdrant/clip-ViT-B-32-text",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "model.onnx",
  "description": "Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
  "license": "mit",
  "size_in_GB": 0.25,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "jinaai/jina-embeddings-v2-small-e

In [15]:
dense_embedder = TextEmbedding(model_name="BAAI/bge-large-en-v1.5")
sparse_embedder = SparseTextEmbedding(model_name="qdrant/bm25")

In [16]:
chunks = heading_chunks 
# chunks = fixed_chunks

In [17]:
texts = [c.page_content for c in chunks]
ids = list(range(len(texts)))
payloads = [{"chunk_id": i, "text": texts[i]} for i in ids]

In [18]:
# Dense Embedding
dense_vectors = list(dense_embedder.embed(texts))

client.recreate_collection(
    collection_name="deepseek_dense",
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,              
        distance=models.Distance.COSINE
    )
)

  client.recreate_collection(


True

In [None]:
client.upload_points(
    collection_name="deepseek_dense",
    points=[
        models.PointStruct(
            id=ids[i],
            vector=dense_vectors[i],
            payload=payloads[i]
        )
        for i in range(len(texts))
    ]
)

In [None]:
query = "Which reinforcement learning algorithm does DeepSeek-R1 use?"

query_vector = next(dense_embedder.embed([query]))

results = client.query_points(
    collection_name="deepseek_dense",
    query=query_vector,
    limit=5
).points

for hit in results:
    print("\nChunk ID:", hit.payload["chunk_id"], "Score:", hit.score)
    print(hit.payload["text"][:400], "...")

## Sparse Search (BM25)

In [20]:
sparse_vectors = list(sparse_embedder.embed(texts))

In [21]:
sparse_vectors[0]

SparseEmbedding(values=array([2.0149613 , 1.9779911 , 1.09444229, 1.88298228, 1.46171814,
       1.09444229, 1.46171814, 1.46171814, 1.46171814, 1.09444229,
       1.46171814, 1.09444229, 1.09444229, 1.46171814, 1.09444229,
       1.09444229, 1.64582116, 1.75643225, 1.46171814, 1.09444229,
       1.09444229, 1.64582116, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.46171814, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.09444229, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.92255925, 1.09444229, 1.09444229,
       1.09444229, 1.09444229, 1.094442

In [22]:
if client.collection_exists("deepseek_sparse"):
    client.delete_collection("deepseek_sparse")

In [23]:
client.recreate_collection(
    collection_name="deepseek_sparse",
    vectors_config={},
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF
        )
    }
)

  client.recreate_collection(


True

In [24]:
bm25_vectors = list(sparse_embedder.embed(texts))

client.upload_points(
    collection_name="deepseek_sparse",
    points=[
        models.PointStruct(
            id=i,
            vector={"bm25": bm25_vectors[i].as_object()},
            payload={"chunk_id": i, "text": texts[i]}
        )
        for i in range(len(texts))
    ]
)

In [25]:
query = "Which reinforcement learning algorithm does DeepSeek-R1 use?"

query_sparse = next(sparse_embedder.query_embed(query))

results = client.query_points(
    collection_name="deepseek_sparse",
    query=models.SparseVector(**query_sparse.as_object()),
    using="bm25",
    limit=5,
    with_payload=True
).points

for hit in results:
    print("\nChunk ID:", hit.payload["chunk_id"], "Score:", hit.score)
    print(hit.payload["text"][:400], "...")



Chunk ID: 30 Score: 8.353005409240723
2.2. DeepSeek-R1-Zero: Reinforcement Learning on the Base Model
Reinforcement learning has demonstrated significant effectiveness in reasoning tasks, as evidenced by our previous works (Shao et al., 2024; Wang et al., 2023). However, these works
heavily depended on supervised data, which are time-intensive to gather. In this section, we
explore the potential of LLMs to develop reasoning capabiliti ...

Chunk ID: 25 Score: 8.22844409942627
1. Introduction
In recent years, Large Language Models (LLMs) have been undergoing rapid iteration and
evolution (Anthropic, 2024; Google, 2024; OpenAI, 2024a), progressively diminishing the gap
towards Artificial General Intelligence (AGI).
Recently, post-training has emerged as an important component of the full training pipeline.
It has been shown to enhance accuracy on reasoning tasks, align w ...

Chunk ID: 29 Score: 6.960752487182617
2.1. Overview
Previous work has heavily relied on large amounts of supervi

## Hybrid Retrieval

In [26]:
from fastembed import TextEmbedding, SparseTextEmbedding, LateInteractionTextEmbedding
from qdrant_client import QdrantClient, models

In [27]:
dense_model = TextEmbedding("BAAI/bge-large-en-v1.5")
sparse_model = SparseTextEmbedding("Qdrant/bm25")
late_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

In [28]:
documents = [c.page_content for c in heading_chunks]

In [29]:
dense_embeddings = list(dense_model.embed(documents))
bm25_embeddings = list(sparse_model.embed(documents))
late_embeddings = list(late_model.embed(documents))   # Ê≥®ÊÑèÔºöËøôÊòØ list of list of vectors

In [30]:
from qdrant_client.models import Distance, VectorParams, models

client.recreate_collection(
    collection_name="deepseek_hybrid",
    vectors_config={
        "jina-small": models.VectorParams(
            size=len(dense_embeddings[0]),
            distance=models.Distance.COSINE
        ),
        "colbert": models.VectorParams(
            size=len(late_embeddings[0][0]),
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM,
            ),
            hnsw_config=models.HnswConfigDiff(m=0)  # Á¶ÅÁî® HNSWÔºàColBERT rerank ÂøÖÈ°ªÁ¶ÅÁî®Ôºâ
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF
        )
    }
)


  client.recreate_collection(


True

In [31]:
from qdrant_client.models import PointStruct

points = []
for idx, (dense, bm25, late, doc) in enumerate(
    zip(dense_embeddings, bm25_embeddings, late_embeddings, documents)
):
    point = PointStruct(
        id=idx,
        vector={
            "jina-small": dense,
            "bm25": bm25.as_object(),
            "colbert": late
        },
        payload={"chunk_id": idx, "text": doc}
    )
    points.append(point)

client.upsert(collection_name="deepseek_hybrid", points=points)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [32]:
query = "Which reinforcement learning algorithm does DeepSeek-R1 use?"

query_dense = next(dense_model.query_embed(query))
query_sparse = next(sparse_model.query_embed(query))
query_late = next(late_model.query_embed(query))

In [33]:
# PrefetchÔºàDense + SparseÔºâHybrid Retrieval
prefetch = [
    models.Prefetch(
        query=query_dense,
        using="jina-small",
        limit=20
    ),
    models.Prefetch(
        query=models.SparseVector(**query_sparse.as_object()),
        using="bm25",
        limit=20
    )
]

In [34]:
results = client.query_points(
    collection_name="deepseek_hybrid",
    prefetch=prefetch,
    query=query_late,
    using="colbert",
    with_payload=True,
    limit=10
).points

for hit in results:
    print("\nCHUNK:", hit.payload["chunk_id"], "SCORE:", hit.score)
    print(hit.payload["text"][:400], "...")


CHUNK: 30 SCORE: 25.6588256074936
2.2. DeepSeek-R1-Zero: Reinforcement Learning on the Base Model
Reinforcement learning has demonstrated significant effectiveness in reasoning tasks, as evidenced by our previous works (Shao et al., 2024; Wang et al., 2023). However, these works
heavily depended on supervised data, which are time-intensive to gather. In this section, we
explore the potential of LLMs to develop reasoning capabiliti ...

CHUNK: 7 SCORE: 24.348006591472615
2.2 DeepSeek-R1-Zero: Reinforcement Learning on the Base Model . . . . . . . . . . 5 ...

CHUNK: 12 SCORE: 23.841955517857894
2.3 DeepSeek-R1: Reinforcement Learning with Cold Start . . . . . . . . . . . . . . . 9 ...

CHUNK: 38 SCORE: 23.553651692000606
2.3. DeepSeek-R1: Reinforcement Learning with Cold Start
Inspired by the promising results of DeepSeek-R1-Zero, two natural questions arise: 1) Can
reasoning performance be further improved or convergence accelerated by incorporating a small
amount of high-quality data

# Evaluation

In [35]:
def build_dense_collection(client, chunks, collection_name):
    texts = [c.page_content for c in chunks]
    ids = list(range(len(texts)))
    payloads = [{"chunk_id": i, "text": texts[i]} for i in ids]

    dense_vectors = list(dense_embedder.embed(texts))
    dim = len(dense_vectors[0])

    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=dim,
            distance=models.Distance.COSINE
        )
    )

    client.upload_points(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=ids[i],
                vector=dense_vectors[i],
                payload=payloads[i]
            )
            for i in range(len(texts))
        ]
    )

    print(f"‚úì Dense collection created: {collection_name}")


In [36]:
def build_sparse_collection(client, chunks, collection_name):
    texts = [c.page_content for c in chunks]
    ids = list(range(len(texts)))
    payloads = [{"chunk_id": i, "text": texts[i]} for i in ids]

    # Compute sparse vectors
    bm25_vectors = list(sparse_embedder.embed(texts))

    # Create collection (dense vectors disabled)
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config={},   # <-- MUST be empty for sparse-only collection
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF
            )
        }
    )

    # Upload points
    client.upload_points(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=i,
                vector={"bm25": bm25_vectors[i].as_object()},
                payload=payloads[i]
            )
            for i in range(len(texts))
        ]
    )

    print(f"‚úì Sparse BM25 collection created: {collection_name}")


In [37]:
#dense_model = TextEmbedding("jinaai/jina-embeddings-v2-small-en") 
dense_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2") 
sparse_model = SparseTextEmbedding("Qdrant/bm25") 
late_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

In [38]:
def build_hybrid_collection(client, chunks, collection_name):
    print(f"\nBuilding hybrid collection: {collection_name}")

    documents = [c.page_content for c in chunks]
    ids = list(range(len(documents)))

    # Embed all three
    dense_embeddings = list(dense_model.embed(documents))
    bm25_embeddings = list(sparse_model.embed(documents))
    late_embeddings = list(late_model.embed(documents))  # list of list-vectors

    dense_dim = len(dense_embeddings[0])
    late_dim = len(late_embeddings[0][0])  # each is multi-vector

    # Recreate hybrid collection
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config={
            "Ball-MiniLM-L6-v2": models.VectorParams(
                size=dense_dim,
                distance=models.Distance.COSINE
            ),
            "colbert": models.VectorParams(
                size=late_dim,
                distance=models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM
                ),
                hnsw_config=models.HnswConfigDiff(m=0)  # required for reranking
            ),
        },
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF
            )
        }
    )

    # Upsert points
    points = []
    for i in range(len(documents)):
        points.append(
            models.PointStruct(
                id=i,
                vector={
                    "Ball-MiniLM-L6-v2": dense_embeddings[i],
                    "bm25": bm25_embeddings[i].as_object(),
                    "colbert": late_embeddings[i],
                },
                payload={"chunk_id": i, "text": documents[i]}
            )
        )

    client.upsert(collection_name=collection_name, points=points)
    print(f"‚úì Hybrid collection created: {collection_name}")



In [39]:
client = QdrantClient(path=":memory:")

datasets = {
    "fixed": fixed_chunks,
    "heading": heading_chunks,
}

for mode, chunks in datasets.items():
    print(f"\n=== Building collections for {mode.upper()} chunks ===")
    build_dense_collection(client, chunks, f"deepseek_dense_{mode}")
    build_sparse_collection(client, chunks, f"deepseek_sparse_{mode}")
    build_hybrid_collection(client, chunks, f"deepseek_hybrid_{mode}")


=== Building collections for FIXED chunks ===


  client.recreate_collection(
  client.recreate_collection(


‚úì Dense collection created: deepseek_dense_fixed
‚úì Sparse BM25 collection created: deepseek_sparse_fixed

Building hybrid collection: deepseek_hybrid_fixed


  client.recreate_collection(


‚úì Hybrid collection created: deepseek_hybrid_fixed

=== Building collections for HEADING chunks ===
‚úì Dense collection created: deepseek_dense_heading
‚úì Sparse BM25 collection created: deepseek_sparse_heading

Building hybrid collection: deepseek_hybrid_heading
‚úì Hybrid collection created: deepseek_hybrid_heading


In [40]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [41]:
with open("ground_truth.json", "r") as f:
    ground_truth = json.load(f)

In [42]:
def dense_retriever(query, top_k=10, mode="fixed"):
    """
    mode = 'fixed' or 'heading'
    """
    collection_name = f"deepseek_dense_{mode}"

    # Compute query vector
    query_vec = next(dense_embedder.embed([query]))

    # Query Qdrant
    hits = client.query_points(
        collection_name=collection_name,
        query=query_vec,
        limit=top_k,
        with_payload=True
    ).points

    return hits   

In [43]:
def sparse_retriever(query, top_k=10, mode="fixed"):
    collection_name = f"deepseek_sparse_{mode}"

    query_sparse = next(sparse_embedder.query_embed(query))

    hits = client.query_points(
        collection_name=collection_name,
        query=models.SparseVector(**query_sparse.as_object()),
        using="bm25",                # <-- REQUIRED
        limit=top_k,
        with_payload=True
    ).points

    return hits

In [44]:
def hybrid_retriever(query, top_k=10, mode="fixed"):
    collection_name = f"deepseek_hybrid_{mode}"

    # embed query in all 3 ways
    q_dense = next(dense_model.query_embed(query))
    q_sparse = next(sparse_model.query_embed(query))
    q_late = next(late_model.query_embed(query))  # for rerank

    prefetch = [
        models.Prefetch(
            query=q_dense,
            using="Ball-MiniLM-L6-v2",
            limit=20
        ),
        models.Prefetch(
            query=models.SparseVector(**q_sparse.as_object()),
            using="bm25",
            limit=20
        ),
    ]

    results = client.query_points(
        collection_name=collection_name,
        prefetch=prefetch,
        query=q_late,
        using="colbert",
        with_payload=True,
        limit=top_k
    ).points

    return results


Áî® embedding similarityÔºàÂêëÈáèÁõ∏‰ººÂ∫¶ÔºâÊâæÂà∞‚ÄúÊúÄÂåπÈÖçÁöÑ chunk‚Äù

In [45]:
chunk_texts_fixed = [c.page_content for c in fixed_chunks]
chunk_texts_heading = [c.page_content for c in heading_chunks]

chunk_emb_fixed = list(dense_embedder.embed(chunk_texts_fixed))
chunk_emb_heading = list(dense_embedder.embed(chunk_texts_heading))

In [46]:
for item in ground_truth:
    ans_emb = next(dense_embedder.embed([item["answer"]]))

In [47]:
import numpy as np

def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [48]:
def find_chunk_by_embedding(ans_emb, chunk_emb_list):
    sims = [cosine_sim(ans_emb, emb) for emb in chunk_emb_list]
    return int(np.argmax(sims))   # pick best match

In [49]:
for item in ground_truth:
    ans_emb = next(dense_embedder.embed([item["answer"]]))

    item["fixed_chunk_id"] = find_chunk_by_embedding(ans_emb, chunk_emb_fixed)
    item["heading_chunk_id"] = find_chunk_by_embedding(ans_emb, chunk_emb_heading)

In [50]:
for item in ground_truth:
    print(item["id"], item["fixed_chunk_id"], item["heading_chunk_id"])

q01 20 7
q02 66 33
q03 23 9
q04 22 32
q05 26 4
q06 74 58
q07 74 55
q08 40 40
q09 38 56
q10 40 55
q11 53 3
q12 58 55
q13 2 3
q14 53 46
q15 28 27
q16 48 45
q17 16 45
q18 41 41
q19 55 20
q20 58 27


In [51]:
print(ground_truth[1]["answer"])
print("-----")
print(heading_chunks[ground_truth[1]["heading_chunk_id"]].page_content)

Because neural reward models may suffer from reward hacking during large-scale RL and require extra retraining, complicating the training pipeline.
-----
2.2.2. Reward Modeling
The reward is the source of the training signal, which decides the optimization direction of RL.
To train DeepSeek-R1-Zero, we adopt a rule-based reward system that mainly consists of two
types of rewards:
‚Ä¢ Accuracy rewards: The accuracy reward model evaluates whether the response is correct.
For example, in the case of math problems with deterministic results, the model is required
to provide the final answer in a specified format (e.g., within a box), enabling reliable
rule-based verification of correctness. Similarly, for LeetCode problems, a compiler can be
used to generate feedback based on predefined test cases.
‚Ä¢ Format rewards: In addition to the accuracy reward model, we employ a format reward
model that enforces the model to put its thinking process between ‚Äò<think>‚Äô and ‚Äò</think>‚Äô
tags.
W

In [52]:
def reciprocal_rank(ranked_ids, true_id):
    for rank, rid in enumerate(ranked_ids, start=1):
        if rid == true_id:
            return 1 / rank
    return 0

In [53]:
def evaluate(method_name, retriever_fn, gt_data, chunk_mode, ks=[1,3,5,10]):
    results = {f"hit@{k}": 0 for k in ks}
    results["MRR"] = 0
    total = len(gt_data)

    for item in gt_data:
        true_id = item[f"{chunk_mode}_chunk_id"]
        if true_id is None:
            continue

        hits = retriever_fn(item["question"], top_k=max(ks), mode=chunk_mode)
        retrieved_ids = [h.payload["chunk_id"] for h in hits]

        for k in ks:
            if true_id in retrieved_ids[:k]:
                results[f"hit@{k}"] += 1

        results["MRR"] += reciprocal_rank(retrieved_ids, true_id)

    # normalize
    for k in ks:
        results[f"hit@{k}"] /= total
    results["MRR"] /= total

    results["method"] = method_name
    results["chunking"] = chunk_mode
    return results

In [54]:
eval_results = []

methods = [
    ("Dense", dense_retriever),
    ("Sparse", sparse_retriever),
    ("Hybrid", hybrid_retriever)
]

for name, fn in methods:
    eval_results.append(evaluate(name, fn, ground_truth, chunk_mode="fixed"))
    eval_results.append(evaluate(name, fn, ground_truth, chunk_mode="heading"))

In [55]:
import pandas as pd

df = pd.DataFrame(eval_results)
df = df[["method", "chunking", "hit@1", "hit@3", "hit@5", "hit@10", "MRR"]]
df

Unnamed: 0,method,chunking,hit@1,hit@3,hit@5,hit@10,MRR
0,Dense,fixed,0.2,0.2,0.2,0.3,0.214583
1,Dense,heading,0.05,0.15,0.15,0.25,0.094444
2,Sparse,fixed,0.3,0.35,0.45,0.55,0.351865
3,Sparse,heading,0.1,0.25,0.3,0.4,0.19006
4,Hybrid,fixed,0.2,0.25,0.3,0.5,0.252758
5,Hybrid,heading,0.1,0.3,0.35,0.35,0.21
