This notebook reads in nontrivial examples from the full JSONL file and creates a subsample for the purpose of evolving the citation generation prompt.

The dataset on which the citation generation prompt is evaluated consists of:
* The subsample of examples (100 rows) with their embeddings (Qwen3-Embedding-0.6B, chosen for its balance of quality and efficiency)
* A list of the unique DOIs cited by this subsample

Each iteration of prompt engineering runs the target DOIs through the citation-generation LLM call using the current version of the prompt. This produces a collection of 'contributions', which must also be embedded.

In [None]:
import pandas as pd
import numpy as np
import random
from database.milvusdb import MilvusDB
from embedders import Embedder

# Open the Milvus database handle. The collection listing is requested but its
# return value is neither stored nor displayed (it is not the cell's last expression).
db = MilvusDB()
db.list_collections()

# Build the query-side and document-side embedders from one shared configuration;
# the two differ only in the `for_queries` flag.
embedder_options = dict(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    device="mps",
    normalize=True,
)
query_embedder = Embedder.create(**embedder_options, for_queries=True)
document_embedder = Embedder.create(**embedder_options, for_queries=False)

# Make both wrappers reference the same underlying model object. Per the original
# author's note, this model can switch between query and document behavior
# dynamically, so a single shared instance is used for both roles.
document_embedder.model = query_embedder.model

print(f"Query embedder: {query_embedder}")
print(f"Document embedder: {document_embedder}")

In [None]:
# Build the evaluation subsample: load every example from the JSONL file, draw a
# fixed-seed random sample of 100 rows, embed each row's `sent_no_cit` text with the
# query embedder, and write the result to a parquet file in the working directory.
def _embed_as_query(sentence):
    """Embed one sentence with the query embedder and return its single result.

    The embedder is called with a one-element list, and the first (only) item of
    what it returns is handed back.
    """
    return query_embedder([sentence])[0]


examples = (
    pd.read_json("../data/dataset/nontrivial_checked.jsonl", lines=True)
    .sample(n=100, random_state=42)  # fixed seed keeps the subsample reproducible
    .assign(embedding=lambda frame: frame["sent_no_cit"].apply(_embed_as_query))
)
examples.to_parquet("citation_gen_examples.parquet")