In [1]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
from pprint import pprint
from citeline.database.milvusdb import MilvusDB
from citeline.embedders import Embedder
from citeline.query_expander import get_expander

db = MilvusDB()
print(db)

# Registers `progress_apply` on pandas objects (used below for the embedding step)
tqdm.pandas()

# Setup: load embedder, expander, dataset, db collection
embedder = Embedder.create("Qwen/Qwen3-Embedding-0.6B", device="mps", normalize=True)
print(embedder)

expander = get_expander("add_prev_3", path_to_data="../data/preprocessed/reviews.jsonl")
print(expander)

# Fixed-seed 50-row sample, re-indexed 0..49
sample = (
    pd.read_json("../data/dataset/nontrivial_nomath.jsonl", lines=True)
    .sample(50, random_state=42)
    .reset_index(drop=True)
)


def _embed_expanded_sentence(row: pd.Series):
    """Embeds one row's expanded sentence; the embedder takes a list, so wrap the text and unwrap the result."""
    return embedder([row["sent_no_cit"]])[0]


# Apply query expansion and embed the queries
sample["sent_no_cit"] = expander(sample)
sample["vector"] = sample.progress_apply(_embed_expanded_sentence, axis=1)

db.list_collections()
db.client.load_collection("qwen06_contributions")

sample.head()

<citeline.database.milvusdb.MilvusDB object at 0x111b49890>
Qwen/Qwen3-Embedding-0.6B, device=mps, normalize=True, dim=1024
QueryExpander(name=add_prev_3, data_length=2980)


100%|██████████| 50/50 [00:11<00:00,  4.44it/s]

Collections:
 - astrobert_chunks: 460801 entities
 - astrobert_contributions: 89860 entities
 - bge_chunks: 460801 entities
 - bge_contributions: 89860 entities
 - nasa_chunks: 460801 entities
 - nasa_contributions: 89860 entities
 - qwen06_chunks: 460801 entities
 - qwen06_contributions: 89860 entities
 - qwen06_findings_v2: 4342 entities
 - qwen06_v3_contributions: 299286 entities
 - qwen8b_contributions: 89860 entities
 - specter_chunks: 460801 entities
 - specter_contributions: 89860 entities





Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked,vector
0,10.1146/annurev-astro-021022-043545,"For example, strong outliers from the MZR with...",There are important practical implications for...,494,[10.1088/0004-637X/695/1/259],20220801,[2009ApJ...695..259P],"For example, strong outliers from the MZR with...","[-0.0304536, -0.03546455, -0.008876894, -0.028..."
1,10.1007/s001590050008,Grillmair et al. (1995a) conclude that the sta...,They find that most of their sample clusters s...,593,[10.1086/117470],19970101,[1995AJ....109.2553G],[REF] conclude that the stars found beyond the...,"[-0.023469558, 0.0036594549, -0.009580555, 0.0..."
2,10.1016/j.newar.2021.101610,"Later, a very similar study was carried out by...","For the problem at hand, these conditions are ...",608,[10.1111/j.1365-2966.2010.17674.x],20210601,[2011MNRAS.411..155G],"Later, a very similar study was carried out by...","[0.012016729, -0.036106728, -0.008817904, 0.02..."
3,10.1146/annurev-astro-082214-122348,k Using column density of C-C bonds derived by...,h Identification based on a single absorption ...,377,[10.1088/0004-637X/784/2/172],20150801,[2014ApJ...784..172H],k Using column density of C-C bonds derived by...,"[-0.05986425, -0.016321747, -0.008735628, 0.01..."
4,10.1146/annurev.astro.41.082801.100328,One of the most surprising observations to dat...,Additional evidence for clumpiness comes from ...,321,[10.1086/319733],20030101,[2001ApJ...550..142H],One of the most surprising observations to dat...,"[-0.0136848325, -0.06581634, -0.010353548, 0.0..."


In [2]:
# Inspect the original sentence of the row at position 15 (the row handled by the "Data patches" cell)
sample.iloc[15]['sent_original']

'Kartaltepe et al. (2012) find that a sample of ULIRGs selected from the CANDELS fields are more likely than a field galaxy sample to be involved in galaxy interactions and mergers (72 +5 −7 % versus 32 ± 3%).'

In [5]:
# Data patches

# One sample (index label 15) has incorrect citation_dois. Its true target,
# "10.48550/arXiv.1110.4057", is not in the db, so patching the DOI would not help:
# sample.at[15, "citation_dois"] = ["10.48550/arXiv.1110.4057"]
# Drop the row instead. Rebinding with errors="ignore" (rather than an in-place drop)
# keeps this cell safe to re-run once the row is already gone.
sample = sample.drop(index=15, errors="ignore")

In [4]:
# Look for the missing target paper: list every research record with an author name containing "Kart"
research = pd.read_json('../data/research_used.jsonl', lines=True)
for _, paper in research.iterrows():
    if any("Kart" in name for name in paper.author):
        print(paper.pubdate, paper.doi, paper.title, paper.author)

2003-04-01 10.1086/346076 The BIMA Survey of Nearby Galaxies (BIMA SONG). II. The CO Data ['Helfer, Tamara T.', 'Thornley, Michele D.', 'Regan, Michael W.', 'Wong, Tony', 'Sheth, Kartik', 'Vogel, Stuart N.', 'Blitz, Leo', 'Bock, Douglas C. -J.']
1999-03-01 10.1086/306824 Megamaser Disks in Active Galactic Nuclei ['Kartje, John F.', 'Königl, Arieh', 'Elitzur, Moshe']
2015-05-01 10.1088/2041-8205/804/1/L21 Small Scatter and Nearly Isothermal Mass Profiles to Four Half-light Radii from Two-dimensional Stellar Dynamics of Early-type Galaxies ['Cappellari, Michele', 'Romanowsky, Aaron J.', 'Brodie, Jean P.', 'Forbes, Duncan A.', 'Strader, Jay', 'Foster, Caroline', 'Kartha, Sreeja S.', 'Pastorello, Nicola', 'Pota, Vincenzo', 'Spitler, Lee R.', 'Usher, Christopher', 'Arnold, Jacob A.']
2012-07-01 10.1088/0004-637X/753/2/167 What Turns Galaxies Off? The Different Morphologies of Star-forming and Quiescent Galaxies since z ~ 2 from CANDELS ['Bell, Eric F.', 'van der Wel, Arjen', 'Papovich, Case

In [6]:
def get_hard_records(example: pd.Series, n: int = 2) -> list[dict]:
    """
    Finds the most similar non-target documents ("hard negatives") for one query.

    Overfetches the 3*n most similar records, because target documents and several
    records from the same document can occupy the top of the ranking. Target DOIs are
    removed and only the first-returned record per DOI is kept, so the result holds
    distinct non-target DOIs.

    Args:
      example: Row providing `vector` (query embedding) and `citation_dois` (target DOIs)
      n: Maximum number of distinct non-target documents to return

    Returns:
      Up to n search-result records (each has at least the keys "doi" and "metric"),
      in the order db.search returned them (presumably most similar first). Fewer than
      n records come back when the overfetched window lacks n distinct non-target DOIs.
    """
    results = db.search(
        collection_name="qwen06_contributions",
        query_records=[example.to_dict()],
        query_vectors=[example.vector],
        limit=3 * n,
    )
    results = results[0]  # db.search operates on lists of queries; we only need the first result

    target_dois = set(example.citation_dois)
    seen_dois = set()
    distinct_non_targets = []
    for record in results:
        doi = record["doi"]
        # Skip targets, and skip repeated DOIs so one document cannot fill several slots
        if doi in target_dois or doi in seen_dois:
            continue
        seen_dois.add(doi)
        distinct_non_targets.append(record)
    return distinct_non_targets[:n]


def get_similarity_to_targets(example: pd.Series) -> list[float]:
    """
    For each target doi in the example, computes the max similarity between the example and any record with that doi.

    Lookups are best-effort: when a DOI cannot be processed (lookup error, or no stored
    vectors), the error is printed and NaN is recorded for that DOI. The placeholder keeps
    the result aligned position-by-position with example.citation_dois, so downstream
    per-target comparisons never pair scores that belong to different DOIs.

    Returns a list of scores in order of example.citation_dois (same length; NaN marks failures)
    """
    similarities = []
    for target_doi in example.citation_dois:
        try:
            results = db.select_by_doi(doi=target_doi, collection_name="qwen06_contributions")
            target_vectors = np.array(results["vector"].tolist())
            if target_vectors.size == 0:
                raise ValueError("no vectors stored for this DOI")
            # One dot product per stored record of this document; keep the best match
            similarity_scores = np.dot(example.vector, target_vectors.T)
            similarities.append(np.max(similarity_scores))
        except Exception as e:
            print(f"Error processing DOI {target_doi}: {e}")
            similarities.append(float("nan"))
    return similarities


def compute_margins(df: pd.DataFrame, target_col: str, hard_col: str, margin_col_name: str) -> None:
    """
    Adds a column of per-target margins: each target similarity minus the row's largest
    non-target ("hard") similarity. A negative margin means the hardest non-target
    scored higher than that target.

    Args:
      df: DataFrame to annotate
      target_col: Column holding, per row, a list of target similarities
      hard_col: Column holding, per row, a list of hard non-target similarities
      margin_col_name: Column that receives, per row, the list of margins

    Returns:
      None (df is modified in place)
    """
    margins_per_row = []
    for target_sims, hard_sims in zip(df[target_col], df[hard_col]):
        hardest = max(hard_sims)
        margins_per_row.append([target_sim - hardest for target_sim in target_sims])

    # dtype=object keeps each row's list as a single cell value
    df[margin_col_name] = pd.Series(margins_per_row, index=df.index, dtype=object)


# Compute target and hard similarities, then the margins
sample["target_similarities"] = sample.progress_apply(get_similarity_to_targets, axis=1)

# Create the columns before iterating so every `example` row handed to
# get_hard_records already carries these keys.
sample["hard_dois"] = None
sample["hard_similarities"] = None

hard_dois_per_row = []
hard_similarities_per_row = []
for _, example in tqdm(sample.iterrows(), total=len(sample)):
    hard_records = get_hard_records(example, n=2)
    hard_dois_per_row.append([record["doi"] for record in hard_records])
    hard_similarities_per_row.append([record["metric"] for record in hard_records])

# dtype=object stores each list as one cell, aligned on the sample's own index
sample["hard_dois"] = pd.Series(hard_dois_per_row, index=sample.index, dtype=object)
sample["hard_similarities"] = pd.Series(hard_similarities_per_row, index=sample.index, dtype=object)

compute_margins(sample, target_col="target_similarities", hard_col="hard_similarities", margin_col_name="old_margins")

100%|██████████| 49/49 [00:00<00:00, 178.75it/s]
100%|██████████| 49/49 [00:02<00:00, 18.11it/s]


In [7]:
# Flatten the per-row margin lists to one value per (query, target) pair, then summarize
old_margin_values = sample["old_margins"].explode()
margins = pd.to_numeric(old_margin_values, errors="coerce").dropna()
margins.describe()

count    67.000000
mean     -0.073777
std       0.083496
min      -0.267450
25%      -0.125003
50%      -0.068840
75%      -0.029816
max       0.163279
Name: old_margins, dtype: float64

## Process the dois


In [None]:
# Every DOI that appears as a target or as a hard negative for any query
doi_columns = (sample.citation_dois, sample.hard_dois)
dois_to_process = {doi for column in doi_columns for dois in column for doi in dois}
print(f"DOI's to process: {len(dois_to_process)}")

# Load research papers so we can get full text by doi
research = (
    pd.read_json("../data/research_used.jsonl", lines=True)
    .loc[lambda papers: papers["doi"].isin(dois_to_process)]
    .reset_index(drop=True)
)
print(f"Loaded {len(research)} research papers")


def doi_to_paper(doi: str) -> str:
    """Returns the title, abstract and body of the first `research` record with this DOI, joined by blank lines."""
    matches = research[research["doi"] == doi]
    record = matches.iloc[0]
    return "\n\n".join([record["title"], record["abstract"], record["body"]])

DOI's to process: 151
Loaded 151 research papers
Simulations of the formation of stellar discs in the Galactic Centre via cloud-cloud collisions

Young massive stars in the central parsec of our Galaxy are best explained by star formation within at least one, and possibly two, massive self-gravitating gaseous discs. With help of numerical simulations, we here consider whether the observed population of young stars could have originated from a large angle collision of two massive gaseous clouds at R ~= 1pc from SgrA*. In all the simulations per


In [9]:
# from openai import OpenAI
# import os


# def bind_client(func):
#     """
#     Decorator to bind OpenAI client to a function that will provide DeepSeek API access
#     """
#     client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

#     def wrapper(*args, **kwargs):
#         return func(client, *args, **kwargs)

#     return wrapper


# @bind_client
# def deepseek(client, prompt: str) -> str:
#     """
#     Sends a prompt to the DeepSeek API (using DeepSeek-V3.1 non-thinking model)

#     Expects a prompt that will instruct the model to respond with a JSON object.
#     However, the function returns the raw string response, to allow for validation and
#     error handling in multiple passes without losing the original response
#     """
#     response = client.chat.completions.create(
#         model="deepseek-chat",
#         messages=[{"role": "system", "content": prompt}],
#         stream=False,
#         response_format={"type": "json_object"},
#     )
#     return response.choices[0].message.content

from citeline.apis.deepseek import deepseek_formatted as deepseek

# Smoke test: the raw response should be a JSON string that parses cleanly
smoke_prompt = "Respond with a JSON object with keys 'greeting' and 'farewell'"
response = deepseek(smoke_prompt, model="deepseek-reasoner")
print(response)
parsed_response = json.loads(response)
print(parsed_response)

{
  "greeting": "Hello",
  "farewell": "Goodbye"
}
{'greeting': 'Hello', 'farewell': 'Goodbye'}


#### OpenAI Client

In [11]:
# from citeline.apis.openai_client import openai_llm_client

# openai_llm = openai_llm_client(model="gpt-5-nano")

#### Gemini Client

In [12]:
# from google import genai
# from google.genai import types
# from citeline.llm.models import Findings

# client = genai.Client()
# with open("../src/citeline/llm/prompts/original_contributions_gemini_v3.txt", "r") as f:
#     prompt_template = f.read()


# def gemini(paper: str) -> str:
#     raw_text = ""
#     try:
#         response = client.models.generate_content(
#             model="gemini-2.5-flash",
#             config=types.GenerateContentConfig(
#                 temperature=0.0,
#                 system_instruction=prompt_template,
#                 response_mime_type="application/json",
#                 response_schema=Findings,
#             ),
#             contents=paper,
#         )
#         return response.parsed
#     except Exception as e:
#         print(f"Error during Gemini call: {e}")
#         return ""

In [None]:
with open("../src/citeline/llm/prompts/original_contributions_v3.txt", "r") as f:
    prompt_template = f.read()

# llm_function = deepseek
# llm_function = openai_llm

# Extract findings for every DOI. Successes are written one JSON object per line to
# new_findings.jsonl; every failure (API error or unusable response) is appended to
# failed_dois.txt so it can be retried later.
with open("new_findings.jsonl", "w") as f:
    for doi in tqdm(dois_to_process):
        paper = doi_to_paper(doi)
        prompt = prompt_template.format(paper=paper)
        try:
            # response = llm_function(prompt)
            response = deepseek(prompt, model="deepseek-reasoner")
        except Exception as e:
            print(f"Error processing doi {doi}: {e}")
            with open("failed_dois.txt", "a") as f_fail:
                f_fail.write(doi + "\n")
            continue
        try:
            data = json.loads(response)
            data["doi"] = doi
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a non-string response and a JSON value that is not an object
            print(f"Failed to decode JSON for doi {doi}. Response was:\n{response}")
            with open("failed_dois.txt", "a") as f_fail:
                f_fail.write(doi + "\n")
            continue
        f.write(json.dumps(data) + "\n")
        # The run takes hours; flush so finished papers survive an interrupted kernel
        f.flush()

  1%|▏         | 2/151 [06:03<8:04:26, 195.08s/it]

In [None]:
new_findings = pd.read_json("new_findings.jsonl", lines=True)
print(f"Loaded {len(new_findings)} new findings")

# One row per individual finding. A paper with an empty or missing findings list explodes
# to a NaN row, which must not be sent to the embedder; the index is reset because
# explode repeats the parent row's label for each of its findings.
new_findings_exploded = (
    new_findings.explode("findings").dropna(subset=["findings"]).reset_index(drop=True)
)
new_findings_exploded["vector"] = embedder(new_findings_exploded["findings"].tolist()).tolist()
new_findings_exploded.head()

In [None]:
def get_vectors_by_doi(doi: str) -> np.ndarray:
    """Returns the embedded findings of one DOI, one vector per row (an empty array when the DOI has no findings)."""
    return np.array(new_findings_exploded[new_findings_exploded["doi"] == doi]["vector"].tolist())


def max_similarities(query_vector: np.ndarray, dois: list[str]) -> list[float]:
    """For each DOI (order preserved), the largest dot product between the query and any of that DOI's finding vectors."""
    return [np.max(np.dot(query_vector, get_vectors_by_doi(doi).T)) for doi in dois]


# The generation step skips papers whose LLM call or JSON parsing failed. A DOI without
# findings would otherwise surface below as an opaque numpy shape error, so fail early
# and name the DOIs that need to be regenerated.
needed_dois = {doi for column in ("citation_dois", "hard_dois") for dois in sample[column] for doi in dois}
missing_dois = needed_dois - set(new_findings_exploded["doi"])
if missing_dois:
    raise ValueError(f"No findings available for {len(missing_dois)} DOI(s): {sorted(missing_dois)}")

# Get new similarity to target
sample["new_target_similarities"] = None
sample["new_hard_similarities"] = None

for idx, row in sample.iterrows():
    # Max similarity wrt the new embeddings, per target doi and per hard doi
    sample.at[idx, "new_target_similarities"] = max_similarities(row["vector"], row["citation_dois"])
    sample.at[idx, "new_hard_similarities"] = max_similarities(row["vector"], row["hard_dois"])

In [None]:
# Margins against the regenerated findings: each new target similarity minus the row's largest new hard similarity
compute_margins(
    sample, target_col="new_target_similarities", hard_col="new_hard_similarities", margin_col_name="new_margins"
)

In [None]:
def compute_margin_diffs(df: pd.DataFrame, new_col: str, ref_col: str) -> pd.Series:
    """
    Computes new-minus-reference margin differences, one value per (row, target) pair.

    Both columns hold one list of margins per row. The lists are paired row by row, so a
    value is only ever compared with the reference value of the same row and position.

    Args:
      df: DataFrame holding both margin columns
      new_col: Column with the new margins (list per row)
      ref_col: Column with the reference margins (list per row)

    Returns:
      A flat Series of differences in row order. A row whose lists are both empty
      contributes a single NaN.

    Raises:
      ValueError: if a row's two lists differ in length (the values could not be paired reliably)
    """
    diffs = []
    for row_label, new_values, ref_values in zip(df.index, df[new_col], df[ref_col]):
        if len(new_values) != len(ref_values):
            raise ValueError(
                f"Row {row_label!r}: {new_col} has {len(new_values)} values "
                f"but {ref_col} has {len(ref_values)}"
            )
        if len(new_values) == 0:
            diffs.append(float("nan"))
            continue
        diffs.extend(new - ref for new, ref in zip(new_values, ref_values))
    return pd.Series(diffs)


# Per-target change in margin (new minus old); a positive value means the margin increased
diffs = compute_margin_diffs(sample, new_col="new_margins", ref_col="old_margins")
print(diffs.describe())

In [None]:
# Exploding a column of lists yields an object-dtype Series; convert to numeric first
# (as done for old_margins) so the histogram is built from plain floats.
new_margin_values = pd.to_numeric(sample["new_margins"].explode(), errors="coerce").dropna()
ax = new_margin_values.hist(bins=15)
ax.set(
    title="New margins (target similarity - hardest non-target similarity)",
    xlabel="Margin",
    ylabel="Number of (query, target) pairs",
);

## Error analysis

Let's look at where the new margin is still negative (the target document vectors aren't as close to the query as the hard examples)


In [None]:
# A row is an error row when any of its targets still scores below the hardest non-target
has_negative_margin = sample["new_margins"].apply(lambda margins: any(margin < 0 for margin in margins))
error_rows = sample[has_negative_margin]
print(f"{len(error_rows)} rows have at least one negative new margin")

# Summary over every margin of the error rows (including any non-negative ones)
error_margin_values = error_rows["new_margins"].explode()
error_margins = pd.to_numeric(error_margin_values, errors="coerce").dropna()
print(error_margins.describe())

In [None]:
# Display the rows that have at least one negative new margin
error_rows

In [None]:
def analyze_error_row(idx: int) -> None:
    """
    Prints a diagnostic for one row of `error_rows`, selected by index label.

    Shows the row's rounded new margins and its original and expanded sentences, then the
    three findings of the hardest non-target document that score highest against the query.
    """
    example = error_rows.loc[idx]
    rounded_margins = [round(float(value), 4) for value in example["new_margins"]]
    print(f"Margins: {rounded_margins}")
    print("Original sentence:")
    print(example["sent_original"])
    print("\nExpanded sentence:")
    print(example["sent_no_cit"] + "\n")

    # The hard document with the highest new similarity, and all of its findings
    top_hard_pos = np.argmax(example["new_hard_similarities"])
    hard_doi = example["hard_dois"][top_hard_pos]
    is_hard_doi = new_findings_exploded["doi"] == hard_doi
    hard_findings = new_findings_exploded[is_hard_doi]
    finding_scores = np.dot(example["vector"], np.array(hard_findings["vector"].tolist()).T)

    # Negate so argsort yields descending scores; keep the top three
    for pos in np.argsort(-finding_scores)[:3]:
        finding = hard_findings.iloc[pos]
        print(f"Similarity: {finding_scores[pos]:.4f}, DOI: {finding['doi']}")
        pprint(finding["findings"])
        print("-----")


def print_target_contributions(idx: int) -> None:
    """
    Prints the original sentence of one `error_rows` row (selected by index label),
    its target DOIs, and every new finding extracted for each target DOI.
    """
    row = error_rows.loc[idx]
    print("Original sentence:")
    print(row["sent_original"])

    target_dois = row["citation_dois"]
    print(f"Target DOIs: {target_dois}")
    # print, not pprint: pprint would show the header's repr, including the quotes
    print("Target findings:")
    for doi in target_dois:
        findings = new_findings_exploded[new_findings_exploded["doi"] == doi]["findings"]
        print(f"DOI: {doi}")
        for i, finding in enumerate(findings):
            print(f"{i}: {finding}")
        print("-----")


# `idx` is an index label here: both helpers select the row with error_rows.loc
idx = 4

print_target_contributions(idx)
analyze_error_row(idx)

In [None]:
# Rank the findings of one target paper against the un-expanded citing sentence
doi = "10.1086/319733"
row = new_findings.loc[new_findings["doi"] == doi].iloc[0]
target_texts = row["findings"]
target_vectors = embedder(target_texts)

query_text = "One of the most surprising observations to date are those of the narrow absorption lines in the quasar 3C 191, which show evidence for partial covering at a large distance (28 kpc) from the nucleus ( ); it is difficult to understand how the small clouds have maintained their integrity over the outflow timescale of ∼3 × 10 7 year."
query_vector = embedder([query_text])[0]

cosine_similarities = np.dot(query_vector, target_vectors.T)
# Highest similarity first; ties keep their original order
tups = sorted(enumerate(cosine_similarities), key=lambda pair: pair[1], reverse=True)
for i, sim in tups:
    print(f"Finding {i}: similarity {sim:.4f}")
print(cosine_similarities)

In [None]:
# Similarity between a hand-written finding and one citing sentence
target_text = "Cosmic microwave background (CMB) anisotropy measurements indicate flat universe geometry."
target_vector = embedder([target_text])[0]

# query_vector = error_rows.iloc[0]["vector"]
query_text = "This data provided the most convincing evidence then available for the Euclidean nature of the Universe; i.e. that the geometry is flat Fig. 16 The first measurement of polarization made of the CMBR, obtained by the DASI experiment at the South Pole "
query_vector = embedder([query_text])[0]

similarity = query_vector.dot(target_vector)
print(f"Cosine similarity: {similarity:.4f}")

### Revision 2


In [None]:
with open("../src/citeline/llm/prompts/original_contributions_v2.txt", "r") as f:
    prompt_template = f.read()

NEW_FINDINGS_FILENAME = "new_findings_v2.jsonl"

# Extract findings for every DOI. Successes are written one JSON object per line to
# NEW_FINDINGS_FILENAME; every failure (API error or unusable response) is appended to
# failed_dois.txt so it can be retried later.
with open(NEW_FINDINGS_FILENAME, "w") as f:
    for doi in tqdm(dois_to_process):
        paper = doi_to_paper(doi)
        prompt = prompt_template.format(paper=paper)
        try:
            response = deepseek(prompt)
        except Exception as e:
            print(f"Error processing doi {doi}: {e}")
            with open("failed_dois.txt", "a") as f_fail:
                f_fail.write(doi + "\n")
            continue
        try:
            data = json.loads(response)
            data["doi"] = doi
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a non-string response and a JSON value that is not an object
            print(f"Failed to decode JSON for doi {doi}. Response was:\n{response}")
            with open("failed_dois.txt", "a") as f_fail:
                f_fail.write(doi + "\n")
            continue
        f.write(json.dumps(data) + "\n")
        # Flush so finished papers survive an interrupted run
        f.flush()

In [None]:
new_findings = pd.read_json(NEW_FINDINGS_FILENAME, lines=True)
print(f"Loaded {len(new_findings)} new findings")

# One row per individual finding. A paper with an empty or missing findings list explodes
# to a NaN row, which must not be sent to the embedder; the index is reset because
# explode repeats the parent row's label for each of its findings.
new_findings_exploded = (
    new_findings.explode("findings").dropna(subset=["findings"]).reset_index(drop=True)
)
new_findings_exploded["vector"] = embedder(new_findings_exploded["findings"].tolist()).tolist()
new_findings_exploded.head()

In [None]:
# Save previous iteration and reset df for new results
sample_old = sample.copy()

# The generation step skips papers whose LLM call or JSON parsing failed. A DOI without
# findings would otherwise surface below as an opaque numpy shape error, so fail early
# and name the DOIs that need to be regenerated.
needed_dois = {doi for column in ("citation_dois", "hard_dois") for dois in sample[column] for doi in dois}
missing_dois = needed_dois - set(new_findings_exploded["doi"])
if missing_dois:
    raise ValueError(f"No findings available for {len(missing_dois)} DOI(s): {sorted(missing_dois)}")

# Get new similarity to target
sample["new_target_similarities"] = None
sample["new_hard_similarities"] = None

for idx, row in sample.iterrows():
    query_vector = row["vector"]
    # For each target doi, compute the max similarity wrt the new embeddings
    sample.at[idx, "new_target_similarities"] = [
        np.max(np.dot(query_vector, get_vectors_by_doi(target_doi).T)) for target_doi in row["citation_dois"]
    ]
    # Same for each hard (non-target) doi
    sample.at[idx, "new_hard_similarities"] = [
        np.max(np.dot(query_vector, get_vectors_by_doi(hard_doi).T)) for hard_doi in row["hard_dois"]
    ]

compute_margins(
    sample, target_col="new_target_similarities", hard_col="new_hard_similarities", margin_col_name="new_margins"
)

# Per-target change in margin relative to the original contributions
diffs = compute_margin_diffs(sample, new_col="new_margins", ref_col="old_margins")
print(diffs.describe())

In [None]:
# A row is an error row when any of its targets still scores below the hardest non-target
has_negative_margin = sample["new_margins"].apply(lambda margins: any(margin < 0 for margin in margins))
error_rows = sample[has_negative_margin]
print(f"Number of rows with negative new margins: {len(error_rows)}")
error_rows

In [None]:
# Print the target contributions for an error row
# `idx` is a position within error_rows. analyze_error_row and print_target_contributions
# select rows by index label (error_rows.loc), so resolve the label first; passing the
# position directly would analyze a different row or raise a KeyError.
idx = 0
row_label = error_rows.index[idx]

analyze_error_row(row_label)

print(f"Sentence in context:\n{error_rows.iloc[idx]['sent_no_cit']}")
print_target_contributions(row_label)

In [None]:
# Expanded (context-added) sentence of the error row at position `idx`
error_rows.iloc[idx]["sent_no_cit"]

In [None]:
# Similarity between a hand-written finding and one citing sentence
target_text = "Deep optical images shows a faint elliptical ring structure orbiting the spiral galaxy NGC 5907"
target_vector = embedder([target_text])[0]

# query_vector = error_rows.iloc[0]["vector"]
query_text = "However, deep optical images of a number of spiral galaxies, such as NGC 253, M 83, M 104, NGC 2855, (Malin and Hadley 1997) and NGC 5907 (), do show unusual, faint features in their surroundings."
query_vector = embedder([query_text])[0]

similarity = query_vector.dot(target_vector)
print(f"Cosine similarity: {similarity:.4f}")

In [None]:
# Compare a hard-negative finding against the current `query_vector`
hard_text = "Most extended and complete luminosity function obtained for Galactic bulge to date"
hard_vector = embedder([hard_text])[0]
hard_similarity = np.dot(hard_vector, query_vector)
print(f"Cosine similarity: {hard_similarity:.4f}")

In [None]:
# Number this paper's findings 0..k-1 with enumerate. The index labels of the exploded
# frame cannot be used for this: explode repeats the parent row's label, so every
# finding of one paper would print the same number.
doi_findings = new_findings_exploded.loc[new_findings_exploded["doi"] == "10.1086/164480", "findings"]
for i, finding in enumerate(doi_findings):
    print(f"Finding {i}:")
    pprint(finding)
    print("-----")