In [146]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
from citeline.database.milvusdb import MilvusDB
from citeline.embedders import Embedder
from citeline.query_expander import get_expander
from citeline.statistics import compute_statistics

db = MilvusDB()
print(db)

tqdm.pandas()

embedder = Embedder.create("Qwen/Qwen3-Embedding-0.6B", device="mps", normalize=True)
print(embedder)

expander = get_expander("add_prev_3", path_to_data="../data/preprocessed/reviews.jsonl")
print(expander)

<citeline.database.milvusdb.MilvusDB object at 0x51bb5fd90>
Qwen/Qwen3-Embedding-0.6B, device=mps, normalize=True, dim=1024
QueryExpander(name=add_prev_3, data_length=2980)


In [148]:
sample = pd.read_json("../data/dataset/nontrivial_100.jsonl", lines=True)
sample = sample.sample(50, random_state=42).reset_index(drop=True)

# Apply query expansion and embed the queries
sample["sent_no_cit"] = expander(sample)
sample["vector"] = sample.progress_apply(lambda row: embedder([row["sent_no_cit"]])[0], axis=1)

# # Expand any rows with more than one target doi
# sample_normalized = sample.explode("citation_dois").reset_index(drop=True)
# print(f"Number of rows after exploding: {len(sample_normalized)}")

# # Get all the target dois
# target_dois = list(set(sample_normalized.citation_dois.tolist()))
# print(f"Number of distinct target dois: {len(target_dois)}")

100%|██████████| 50/50 [00:03<00:00, 14.35it/s]


In [149]:
db.list_collections()
db.client.load_collection("qwen06_contributions")

Collections:
 - astrobert_chunks: 460801 entities
 - astrobert_contributions: 89860 entities
 - bge_chunks: 460801 entities
 - bge_contributions: 89860 entities
 - nasa_chunks: 460801 entities
 - nasa_contributions: 89860 entities
 - qwen06_chunks: 460801 entities
 - qwen06_contributions: 89860 entities
 - qwen8b_contributions: 89860 entities
 - specter_chunks: 460801 entities
 - specter_contributions: 89860 entities


In [150]:
# Get the 'hard examples' for each query; e.g. the top most similar papers that are in the query's target
hard_examples = []

def get_hard_examples(example: pd.Series, n: int=2) -> list[str]:
    """
    Overfetches 3*n most similar records (bc if two reps from same doc are in top n, we won't have n distinct non-target dois)

    Returns:
      A list of doi's, ordered by their max similarity to the query
    """
    results = db.search(
        collection_name="qwen06_contributions",
        query_records=[example.to_dict()],
        query_vectors=[example.vector],
        limit=3*n,
    )
    results = results[0]
    
    # Filter results to non-targets only
    target_dois = set(example.citation_dois)
    non_target_results = [r for r in results if r['doi'] not in target_dois]
    assert len(non_target_results) >= n, f"Not enough non-target results for example {example}"
    hard_dois = [r['doi'] for r in non_target_results[:n]]
    hard_distances = [r['metric'] for r in non_target_results[:n]]
    return hard_dois, hard_distances


test_example = sample.iloc[0]
hard_dois, hard_similarities = get_hard_examples(test_example, n=2)
print(f"Test example has targets: {test_example.citation_dois}")
pprint(hard_dois)
pprint(hard_similarities)


Test example has targets: ['10.1046/j.1365-8711.2002.04940.x']
['10.1088/0004-637X/703/2/1416', '10.1111/j.1365-2966.2004.08313.x']
[0.6299333572387695, 0.5999367237091064]


In [152]:
# Get the hard examples' doi's and similarity scores for each query in the example
sample['hard_dois'] = None
sample["hard_similarities"] = None
for idx, example in tqdm(sample.iterrows(), total=len(sample)):
    hard_dois, hard_similarities = get_hard_examples(example, n=5)
    sample.at[idx, 'hard_dois'] = hard_dois
    sample.at[idx, 'hard_similarities'] = hard_similarities

100%|██████████| 50/50 [00:02<00:00, 17.22it/s]


In [153]:
sample.head()

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked,vector,hard_dois,hard_similarities
0,10.1146/annurev-astro-081710-102521,Their abundance is important because molecular...,"In this limit, the important reactions are dis...",318,[10.1046/j.1365-8711.2002.04940.x],20110901,[2002MNRAS.329...18F],Their abundance is important because molecular...,"[-0.036558766, -0.016381446, -0.011056783, 0.0...","[10.1088/0004-637X/703/2/1416, 10.1111/j.1365-...","[0.6299333572387695, 0.5999367237091064, 0.593..."
1,10.1146/annurev-astro-081817-051826,It is important to point out that the fraction...,Gesicki et al. (2014) derived masses and ages ...,231,"[10.1051/0004-6361/201220678, 10.1051/0004-636...",20180901,"[2013A&A...549A.147B, 2017A&A...605A..89B]",It is important to point out that the fraction...,"[-0.016821792, -0.014033473, -0.008635541, 0.0...","[10.1093/mnras/stx373, 10.1051/0004-6361:20021...","[0.6454620361328125, 0.6320536732673645, 0.629..."
2,10.1007/s00159-010-0029-x,How could the seed massive black holes have gr...,This argument is particularly important at ear...,259,"[10.1086/422910, 10.1086/427065, 10.1086/50744...",20100701,"[2004ApJ...613...36H, 2005ApJ...620...59S, 200...",How could the seed massive black holes have gr...,"[-0.018864863, -0.06410713, -0.0063532544, 0.0...","[10.1111/j.1365-2966.2006.10467.x, 10.1111/j.1...","[0.7770614624023438, 0.7540679574012756, 0.751..."
3,10.1146/annurev.aa.31.090193.003441,Nature has somehow solved this problem in doub...,These models can reproduce the observed spectr...,457,[10.1086/161053],19930101,[1983ApJ...269..423R],Nature has somehow solved this problem in doub...,"[0.044838626, -0.013867084, -0.0075066998, 0.0...","[10.1086/164480, 10.1086/155083, 10.1093/mnras...","[0.661096453666687, 0.6198840141296387, 0.6177..."
4,10.1007/s00159-012-0055-y,"However, a similar linewidth–size scaling law ...","Size, internal velocity dispersion and column ...",306,[10.1051/0004-6361:20020629],20121101,[2002A&A...390..307O],"However, a similar linewidth–size scaling law ...","[-0.009350319, -0.05616111, -0.008808155, -0.0...","[10.1086/169766, 10.1086/177465, 10.1086/18481...","[0.6350905299186707, 0.5991687178611755, 0.595..."


In [154]:
def average_target_similarity(example: pd.Series) -> float:
    """
    For examples with a single target doi, this computes the maximum similarity between the query vector and
    all vectors associated with the target doi in the database.

    For examples with multiple target dois, this computes the average maximum similarity across each target doi.
    """
    metric_values = []
    for target_doi in example.citation_dois:
        target_results = db.select_by_doi(doi=target_doi, collection_name="qwen06_contributions")
        target_vectors = np.array(target_results['vector'].tolist())
        dot_products = np.dot(example.vector, target_vectors.T)
        metric_values.append(max(dot_products))
    return np.mean(metric_values) if metric_values else 0.0


def average_nontarget_similarity(example: pd.Series) -> float:
    """
    Computes the average similarity between the query vector and the hard non-target examples.
    """
    return np.mean(example.hard_similarities) if example.hard_similarities else 0.0

def similarity_margin(example: pd.Series) -> float:
    """
    Computes the difference between the average target similarity and the average non-target similarity.
    """
    return average_target_similarity(example) - average_nontarget_similarity(example)
print(f"Average target similarity: {average_target_similarity(sample.iloc[10])}")
print(f"Average non-target similarity: {average_nontarget_similarity(sample.iloc[10])}")
print(f"Similarity margin: {similarity_margin(sample.iloc[10])}")

Average target similarity: 0.6101513374525258
Average non-target similarity: 0.6174222111701966
Similarity margin: -0.007270873717670767


Using our LLM, we need to reprocess the target doi's and the hard example non-target doi's

In [139]:
research = pd.read_json("../data/research_used.jsonl", lines=True)
def doi_to_paper(doi: str) -> str:
    record = research[research['doi'] == doi].iloc[0]
    return record['title'] + "\n\n" + record['abstract'] + "\n\n" + record['body']

# Test:
doi = "10.1088/0004-637X/703/2/1416"
print(doi_to_paper(doi)[:500])

A New Calculation of the Ionizing Background Spectrum and the Effects of He II Reionization

The ionizing background determines the ionization balance and the thermodynamics of the cosmic gas. It is therefore a fundamental ingredient to theoretical and empirical studies of both the intergalactic medium (IGM) and galaxy formation. We present here a new calculation of its spectrum that satisfies the empirical constraints we recently obtained by combining state-of-the-art luminosity functions and i


In [142]:
full_results = []
for idx, example in tqdm(sample.iterrows(), total=len(sample)):
    results = db.search(
        collection_name="qwen06_contributions",
        query_records=[example.to_dict()],
        query_vectors=[example.vector],
        limit=10,
    )
    this_result = {"record": example, "results": results[0]}
    full_results.append(this_result)

100%|██████████| 50/50 [00:02<00:00, 16.74it/s]


In [143]:
stats = compute_statistics(full_results)
pprint(stats)

Computing statistics: 100%|██████████| 50/50 [00:00<00:00, 5906.14it/s]

{'hitrate': array([0.26, 0.36, 0.38, 0.4 , 0.44, 0.44, 0.46, 0.5 , 0.58, 0.58]),
 'iou': array([0.19666667, 0.18833333, 0.17666667, 0.16366667, 0.154     ,
       0.12852381, 0.10692857, 0.10246032, 0.10349278, 0.08976335]),
 'recall': array([0.19666667, 0.27333333, 0.30333333, 0.34      , 0.38      ,
       0.38      , 0.385     , 0.425     , 0.49166667, 0.49166667])}





In [None]:
# THIS IS THE ITERATION 

# Get a set of all the hard doi's
# Get a set of all the citation doi's
hard_dois = set(doi for example in sample['hard_dois'] for doi in example)
target_dois = set(doi for example in sample['citation_dois'] for doi in example)
dois_to_process = hard_dois.union(target_dois)
print(f"Number of distinct dois to process: {len(dois_to_process)}")

# retrieve their papers and process their contributions using the LLM

# create a database table for these new contributions and insert them

# create a copy of the sample df, dropping the previous hard example data
# compute the new hard examples based on the new embeddings

Number of distinct dois to process: 276


In [156]:
from openai import OpenAI
import os

def bind_client(func):
    """
    Decorator to bind OpenAI client to a function that will provide DeepSeek API access
    """
    client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
    def wrapper(*args, **kwargs):
        return func(client, *args, **kwargs)
    return wrapper

@bind_client
def deepseek(client, prompt: str) -> str:
    """
    Sends a prompt to the DeepSeek API (using DeepSeek-V3.1 non-thinking model)
    
    Expects a prompt that will instruct the model to respond with a JSON object.
    However, the function returns the raw string response, to allow for validation and
    error handling in multiple passes without losing the original response
    """
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": prompt}
        ],
        stream=False,
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content



response = deepseek("Respond with a JSON object with keys 'greeting' and 'farewell'")
print(response)
print(json.loads(response))

{"greeting": "Hello!", "farewell": "Goodbye!"}
{'greeting': 'Hello!', 'farewell': 'Goodbye!'}


In [161]:
with open("../src/citeline/llm/prompts/original_contributions_revised.txt", "r") as f:
    prompt_template = f.read()

with open("new_findings.jsonl", "w") as f:
    for doi in tqdm(dois_to_process):
        paper = doi_to_paper(doi)
        prompt = prompt_template.format(paper=paper)
        try:
            response = deepseek(prompt)
        except Exception as e:
            print(f"Error processing doi {doi}: {e}")
            continue
        try:
            data = json.loads(response)
            data['doi'] = doi
            f.write(json.dumps(data) + "\n")
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for doi {doi}. Response was:\n{response}")
            continue




100%|██████████| 276/276 [1:59:27<00:00, 25.97s/it]  


In [160]:
lst =json.loads(response)
pprint(lst['findings'])

['Kepler mission will monitor over 100,000 stars with a cadence of 30 min for '
 'asteroseismic investigation',
 'Kepler Asteroseismic Investigation (KAI) will have 11,800 pixels allocated '
 'for short cadence (60 s) observations in specific target phase',
 'Conservative estimate: ~3000 pixels allocated to F, G, and K main sequence '
 'stars suitable for studying stellar cycles',
 'Photometric error for 60-s integration: 45 ppm at 8th magnitude, 69 ppm at '
 '9th magnitude, 111 ppm at 10th magnitude',
 'Frequency precision determined by S/N of oscillation modes, mode lifetime τ, '
 'and observation length T using Libbrecht (1992) formulation',
 'Mode height decreases from ~4000 ppm²/μHz for Sun to just over 1000 ppm²/μHz '
 'for stars at 4500 K',
 'Power density error: 0.24 ppm²/μHz at 8th magnitude, 0.57 ppm²/μHz at 9th '
 'magnitude, 1.47 ppm²/μHz at 10th magnitude',
 'Rotational frequency splitting decreases towards lower effective temperature '
 'with steeper slope for stars hotte

In [None]:
# from citeline.llm.llm_function import LLMFunction
# from citeline.llm.models import Findings

# write_contributions = LLMFunction(
#     model_name="deepseek-r1:70b",
#     prompt_path="../src/citeline/llm/prompts/original_contributions_revised.txt",
#     output_model=Findings,
# )

# write_contributions(prompt_kwargs={"paper": doi_to_paper(doi)})

LLM response: findings=['The ionizing background at high redshifts is local, meaning the specific intensity Jν depends mainly on the local emissivity εν due to the short mean free path of photons.', 'The mean free path Δl mfp(ν,z) is proportional to ν^3 for H I and ν^6 for He II when considering uniform absorbing material.', 'For discrete absorbers with a column density distribution dN/dN_i ∝ N_i^-β, the mean free path scales as ν^(3(β-1)), leading to spectral hardening.', 'The specific intensity Jν is hardened such that Jν ∝ εν ν^3 near the H I ionization edge and Jν ∝ εν ν^6 near the He II edge when absorbers are uniformly distributed.', 'In the case of discrete absorbers, the hardening is weaker: Jν ∝ εν ν^(3(β-1)) for H I with β=1.4, resulting in a less pronounced spectral change.', 'For an isolated point source, the spectrum is attenuated by a factor exp(-τν(l)), where τν(l) = n_i σ_i(ν) l, leading to different hardening compared to uniform emission.', 'The photoionization cross-s

Findings(findings=['The ionizing background at high redshifts is local, meaning the specific intensity Jν depends mainly on the local emissivity εν due to the short mean free path of photons.', 'The mean free path Δl mfp(ν,z) is proportional to ν^3 for H I and ν^6 for He II when considering uniform absorbing material.', 'For discrete absorbers with a column density distribution dN/dN_i ∝ N_i^-β, the mean free path scales as ν^(3(β-1)), leading to spectral hardening.', 'The specific intensity Jν is hardened such that Jν ∝ εν ν^3 near the H I ionization edge and Jν ∝ εν ν^6 near the He II edge when absorbers are uniformly distributed.', 'In the case of discrete absorbers, the hardening is weaker: Jν ∝ εν ν^(3(β-1)) for H I with β=1.4, resulting in a less pronounced spectral change.', 'For an isolated point source, the spectrum is attenuated by a factor exp(-τν(l)), where τν(l) = n_i σ_i(ν) l, leading to different hardening compared to uniform emission.', 'The photoionization cross-sectio

In [84]:
# Extract a single example
idx = 1
example = sample_normalized.iloc[idx]

example_dict = example.to_dict()
for key, value in example_dict.items():
    print(f"{key}: {str(value)[:100]}")  # Print first 100 characters of each value

print("===")
print("Full query text:")
pprint(example_dict["sent_no_cit"])

source_doi: 10.1146/annurev-astro-081817-051826
sent_original: It is important to point out that the fraction of intermediate age/young dwarf stars in the Bulge, s
sent_no_cit: Gesicki et al. (2014) derived masses and ages of central stars of PNe and find ages of 3-10 Gyr. The
sent_idx: 231
citation_dois: 10.1051/0004-6361/201220678
pubdate: 20180901
resolved_bibcodes: ['2013A&A...549A.147B', '2017A&A...605A..89B']
sent_cit_masked: It is important to point out that the fraction of intermediate age/young dwarf stars in the Bulge, s
vector: [-0.01682179 -0.01403347 -0.00863554 ... -0.03616085  0.03525817
 -0.04730674]
===
Full query text:
('Gesicki et al. (2014) derived masses and ages of central stars of PNe and '
 'find ages of 3-10 Gyr. They derive ages scaling them to those by Bensby et '
 'al. (2013), therefore it is not an independent determination, and further '
 'investigation is needed. In conclusion, there might be presence of '
 'intermediate-age objects in the Bulge, although

In [85]:
example_dict["sent_no_cit"]

'Gesicki et al. (2014) derived masses and ages of central stars of PNe and find ages of 3-10 Gyr. They derive ages scaling them to those by Bensby et al. (2013), therefore it is not an independent determination, and further investigation is needed. In conclusion, there might be presence of intermediate-age objects in the Bulge, although in small numbers. It is important to point out that the fraction of intermediate age/young dwarf stars in the Bulge, suggested by results from , has to show a counterpart as C-rich Miras, and PNe.'

In [86]:
# For each row in the samples, we want to get its 'hard examples', the highest ranked dois that are not the target doi
# According to the current embedding strategy (qwen06+prev3 / contributions)

results = db.search(
    collection_name="qwen06_contributions",
    query_records=[example_dict],
    query_vectors=[example_dict["vector"]],
    limit=10,
)
results = results[0]
print(f"Number of results: {len(results)}")
unique_dois = set(result["doi"] for result in results)
print(f"Number of unique DOIs in results: {len(unique_dois)}")

target_doi = example_dict["citation_dois"]
print(f"Target DOI ({target_doi}) in top k: {target_doi in unique_dois}")

# Get the DOIs appearing in the top 5 results
# check if you're getting 5 distinct dois typically, or if you need to get more
# look at the text from those top results vs the query text

# Get the target doi
# Why is this query citing that target? Identify the reason

# Compute before stats

Number of results: 10
Number of unique DOIs in results: 8
Target DOI (10.1051/0004-6361/201220678) in top k: False


In [87]:
final_results = [{"record": sample.iloc[idx], "results": results}]
stats = compute_statistics(final_results)
for stat, values_at_k in stats.items():
    print(f"{stat:<8}: {values_at_k}")

Computing statistics: 100%|██████████| 1/1 [00:00<00:00, 2142.14it/s]

hitrate : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
iou     : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
recall  : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]





In [83]:
results

[{'doi': '10.1088/0004-637X/703/2/1416',
  'pubdate': 20091001,
  'citation_count': 615,
  'text': 'A simple analytic model is provided that captures the main effects seen in numerical radiative transfer calculations, clarifying how recombination emission contributes to the ionization rates.',
  'metric': 0.6299333572387695},
 {'doi': '10.1046/j.1365-8711.2002.04940.x',
  'pubdate': 20020101,
  'citation_count': 215,
  'text': 'A minimal fraction of the cosmic abundance of metal atoms can significantly influence the ionization balance within the disc.',
  'metric': 0.6072202920913696},
 {'doi': '10.1111/j.1365-2966.2004.08313.x',
  'pubdate': 20041101,
  'citation_count': 286,
  'text': 'Found that ionic abundances obtained from recombination lines are larger than those derived from collisionally excited lines for all ions where both kinds of lines were measured.',
  'metric': 0.5999367237091064},
 {'doi': '10.1086/316190',
  'pubdate': 19980701,
  'citation_count': 2386,
  'text': 'To

1. Try an update to the prompt
1. Generate the new doc expansions for all papers in dataset or hard examples
1. see if stats improved
   - margin between first target result and first result in cosine distance?
   - hitrate, recall, iou?


In [98]:
doc = "A significant fraction of young and intermediate-age stars in the bulge cannot be fully explained by sampling biases or an He-enriched population, pointing to a complex star formation history."
vec = embedder([doc])[0]
print(np.dot(example["vector"], vec))

0.6248175


In [91]:
research = pd.read_json("../data/research_used.jsonl", lines=True)
research = research[research.doi.isin(target_dois)].reset_index(drop=True)
print(len(research))

73


In [92]:
target = research[research.doi == target_doi].iloc[0]
print(target)

bibcode                                         2013A&A...549A.147B
abstract          Based on high-resolution spectra obtained duri...
aff               [Lund Observatory, Department of Astronomy and...
author            [Bensby, T., Yee, J. C., Feltzing, S., Johnson...
bibstem                                            [A&A, A&A...549]
doctype                                                     article
doi                                     10.1051/0004-6361/201220678
id                                                          2523730
pubdate                                                  2013-01-01
title             Chemical evolution of the Galactic bulge as tr...
read_count                                                       54
reference         [1962ApJ...136..748E, 1975A&A....42..407G, 198...
data                                      [CDS:1, ESO:4, SIMBAD:61]
citation_count                                                  430
citation          [2013A&A...554A..44A, 2013A&A.

In [93]:
full_text = target["title"] + "\n" + target["abstract"] + "\n" + target["body"]
with open("temp_paper.txt", "w") as f:
    f.write(full_text)

In [14]:
db.list_collections()

Collections:
 - astrobert_chunks: 460801 entities
 - astrobert_contributions: 89860 entities
 - bge_chunks: 460801 entities
 - bge_contributions: 89860 entities
 - nasa_chunks: 460801 entities
 - nasa_contributions: 89860 entities
 - qwen06_chunks: 460801 entities
 - qwen8b_contributions: 89860 entities
 - specter_chunks: 460801 entities
 - specter_contributions: 89860 entities
