In [None]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
from citeline.database.milvusdb import MilvusDB
from citeline.embedders import Embedder
from citeline.query_expander import get_expander
from citeline.statistics import compute_statistics

# Notebook-wide settings for the embedding model and the query-expansion strategy.
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"
EMBEDDING_DEVICE = "mps"
EXPANSION_STRATEGY = "add_prev_3"
REVIEWS_PATH = "../data/preprocessed/reviews.jsonl"

# Database client used for the vector searches further down.
db = MilvusDB()
print(db)

# Registers `progress_apply` on pandas objects (used when embedding the queries).
tqdm.pandas()

# Embedder configured to return normalized vectors.
embedder = Embedder.create(EMBEDDING_MODEL, device=EMBEDDING_DEVICE, normalize=True)
print(embedder)

# Query expander, built from the preprocessed reviews file.
expander = get_expander(EXPANSION_STRATEGY, path_to_data=REVIEWS_PATH)
print(expander)

<citeline.database.milvusdb.MilvusDB object at 0x10ca58dd0>
Qwen/Qwen3-Embedding-0.6B, device=mps, normalize=True, dim=1024
QueryExpander(name=add_prev_3, data_length=2980)


In [None]:
# Load the evaluation set and draw a reproducible 50-row subsample.
dataset = pd.read_json("../data/dataset/nontrivial_100.jsonl", lines=True)
sample = dataset.sample(50, random_state=42).reset_index(drop=True)


def _embed_query(row):
    """Embed the (expanded) query text of one row and return its vector."""
    return embedder([row["sent_no_cit"]])[0]


# Replace each query with its expanded form, then embed it row by row.
sample["sent_no_cit"] = expander(sample)
sample["vector"] = sample.progress_apply(_embed_query, axis=1)

# One row per (query, target DOI) pair: rows citing several DOIs are split up.
sample_normalized = sample.explode("citation_dois").reset_index(drop=True)
print(f"Number of rows after exploding: {len(sample_normalized)}")

# Distinct target DOIs across the whole sample (set-derived, so order is arbitrary).
all_target_dois = sample_normalized["citation_dois"].tolist()
target_dois = list(set(all_target_dois))
print(f"Number of distinct target dois: {len(target_dois)}")

100%|██████████| 50/50 [00:08<00:00,  5.64it/s]

Number of rows after exploding: 73
Number of distinct target dois: 73





In [49]:
# Preview the first rows of the sample, including the expanded query text and its embedding.
sample.head()

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked,vector
0,10.1146/annurev-astro-081710-102521,Their abundance is important because molecular...,"In this limit, the important reactions are dis...",318,[10.1046/j.1365-8711.2002.04940.x],20110901,[2002MNRAS.329...18F],Their abundance is important because molecular...,"[-0.036558766, -0.016381446, -0.011056783, 0.0..."
1,10.1146/annurev-astro-081817-051826,It is important to point out that the fraction...,Gesicki et al. (2014) derived masses and ages ...,231,"[10.1051/0004-6361/201220678, 10.1051/0004-636...",20180901,"[2013A&A...549A.147B, 2017A&A...605A..89B]",It is important to point out that the fraction...,"[-0.016821792, -0.014033473, -0.008635541, 0.0..."
2,10.1007/s00159-010-0029-x,How could the seed massive black holes have gr...,This argument is particularly important at ear...,259,"[10.1086/422910, 10.1086/427065, 10.1086/50744...",20100701,"[2004ApJ...613...36H, 2005ApJ...620...59S, 200...",How could the seed massive black holes have gr...,"[-0.018864863, -0.06410713, -0.0063532544, 0.0..."
3,10.1146/annurev.aa.31.090193.003441,Nature has somehow solved this problem in doub...,These models can reproduce the observed spectr...,457,[10.1086/161053],19930101,[1983ApJ...269..423R],Nature has somehow solved this problem in doub...,"[0.044838626, -0.013867084, -0.0075066998, 0.0..."
4,10.1007/s00159-012-0055-y,"However, a similar linewidth–size scaling law ...","Size, internal velocity dispersion and column ...",306,[10.1051/0004-6361:20020629],20121101,[2002A&A...390..307O],"However, a similar linewidth–size scaling law ...","[-0.009350319, -0.05616111, -0.008808155, -0.0..."


In [None]:
# Inspect a query that cites several DOIs; `explode` turns these into separate rows.
sample.iloc[2]["citation_dois"]

['10.1086/422910',
 '10.1086/427065',
 '10.1086/507444',
 '10.1088/0004-637X/696/2/1798']

In [None]:
# Print every collection with its entity count, then load the collection searched below.
db.list_collections()
db.client.load_collection("qwen06_contributions")

Collections:
 - astrobert_chunks: 460801 entities
 - astrobert_contributions: 89860 entities
 - bge_chunks: 460801 entities
 - bge_contributions: 89860 entities
 - nasa_chunks: 460801 entities
 - nasa_contributions: 89860 entities
 - qwen06_chunks: 460801 entities
 - qwen06_contributions: 89860 entities
 - qwen8b_contributions: 89860 entities
 - specter_chunks: 460801 entities
 - specter_contributions: 89860 entities


In [84]:
# Pick one (query, target DOI) row from the exploded sample to study in detail.
idx = 1
example = sample_normalized.iloc[idx]
example_dict = example.to_dict()

# Field-by-field summary; every value is cut to its first 100 characters.
for field in example_dict:
    preview = str(example_dict[field])[:100]
    print(f"{field}: {preview}")

# Then the complete query text, wrapped by pprint for readability.
print("===", "Full query text:", sep="\n")
pprint(example_dict["sent_no_cit"])

source_doi: 10.1146/annurev-astro-081817-051826
sent_original: It is important to point out that the fraction of intermediate age/young dwarf stars in the Bulge, s
sent_no_cit: Gesicki et al. (2014) derived masses and ages of central stars of PNe and find ages of 3-10 Gyr. The
sent_idx: 231
citation_dois: 10.1051/0004-6361/201220678
pubdate: 20180901
resolved_bibcodes: ['2013A&A...549A.147B', '2017A&A...605A..89B']
sent_cit_masked: It is important to point out that the fraction of intermediate age/young dwarf stars in the Bulge, s
vector: [-0.01682179 -0.01403347 -0.00863554 ... -0.03616085  0.03525817
 -0.04730674]
===
Full query text:
('Gesicki et al. (2014) derived masses and ages of central stars of PNe and '
 'find ages of 3-10 Gyr. They derive ages scaling them to those by Bensby et '
 'al. (2013), therefore it is not an independent determination, and further '
 'investigation is needed. In conclusion, there might be presence of '
 'intermediate-age objects in the Bulge, although

In [85]:
# Show the expanded query as one raw string (without pprint's line wrapping).
example_dict["sent_no_cit"]

'Gesicki et al. (2014) derived masses and ages of central stars of PNe and find ages of 3-10 Gyr. They derive ages scaling them to those by Bensby et al. (2013), therefore it is not an independent determination, and further investigation is needed. In conclusion, there might be presence of intermediate-age objects in the Bulge, although in small numbers. It is important to point out that the fraction of intermediate age/young dwarf stars in the Bulge, suggested by results from , has to show a counterpart as C-rich Miras, and PNe.'

In [86]:
# Goal: for every row in the sample, collect its "hard examples" -- the highest-ranked
# DOIs that are not the target DOI -- under the current embedding strategy
# (qwen06 + prev3 query expansion, searched against contributions).
# Here the search is run for the single example extracted earlier.
search_batches = db.search(
    collection_name="qwen06_contributions",
    query_records=[example_dict],
    query_vectors=[example_dict["vector"]],
    limit=10,
)
# A single query was submitted, so take the first (only) hit list.
results = search_batches[0]
print(f"Number of results: {len(results)}")

# Several hits can share a DOI, so count the distinct ones.
unique_dois = {hit["doi"] for hit in results}
print(f"Number of unique DOIs in results: {len(unique_dois)}")

# After exploding, `citation_dois` holds exactly one target DOI for this row.
target_doi = example_dict["citation_dois"]
target_in_top_k = target_doi in unique_dois
print(f"Target DOI ({target_doi}) in top k: {target_in_top_k}")

# TODO:
# - Get the DOIs appearing in the top 5 results.
#   Check whether you typically get 5 distinct DOIs, or whether you need to fetch more.
#   Compare the text of those top results with the query text.
# - Get the target DOI.
#   Why is this query citing that target? Identify the reason.
# - Compute the "before" stats.

Number of results: 10
Number of unique DOIs in results: 8
Target DOI (10.1051/0004-6361/201220678) in top k: False


In [87]:
# `idx` is a position in the exploded frame (`sample_normalized`). Its rows drift away
# from `sample` as soon as an earlier query cites more than one DOI, so `sample.iloc[idx]`
# only picks the right query by coincidence. Look the un-exploded row up by key instead;
# that row still carries the full list of target DOIs for the query.
# NOTE(review): assumes (source_doi, sent_idx) identifies a query uniquely -- TODO confirm.
is_source_row = (sample["source_doi"] == example["source_doi"]) & (
    sample["sent_idx"] == example["sent_idx"]
)
record = sample[is_source_row].iloc[0]

# Score the single (record, results) pair and print each statistic's per-k values.
final_results = [{"record": record, "results": results}]
stats = compute_statistics(final_results)
for stat, values_at_k in stats.items():
    print(f"{stat:<8}: {values_at_k}")

Computing statistics: 100%|██████████| 1/1 [00:00<00:00, 2142.14it/s]

hitrate : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
iou     : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
recall  : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]





In [83]:
# Display the raw hits (doi, pubdate, citation_count, text, metric) from the latest search.
results

[{'doi': '10.1088/0004-637X/703/2/1416',
  'pubdate': 20091001,
  'citation_count': 615,
  'text': 'A simple analytic model is provided that captures the main effects seen in numerical radiative transfer calculations, clarifying how recombination emission contributes to the ionization rates.',
  'metric': 0.6299333572387695},
 {'doi': '10.1046/j.1365-8711.2002.04940.x',
  'pubdate': 20020101,
  'citation_count': 215,
  'text': 'A minimal fraction of the cosmic abundance of metal atoms can significantly influence the ionization balance within the disc.',
  'metric': 0.6072202920913696},
 {'doi': '10.1111/j.1365-2966.2004.08313.x',
  'pubdate': 20041101,
  'citation_count': 286,
  'text': 'Found that ionic abundances obtained from recombination lines are larger than those derived from collisionally excited lines for all ions where both kinds of lines were measured.',
  'metric': 0.5999367237091064},
 {'doi': '10.1086/316190',
  'pubdate': 19980701,
  'citation_count': 2386,
  'text': 'To

1. Try an update to the prompt.
1. Generate the new document expansions for all papers in the dataset, or only for the hard examples.
1. See whether the stats improved:
   - Margin, in cosine distance, between the first target result and the top-ranked result?
   - Hit rate, recall, IoU?


In [98]:
# Hand-written candidate contribution statement for the target paper.
doc = "A significant fraction of young and intermediate-age stars in the bulge cannot be fully explained by sampling biases or an He-enriched population, pointing to a complex star formation history."
vec = embedder([doc])[0]

# Score the candidate against the example query. The embedder was created with
# normalize=True, so this dot product is presumably the cosine similarity.
similarity = np.dot(example["vector"], vec)
print(similarity)

0.6248175


In [91]:
# Read the research-paper records and keep only those whose DOI is a target of the sample.
research = (
    pd.read_json("../data/research_used.jsonl", lines=True)
    .loc[lambda papers: papers["doi"].isin(target_dois)]
    .reset_index(drop=True)
)
print(len(research))

73


In [92]:
# Pull the record of the paper the example query is supposed to retrieve.
matching_papers = research.loc[research["doi"] == target_doi]
target = matching_papers.iloc[0]
print(target)

bibcode                                         2013A&A...549A.147B
abstract          Based on high-resolution spectra obtained duri...
aff               [Lund Observatory, Department of Astronomy and...
author            [Bensby, T., Yee, J. C., Feltzing, S., Johnson...
bibstem                                            [A&A, A&A...549]
doctype                                                     article
doi                                     10.1051/0004-6361/201220678
id                                                          2523730
pubdate                                                  2013-01-01
title             Chemical evolution of the Galactic bulge as tr...
read_count                                                       54
reference         [1962ApJ...136..748E, 1975A&A....42..407G, 198...
data                                      [CDS:1, ESO:4, SIMBAD:61]
citation_count                                                  430
citation          [2013A&A...554A..44A, 2013A&A.

In [93]:
# Dump the target paper (title, abstract, body) to a scratch file for manual reading.
full_text = "\n".join([target["title"], target["abstract"], target["body"]])

# Pin the encoding: paper text contains non-ASCII characters, and without an explicit
# encoding `open` falls back to the platform locale, which may not be UTF-8.
with open("temp_paper.txt", "w", encoding="utf-8") as f:
    f.write(full_text)

In [14]:
# Re-list the collections and their entity counts to check what currently exists.
db.list_collections()

Collections:
 - astrobert_chunks: 460801 entities
 - astrobert_contributions: 89860 entities
 - bge_chunks: 460801 entities
 - bge_contributions: 89860 entities
 - nasa_chunks: 460801 entities
 - nasa_contributions: 89860 entities
 - qwen06_chunks: 460801 entities
 - qwen8b_contributions: 89860 entities
 - specter_chunks: 460801 entities
 - specter_contributions: 89860 entities
