In [None]:
from database.milvusdb import MilvusDB
from embedders import Embedder
import pandas as pd
from pprint import pprint
db = MilvusDB()
embedder = Embedder.create(model_name="Qwen/Qwen3-Embedding-0.6B", device="mps", normalize=True, for_queries=True)
dataset = pd.read_json("data/dataset/nontrivial_100.jsonl", lines=True)

print(db)
print(embedder)
print(dataset.columns)

In [None]:
# Get the target doi from the dataset
research = pd.read_json("data/research_used.jsonl", lines=True)


In [None]:
from query_expander import QueryExpander
reference_data = pd.read_json("data/preprocessed/reviews.jsonl", lines=True)
prev3_expander = QueryExpander("add_prev_3", reference_data=reference_data)

In [None]:
# Get one batch
example_batch = dataset.iloc[2:5]
print(type(example_batch))

In [None]:
# Get its top-k results
queries = prev3_expander(example_batch)
query_vectors = embedder(queries)
print(f"Vector shape: {query_vectors.shape}")

search_results = db.search(
    collection_name="qwen06_chunks",
    query_records=example_batch.to_dict(orient="records"),
    query_vectors=query_vectors,
    metric="IP",
    limit=200
)
search_results = search_results
print(f"Got {len(search_results)} results, length {len(search_results[0])} for first query")

# Where are the target queries located?

In [None]:
queries[1]

'Because the observed YMCs will age to become old GCs (see Section 1), they provide a convenient test bed for the study of the progenitors of stellar exotica. The progenitors of observed lusus naturæ in old GCs are not necessarily easy to identify in the young cluster population, though in some cases the evolutionary link is well established ( Glebbeek, Pols Hurley 2008 ). There may well be entire populations of peculiar objects in young star clusters that do not lead to observable interesting objects at later stages, and some objects destined for peculiarity may look perfectly ordinary at early times. An example of the latter is the dormant blue straggler population consisting of stars that were rejuvenated by mass transfer or collisions while still on the main sequence and now lurk among their fellow main-sequence stars until they remain behind after the others traverse the Hertzsprung gap (  ).'



In [None]:
example_batch.iloc[1]

In [None]:
# At what rank does the target citation appear?
idx = 1
citation_dois = example_batch.iloc[idx]['citation_dois']
print(f"Citation DOIs: {citation_dois}")
target_citation = citation_dois[0]
print(f"Target citation: {target_citation}")

these_search_results = search_results[idx]
target_found_at = [i for i, result in enumerate(these_search_results) if result['doi'] == target_citation]
print(f"Target found at ranks: {target_found_at}")
best_rank = target_found_at[0]
pprint(these_search_results[best_rank])

In [None]:
target_doi = research[research["doi"] == target_citation].iloc[0]
print(target_doi)

In [None]:
pprint(these_search_results[83])

In [None]:
# print(target_doi['body'].index("The wind momentum"))
print(target_doi['body'][43867:43867+518])

In [None]:
import numpy as np

text = "The most frequent type of encounter is one involving two main-sequence stars, leading to a main-sequence merger remnant with a mass smaller than the turnoff mass or a blue straggler when the mass of the merger exceeds the turnoff mass. If the mass of the merger is less than the turnoff mass, the product is a main-sequence star which is younger than primordial main-sequence stars with the same mass. Such a star will be left behind as a blue straggler once the primordial main-sequence stars leave the main-sequence."
text = "Blue stragglers can form from mass-transfer or collisions on the main sequence and may remain indistinguishable among normal stars until the original stars evolve off the main sequence"
test_embedding = embedder([text])[0]
print(test_embedding.shape)
print(np.linalg.norm(test_embedding))

query_vector = query_vectors[idx]
similarity = np.dot(test_embedding, query_vector)
print(f"Similarity score: {similarity}")

In [None]:
sent_no_cit = example_batch.iloc[1]['sent_no_cit']
pprint(f"Sentence without citation: {sent_no_cit}")
sent_vector = embedder([sent_no_cit])[0]

# Remove content of sent_no_cit from query[1]
prev3 = queries[1].replace(sent_no_cit, "")
pprint(f"Previous 3 sentences: {prev3}")

prev3_vector = embedder([prev3])[0]

In [133]:
weighted_vector = 3.25*sent_vector + 0.75*prev3_vector
# Normalize the vector
weighted_vector = weighted_vector / np.linalg.norm(weighted_vector)
print(np.linalg.norm(weighted_vector))

1.0


In [134]:
reweight_search_results = db.search(
    collection_name="qwen06_chunks",
    query_records=example_batch.iloc[1:2].to_dict(orient="records"),
    query_vectors=[weighted_vector],
    metric="IP",
    limit=200,
)

print(f"Got {len(reweight_search_results)} results, length {len(reweight_search_results[0])} for reweighted query")

Got 1 results, length 200 for reweighted query


In [136]:
# At what rank does the target citation appear?
idx = 1
citation_dois = example_batch.iloc[idx]["citation_dois"]
print(f"Citation DOIs: {citation_dois}")
target_citation = citation_dois[0]
print(f"Target citation: {target_citation}")

these_search_results = search_results[idx]
target_found_at = [i for i, result in enumerate(these_search_results) if result["doi"] == target_citation]
print(f"Target found at ranks: {target_found_at}")
best_rank = target_found_at[0]
pprint(these_search_results[best_rank])

Citation DOIs: ['10.48550/arXiv.astro-ph/9701042']
Target citation: 10.48550/arXiv.astro-ph/9701042
Target found at ranks: [14, 83]
{'citation_count': 28,
 'doi': '10.48550/arXiv.astro-ph/9701042',
 'metric': 0.6175870299339294,
 'pubdate': 19971201,
 'text': 'Portegies Zwart et al.: Star cluster ecology. I 0.04 0.03 n 0.02 '
         '0.01 0 10 Fig. 6. Fraction of stars in the computation of model C '
         'that are blue stragglers (upper solid line) and the fraction of '
         'stars on the main-sequence that were left behind as blue stragglers '
         'when primordial stars of equal mass evolved into giants (lower solid '
         'line), as a function of time. Due to the slow evolution on the main '
         'sequence, the lower line is less susceptible to Poissonian '
         'fluctuations. The dotted lines show the fraction of stars that are '
         'yellow stragglers, for all yellow stragglers (upper dotted line) and '
         'for those that evolved from blue str

In [None]:
queries[0]