In [107]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects; it is used below when embedding rows.
tqdm.pandas()

Set up embedder and database

In [108]:
from embedders import Embedder
from database.milvusdb import MilvusDB

# Settings for the embedder, gathered in one place so they are easy to spot and tweak.
embedder_settings = {
    "model_name": "Qwen/Qwen3-Embedding-0.6B",
    "device": "mps",
    "normalize": True,
    "for_queries": True,
}
embedder = Embedder.create(**embedder_settings)
print(f"Embedder created: {embedder}")

# Open the Milvus database wrapper and list the collections it holds.
db = MilvusDB()
db.list_collections()

Embedder created: Qwen/Qwen3-Embedding-0.6B, device=mps, normalize=True, for_queries=True, dim=1024
Collections:
 - astrobert_chunks: 460801 entities
 - astrobert_contributions: 89860 entities
 - bge_chunks: 460801 entities
 - bge_contributions: 89860 entities
 - nasa_chunks: 460801 entities
 - nasa_contributions: 89860 entities
 - qwen06_chunks: 460801 entities
 - qwen8b_contributions: 89860 entities
 - specter_chunks: 460801 entities
 - specter_contributions: 89860 entities


Get dataset

In [99]:
# Draw a fixed-seed 1000-row sample of the checked examples.
examples = (
    pd.read_json("data/dataset/nontrivial_checked.jsonl", lines=True)
    .sample(n=1000, random_state=43)
)
print(len(examples))

# Embed each citation-stripped sentence once, before the explode below duplicates rows.
examples["vector"] = examples.progress_apply(
    lambda row: embedder([row["sent_no_cit"]])[0],
    axis=1,
)

# One row per (sentence, cited DOI) pair; ignore_index=True resets the index to 0..n-1.
examples = examples.explode("citation_dois", ignore_index=True)
print(f"Number of samples after denormalization: {examples.shape[0]}")
examples = examples.rename(columns={"citation_dois": "target_doi"})
examples.head()

1000
Number of samples after denormalization: 1265


Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,target_doi,pubdate,resolved_bibcodes,sent_cit_masked,vector
0,10.1146/annurev-astro-082812-141031,"Indeed, Valenti et al. (2009) have argued that...","Indeed, have argued that SNe Iax are actually...",595,10.1038/nature08023,20140801,[2009Natur.459..674V],"Indeed, [REF] have argued that SNe Iax are act...","[-0.00657158, -0.029778033, -0.007904966, 0.03..."
1,10.1007/s00159-011-0047-3,Using a dedicated VLT/ISAAC multi-epoch SN sur...,Using a dedicated VLT/ISAAC multi-epoch SN sur...,1029,10.1051/0004-6361/200911982,20111101,"[2009A&A...507...61S, 2009A&A...507...71G]",Using a dedicated VLT/ISAAC multi-epoch SN sur...,"[0.036888514, -0.0060488163, -0.009207417, 0.0..."
2,10.1007/s00159-011-0047-3,Using a dedicated VLT/ISAAC multi-epoch SN sur...,Using a dedicated VLT/ISAAC multi-epoch SN sur...,1029,10.1051/0004-6361/200811254,20111101,"[2009A&A...507...61S, 2009A&A...507...71G]",Using a dedicated VLT/ISAAC multi-epoch SN sur...,"[0.036888514, -0.0060488163, -0.009207417, 0.0..."
3,10.1146/annurev-astro-081811-125615,"Madau et al. (1996 , 1998b ) and Lilly et al. ...",and developed a different method where data ...,904,10.1093/mnras/283.4.1388,20140801,"[1996MNRAS.283.1388M, 1998ApJ...498..106M, 199...",[REF] and [REF] developed a different method w...,"[-0.0115360785, -0.003134946, -0.008171569, 0...."
4,10.1146/annurev-astro-081811-125615,"Madau et al. (1996 , 1998b ) and Lilly et al. ...",and developed a different method where data ...,904,10.1086/305523,20140801,"[1996MNRAS.283.1388M, 1998ApJ...498..106M, 199...",[REF] and [REF] developed a different method w...,"[-0.0115360785, -0.003134946, -0.008171569, 0...."


In [100]:
def most_similar_to_query(example: pd.Series, candidates: pd.DataFrame) -> np.ndarray:
    """
    Return the candidate vector that is most similar to the example's vector.

    Similarity is the dot product between each candidate vector and
    ``example["vector"]``. This equals cosine similarity only when both sides are
    unit-normalized, which this function does not check.

    Args:
        example: Row-like object whose ``"vector"`` entry is the query embedding.
        candidates: Entities retrieved for a single DOI; the ``"vector"`` column holds
            one embedding per entity.

    Returns:
        The candidate vector with the highest dot product against the query vector.
        Ties resolve to the first such candidate (``np.argmax`` behaviour).

    Raises:
        ValueError: If ``candidates`` holds no vectors.
    """
    vectors = candidates["vector"]
    if len(vectors) == 0:
        # np.stack would otherwise fail with an opaque "need at least one array" message.
        raise ValueError("No candidate vectors to compare against: `candidates` is empty.")

    # Stack the per-entity vectors into a (n_candidates, dim) matrix.
    candidate_matrix = np.stack(vectors)
    scores = np.dot(candidate_matrix, example["vector"])
    return candidate_matrix[int(np.argmax(scores))]

def difference_vector(example: pd.Series, doi: str, collection_name: str = "qwen06_chunks") -> np.ndarray:
    """
    Compute the difference between the example's vector and the most similar vector
    among the entities stored for ``doi``.

    Uses the module-level ``db`` handle to fetch the candidates.

    Args:
        example: Row-like object whose ``"vector"`` entry is the query embedding.
        doi: DOI whose stored entities are used as candidates.
        collection_name: Collection to query. Defaults to ``"qwen06_chunks"``, the
            collection this notebook was written against.

    Returns:
        ``example["vector"] - best_candidate_vector`` (query first, target second).

    Raises:
        ValueError: If no entities are found for ``doi`` in the collection.
    """
    candidates = db.select_by_doi(doi, collection_name=collection_name)
    if len(candidates) == 0:
        # Name the offending DOI here rather than failing later inside the similarity search.
        raise ValueError(f"No entities found for DOI {doi!r} in collection {collection_name!r}")
    closest = most_similar_to_query(example, candidates)
    # NOTE: stay consistent everywhere: query vector first, target vector second in the diff
    return example["vector"] - closest

In [101]:

# Difference vectors from each query to the closest stored vector of its true target DOI
target_diff_vectors = np.zeros((len(examples), embedder.dim))
# enumerate() supplies the array position. iterrows() yields index *labels*, which match
# positions only while `examples` keeps a default 0..n-1 index.
for pos, (_, row) in enumerate(tqdm(examples.iterrows(), total=examples.shape[0])):
    target_diff_vectors[pos] = difference_vector(row, doi=row['target_doi'])

100%|██████████| 1265/1265 [00:23<00:00, 53.58it/s]


In [102]:
def compute_vector_stats(vectors: np.ndarray) -> dict:
    """
    Summarize the location and spread of a set of vectors.

    Args:
        vectors (np.ndarray): An array of shape (n_samples, n_features), one vector per row.

    Returns:
        dict: A dictionary with the keys
            - "mean_vector": per-feature mean, shape (n_features,).
            - "average_norm": Euclidean norm of ``mean_vector``. This is the norm of
              the mean, not the mean of the per-row norms; the key name is kept for
              compatibility.
            - "std": per-feature population standard deviation (ddof=0).
            - "trace": trace of the sample covariance matrix (ddof=1), i.e. the
              per-feature sample variances summed over all features.
    """
    mean_vector = np.mean(vectors, axis=0)
    # trace(cov) is the sum of the per-feature sample variances, so the full
    # (n_features x n_features) covariance matrix is never needed. float64 accumulation
    # matches what np.cov uses internally.
    trace = np.sum(np.var(vectors, axis=0, ddof=1, dtype=np.float64))
    return {
        "mean_vector": mean_vector,
        "average_norm": np.linalg.norm(mean_vector),
        "std": np.std(vectors, axis=0),
        "trace": trace,
    }

In [103]:
target_stats = compute_vector_stats(target_diff_vectors)
# Report every summary statistic of the query-to-target difference vectors
for stat_name in target_stats:
    print(f"Target {stat_name}: {target_stats[stat_name]}")


Target mean_vector: [-0.00266595  0.00085247  0.00141372 ...  0.00421002 -0.00266596
 -0.00092041]
Target average_norm: 0.1226045455638557
Target std: [0.03290039 0.02799492 0.00250348 ... 0.02597544 0.02834654 0.02658496]
Target trace: 0.7099010683512821


## Get the sample difference & variation for random difference vectors

In [104]:
import random

# Fixed seed (same value as the dataset sample) so the random baseline is reproducible.
RANDOM_BASELINE_SEED = 43
baseline_rng = random.Random(RANDOM_BASELINE_SEED)

random_diff_vectors = np.zeros((len(examples), embedder.dim))

# Sorted so the seeded draw does not depend on set iteration order, which can change
# between interpreter runs. key=str keeps the sort well-defined even if a non-string
# value (e.g. NaN) is present.
all_target_dois = sorted(set(examples['target_doi']), key=str)

# All target DOIs that share a source DOI in the sample, collected once up front
# instead of re-filtering the whole frame for every row.
targets_by_source = {}
for source_doi, target_doi in zip(examples['source_doi'], examples['target_doi']):
    targets_by_source.setdefault(source_doi, set()).add(target_doi)

# enumerate() supplies the array position; iterrows() yields index labels, not positions.
for pos, (_, example) in enumerate(tqdm(examples.iterrows(), total=examples.shape[0])):
    # Never draw a DOI that is a true target of this example's source document
    excluded_dois = targets_by_source[example['source_doi']]
    choice_pool = [doi for doi in all_target_dois if doi not in excluded_dois]
    random_doi = baseline_rng.choice(choice_pool)

    random_diff_vectors[pos] = difference_vector(example, doi=random_doi)

100%|██████████| 1265/1265 [00:10<00:00, 124.46it/s]


In [105]:
random_stats = compute_vector_stats(random_diff_vectors)
# Report every summary statistic of the random-baseline difference vectors
for stat_name in random_stats:
    print(f"Random {stat_name}: {random_stats[stat_name]}")

Random mean_vector: [-0.00324973  0.00231311  0.00253555 ...  0.00419267 -0.00098203
 -0.00374234]
Random average_norm: 0.1496816436811038
Random std: [0.04481936 0.03786146 0.00318765 ... 0.03365424 0.03345412 0.03632914]
Random trace: 1.180025227938104


In [106]:
# Write the mean query-to-target difference vector to disk
mean_target_diff = target_stats['mean_vector']
np.save("qwen06_difference_vector.npy", mean_target_diff)

The trace of the covariance matrix of the random difference vectors is 1.18, which is about 66% higher than the 0.71 obtained for the difference vectors between the query and the target. In other words, the query-to-target difference vectors vary much less than differences to randomly chosen documents, which suggests that the query-to-target offset is perhaps a stable property.

### Get the difference vector from query+add3 to target

In [114]:
from query_expander import get_expander

# Build the "add_prev_3" query expander on top of the preprocessed review data
add3 = get_expander(
    "add_prev_3",
    path_to_data="data/preprocessed/reviews.jsonl",
)
print(add3)

QueryExpander(name=add_prev_3, data_length=2980)


In [116]:
# Expand every example, then print the first expansion as a quick sanity check
expansions = add3(examples)
first_expansion = expansions[0]
print(first_expansion)

Jordan et al. (2012) , Kromer et al. (2013) , and Fink et al. (2014) model SNe Iax as “failed deflagrations”—SN Ia explosions in which a transition from deflagration to detonation fails to occur, and furthermore the explosion fails to completely unbind the WD, leaving behind an ∼1M ⊙ bound remnant. This could explain the low 56 Ni yields, the low velocities, the unburnt C and He in the ejecta, the high degree of mixing, and the clumps of high-density material. However, SNe Iax occur predominantly in star-forming galaxies (but there is one case, SN 2008ge, occurring in an S0 galaxy, with no signs of star formation or pre-explosion massive stars at the explosion site; Foley et al. 2010b ), and their locations within these galaxies track the SFR similarly to the common Type IIP CC SNe ( Lyman et al. 2013 ). Indeed,  have argued that SNe Iax are actually CC SNe with low ejecta velocities derived from 7−9M ⊙ or 25−30M ⊙ progenitors, with cores collapsing into black holes.


In [119]:
# Same rows as `examples`, but with the expanded text in sent_no_cit and a fresh
# embedding of that expanded text in the vector column.
expansion_df = examples.copy()
expansion_df['sent_no_cit'] = expansions


def _embed_query_text(row):
    """Embed the row's sent_no_cit text and return the single resulting vector."""
    return embedder([row["sent_no_cit"]])[0]


expansion_df["vector"] = expansion_df.progress_apply(_embed_query_text, axis=1)

expansion_df.head()

100%|██████████| 1265/1265 [01:20<00:00, 15.68it/s]
100%|██████████| 1265/1265 [02:59<00:00,  7.03it/s]


Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,target_doi,pubdate,resolved_bibcodes,sent_cit_masked,vector
0,10.1146/annurev-astro-082812-141031,"Indeed, Valenti et al. (2009) have argued that...","Jordan et al. (2012) , Kromer et al. (2013) , ...",595,10.1038/nature08023,20140801,[2009Natur.459..674V],"Indeed, [REF] have argued that SNe Iax are act...","[0.020763418, 0.014040946, -0.009526509, 0.012..."
1,10.1007/s00159-011-0047-3,Using a dedicated VLT/ISAAC multi-epoch SN sur...,"At even higher cluster redshift z ∼ 1, the Sup...",1029,10.1051/0004-6361/200911982,20111101,"[2009A&A...507...61S, 2009A&A...507...71G]",Using a dedicated VLT/ISAAC multi-epoch SN sur...,"[-0.005719765, -0.005792628, -0.012335606, 0.0..."
2,10.1007/s00159-011-0047-3,Using a dedicated VLT/ISAAC multi-epoch SN sur...,"At even higher cluster redshift z ∼ 1, the Sup...",1029,10.1051/0004-6361/200811254,20111101,"[2009A&A...507...61S, 2009A&A...507...71G]",Using a dedicated VLT/ISAAC multi-epoch SN sur...,"[-0.005719765, -0.005792628, -0.012335606, 0.0..."
3,10.1146/annurev-astro-081811-125615,"Madau et al. (1996 , 1998b ) and Lilly et al. ...",Redshifts z >4 have been confirmed from CO mea...,904,10.1093/mnras/283.4.1388,20140801,"[1996MNRAS.283.1388M, 1998ApJ...498..106M, 199...",[REF] and [REF] developed a different method w...,"[-0.02094898, -0.0007385412, -0.012135256, 0.0..."
4,10.1146/annurev-astro-081811-125615,"Madau et al. (1996 , 1998b ) and Lilly et al. ...",Redshifts z >4 have been confirmed from CO mea...,904,10.1086/305523,20140801,"[1996MNRAS.283.1388M, 1998ApJ...498..106M, 199...",[REF] and [REF] developed a different method w...,"[-0.02094898, -0.0007385412, -0.012135256, 0.0..."


In [120]:
# Difference vectors from each *expanded* query to the closest stored vector of its target DOI
expansion_diff_vectors = np.zeros((len(expansion_df), embedder.dim))
# enumerate() supplies the array position. iterrows() yields index *labels*, which match
# positions only while `expansion_df` keeps a default 0..n-1 index.
for pos, (_, row) in enumerate(tqdm(expansion_df.iterrows(), total=expansion_df.shape[0])):
    expansion_diff_vectors[pos] = difference_vector(row, doi=row["target_doi"])

100%|██████████| 1265/1265 [00:09<00:00, 132.89it/s]


In [121]:
expansion_stats = compute_vector_stats(expansion_diff_vectors)
# Report every summary statistic of the expanded-query-to-target difference vectors
for stat_name in expansion_stats:
    print(f"Expansion {stat_name}: {expansion_stats[stat_name]}")

Expansion mean_vector: [-0.00333019  0.00222054  0.00044202 ... -0.00071515 -0.00295082
 -0.00144575]
Expansion average_norm: 0.09883509631172925
Expansion std: [0.03148758 0.02819422 0.0024491  ... 0.02678819 0.02768929 0.02722017]
Expansion trace: 0.6747653393168445


In [122]:
# Write the mean expanded-query-to-target difference vector to disk
mean_expansion_diff = expansion_stats['mean_vector']
np.save('expansion_diff_vector.npy', mean_expansion_diff)