In [1]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
from citeline.database.milvusdb import MilvusDB
from citeline.embedders import Embedder
from citeline.query_expander import get_expander

db = MilvusDB()
print(db)

tqdm.pandas()

# Setup: load embedder, expander, dataset, db collection
embedder = Embedder.create("Qwen/Qwen3-Embedding-0.6B", device="mps", normalize=True)
print(embedder)

expander = get_expander("add_prev_3", path_to_data="../data/preprocessed/reviews.jsonl")
print(expander)

sample = pd.read_json("../data/dataset/nontrivial_nomath.jsonl", lines=True)
sample = sample.sample(20, random_state=42).reset_index(drop=True)

# Apply query expansion and embed the queries
sample["sent_no_cit"] = expander(sample)
sample["vector"] = sample.progress_apply(lambda row: embedder([row["sent_no_cit"]])[0], axis=1)

db.list_collections()
db.client.load_collection("qwen06_contributions")

sample.head()

<citeline.database.milvusdb.MilvusDB object at 0x16684a0d0>
Qwen/Qwen3-Embedding-0.6B, device=mps, normalize=True, dim=1024
QueryExpander(name=add_prev_3, data_length=2980)


100%|██████████| 20/20 [00:04<00:00,  4.47it/s]

Collections:
 - astrobert_chunks: 460801 entities
 - astrobert_contributions: 89860 entities
 - bge_chunks: 460801 entities
 - bge_contributions: 89860 entities
 - nasa_chunks: 460801 entities
 - nasa_contributions: 89860 entities
 - qwen06_chunks: 460801 entities
 - qwen06_contributions: 89860 entities
 - qwen06_findings_v2: 4342 entities
 - qwen8b_contributions: 89860 entities
 - specter_chunks: 460801 entities
 - specter_contributions: 89860 entities





Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked,vector
0,10.1146/annurev-astro-081817-051832,LINERS in the SDSS observed through ∼1 kpc ape...,LINER emission can be produced by different em...,715,[10.1111/j.1365-2966.2006.10859.x],20190801,[2006MNRAS.372..961K],LINERS in the SDSS observed through ∼1 kpc ape...,"[0.015041431, -0.037659578, -0.0059942235, 0.0..."
1,10.1007/s00159-014-0081-z,The first studies to correct for these limitat...,In this review we will refer to Eddington rati...,519,"[10.1088/0004-637X/746/1/90, 10.1088/0004-637X...",20150101,"[2012ApJ...746...90A, 2013ApJ...775...41A]",The first studies to correct for these limitat...,"[-0.020360524, 0.024791028, -0.007710966, 0.03..."
2,10.1146/annurev.astro.36.1.599,Prior to the COBE measurement of temperature a...,"It is not reviewed extensively here, as accoun...",502,[10.1086/186504],19980101,[1992ApJ...396L...1S],Prior to the COBE measurement of temperature a...,"[-0.013863388, 0.0044795154, -0.005800571, 0.0..."
3,10.1007/s00159-010-0032-2,This data provided the most convincing evidenc...,These are produced during the epoch of inflati...,398,[10.1038/nature01269],20101001,[2002Natur.420..772K],This data provided the most convincing evidenc...,"[-0.015238127, 0.011404553, -0.012461156, 0.04..."
4,10.1146/annurev-astro-021022-043545,"For example, strong outliers from the MZR with...",There are important practical implications for...,494,[10.1088/0004-637X/695/1/259],20220801,[2009ApJ...695..259P],"For example, strong outliers from the MZR with...","[-0.0304536, -0.03546455, -0.008876894, -0.028..."


In [2]:
def get_hard_records(example: pd.Series, n: int = 2) -> list[str]:
    """
    Overfetches 3*n most similar records (bc if two reps from same doc are in top n, we won't have n distinct non-target dois)

    Returns:
      A list of doi's, ordered by their max similarity to the query
    """
    results = db.search(
        collection_name="qwen06_contributions",
        query_records=[example.to_dict()],
        query_vectors=[example.vector],
        limit=3 * n,
    )
    results = results[0]  # db.search operates on lists of queries; we only need the first result

    # Filter results to non-targets only
    target_dois = set(example.citation_dois)
    non_target_results = [r for r in results if r["doi"] not in target_dois]
    return non_target_results[:n]


def get_similarity_to_targets(example: pd.Series) -> list[float]:
    """
    For each target doi in the example, computes the max similarity between the example and any record with that doi.

    Returns a list of scores in order of example.citation_dois
    """
    similarities = []
    for target_doi in example.citation_dois:
        results = db.select_by_doi(doi=target_doi, collection_name="qwen06_contributions")
        target_vectors = np.array(results["vector"].tolist())
        similarity_scores = np.dot(example.vector, target_vectors.T)
        similarities.append(np.max(similarity_scores))
    return similarities


def compute_margins(df: pd.DataFrame, target_col: str, hard_col: str, margin_col_name: str) -> None:
    """
    For each row in the DataFrame, computes the margin between each target similarity and the hardest non-target similarity.

    Args:
      df: DataFrame containing the data
      target_col: Name of the column with list of target similarities
      hard_col: Name of the column with list of hard non-target similarities
      margin_col_name: Name of the column to store the computed margins

    Returns:
      None (modifies df in place)
    """
    df[margin_col_name] = None
    for idx, row in df.iterrows():
        target_similarities = row[target_col]
        hardest_nontarget_similarity = max(row[hard_col])
        margins = [target_sim - hardest_nontarget_similarity for target_sim in target_similarities]
        df.at[idx, margin_col_name] = margins


# Compute target and hard similarities, then the margins
sample["target_similarities"] = sample.progress_apply(get_similarity_to_targets, axis=1)
sample["hard_dois"] = None
sample["hard_similarities"] = None
for idx, example in tqdm(sample.iterrows(), total=len(sample)):
    hard_records = get_hard_records(example, n=2)
    sample.at[idx, "hard_dois"] = [r["doi"] for r in hard_records]
    sample.at[idx, "hard_similarities"] = [r["metric"] for r in hard_records]

compute_margins(sample, target_col="target_similarities", hard_col="hard_similarities", margin_col_name="old_margins")
sample.head()

100%|██████████| 20/20 [00:00<00:00, 125.86it/s]
100%|██████████| 20/20 [00:01<00:00, 14.85it/s]


Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked,vector,target_similarities,hard_dois,hard_similarities,old_margins
0,10.1146/annurev-astro-081817-051832,LINERS in the SDSS observed through ∼1 kpc ape...,LINER emission can be produced by different em...,715,[10.1111/j.1365-2966.2006.10859.x],20190801,[2006MNRAS.372..961K],LINERS in the SDSS observed through ∼1 kpc ape...,"[0.015041431, -0.037659578, -0.0059942235, 0.0...",[0.6655957113697721],"[10.1051/0004-6361:20042277, 10.1051/0004-6361...","[0.6782451868057251, 0.6752788424491882]",[-0.012649475435952962]
1,10.1007/s00159-014-0081-z,The first studies to correct for these limitat...,In this review we will refer to Eddington rati...,519,"[10.1088/0004-637X/746/1/90, 10.1088/0004-637X...",20150101,"[2012ApJ...746...90A, 2013ApJ...775...41A]",The first studies to correct for these limitat...,"[-0.020360524, 0.024791028, -0.007710966, 0.03...","[0.7254008411120073, 0.5679474500921671]","[10.1088/0004-637X/739/2/56, 10.1086/505646]","[0.6806527972221375, 0.6702474355697632]","[0.04474804388986986, -0.11270534712997038]"
2,10.1146/annurev.astro.36.1.599,Prior to the COBE measurement of temperature a...,"It is not reviewed extensively here, as accoun...",502,[10.1086/186504],19980101,[1992ApJ...396L...1S],Prior to the COBE measurement of temperature a...,"[-0.013863388, 0.0044795154, -0.005800571, 0.0...",[0.4589954083258022],"[10.1086/304915, 10.1086/304915]","[0.7120750546455383, 0.6774315237998962]",[-0.2530796463197361]
3,10.1007/s00159-010-0032-2,This data provided the most convincing evidenc...,These are produced during the epoch of inflati...,398,[10.1038/nature01269],20101001,[2002Natur.420..772K],This data provided the most convincing evidenc...,"[-0.015238127, 0.011404553, -0.012461156, 0.04...",[0.6101514438106455],"[10.12942/lrr-2007-4, 10.1086/316325]","[0.6428074836730957, 0.6285284757614136]",[-0.03265603986245025]
4,10.1146/annurev-astro-021022-043545,"For example, strong outliers from the MZR with...",There are important practical implications for...,494,[10.1088/0004-637X/695/1/259],20220801,[2009ApJ...695..259P],"For example, strong outliers from the MZR with...","[-0.0304536, -0.03546455, -0.008876894, -0.028...",[0.6714103423488335],"[10.1093/mnras/stz243, 10.1093/mnras/stz243]","[0.7405368685722351, 0.7342004776000977]",[-0.06912652622340165]


In [3]:
margins = pd.to_numeric(sample.explode(column="old_margins")["old_margins"], errors="coerce").dropna()
margins.describe()

count    24.000000
mean     -0.060490
std       0.090116
min      -0.253080
25%      -0.122695
50%      -0.050818
75%      -0.003001
max       0.116795
Name: old_margins, dtype: float64

## Process the dois


In [4]:
dois_to_process = set(doi for dois in sample.citation_dois for doi in dois).union(
    doi for dois in sample.hard_dois for doi in dois
)
print(f"DOI's to process: {len(dois_to_process)}")

# Load research papers so we can get full text by doi
research = pd.read_json("../data/research_used.jsonl", lines=True)
research = research[research["doi"].isin(dois_to_process)].reset_index(drop=True)
print(f"Loaded {len(research)} research papers")


def doi_to_paper(doi: str) -> str:
    record = research[research["doi"] == doi].iloc[0]
    return record["title"] + "\n\n" + record["abstract"] + "\n\n" + record["body"]

# Test:
doi = list(dois_to_process)[0]
print(doi_to_paper(doi)[:500])

DOI's to process: 56
Loaded 56 research papers
The clustering of galaxies about extragalactic radio sources.

Clustering of galaxies about extragalactic radio sources is investigated by cross correlating the radio positions of 3CR and 4C sources with the positions of Zwicky galaxies and galaxies included in Lick counts. The cross-correlation functions are used to determine a linear clustering scale that ranges from galaxies selected at random in the universe to rich clusters of galaxies. It is found that: (1) the cross-correlation function f


In [None]:
from openai import OpenAI
import os


def bind_client(func):
    """
    Decorator to bind OpenAI client to a function that will provide DeepSeek API access
    """
    client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

    def wrapper(*args, **kwargs):
        return func(client, *args, **kwargs)

    return wrapper


@bind_client
def deepseek(client, prompt: str) -> str:
    """
    Sends a prompt to the DeepSeek API (using DeepSeek-V3.1 non-thinking model)

    Expects a prompt that will instruct the model to respond with a JSON object.
    However, the function returns the raw string response, to allow for validation and
    error handling in multiple passes without losing the original response
    """
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "system", "content": prompt}],
        stream=False,
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content


response = deepseek("Respond with a JSON object with keys 'greeting' and 'farewell'")
print(response)
print(json.loads(response))

{
  "greeting": "Hello!",
  "farewell": "Goodbye!"
}
{'greeting': 'Hello!', 'farewell': 'Goodbye!'}


In [69]:
with open("../src/citeline/llm/prompts/original_contributions_v3.txt", "r") as f:
    prompt_template = f.read()

with open("new_findings.jsonl", "w") as f:
    for doi in tqdm(dois_to_process):
        paper = doi_to_paper(doi)
        prompt = prompt_template.format(paper=paper)
        try:
            response = deepseek(prompt)
        except Exception as e:
            print(f"Error processing doi {doi}: {e}")
            continue
        try:
            data = json.loads(response)
            data["doi"] = doi
            f.write(json.dumps(data) + "\n")
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for doi {doi}. Response was:\n{response}")
            with open("failed_dois.txt", "a") as f_fail:
                f_fail.write(doi + "\n")
            continue

 89%|████████▉ | 50/56 [28:06<02:11, 21.96s/it]

Error processing doi 10.48550/arXiv.astro-ph/0209244: Error code: 402 - {'error': {'message': 'Insufficient Balance', 'type': 'unknown_error', 'param': None, 'code': 'invalid_request_error'}}


 91%|█████████ | 51/56 [28:07<01:18, 15.63s/it]

Error processing doi 10.1111/j.1365-2966.2011.18569.x: Error code: 402 - {'error': {'message': 'Insufficient Balance', 'type': 'unknown_error', 'param': None, 'code': 'invalid_request_error'}}


 93%|█████████▎| 52/56 [28:07<00:44, 11.15s/it]

Error processing doi 10.1086/303508: Error code: 402 - {'error': {'message': 'Insufficient Balance', 'type': 'unknown_error', 'param': None, 'code': 'invalid_request_error'}}


 95%|█████████▍| 53/56 [28:08<00:24,  8.09s/it]

Error processing doi 10.1086/118665: Error code: 402 - {'error': {'message': 'Insufficient Balance', 'type': 'unknown_error', 'param': None, 'code': 'invalid_request_error'}}


 96%|█████████▋| 54/56 [28:09<00:11,  5.88s/it]

Error processing doi 10.1086/182301: Error code: 402 - {'error': {'message': 'Insufficient Balance', 'type': 'unknown_error', 'param': None, 'code': 'invalid_request_error'}}


 98%|█████████▊| 55/56 [28:10<00:04,  4.33s/it]

Error processing doi 10.1086/304915: Error code: 402 - {'error': {'message': 'Insufficient Balance', 'type': 'unknown_error', 'param': None, 'code': 'invalid_request_error'}}


100%|██████████| 56/56 [28:10<00:00, 30.20s/it]

Error processing doi 10.1088/0004-637X/695/1/259: Error code: 402 - {'error': {'message': 'Insufficient Balance', 'type': 'unknown_error', 'param': None, 'code': 'invalid_request_error'}}





In [70]:
new_findings = pd.read_json("new_findings.jsonl", lines=True)
print(f"Loaded {len(new_findings)} new findings")

new_findings_exploded = new_findings.explode("findings")
new_findings_exploded["vector"] = embedder(new_findings_exploded["findings"].tolist()).tolist()
new_findings_exploded.head()

Loaded 49 new findings


Unnamed: 0,findings,doi,vector
0,Cross-correlation functions were used to quant...,10.1093/mnras/189.3.433,"[-0.025416899472475052, -0.003842185717076063,..."
0,The angular cross-correlation function for gal...,10.1093/mnras/189.3.433,"[-0.029230862855911255, -0.03565283492207527, ..."
0,Very weak radio galaxies (10-100 times brighte...,10.1093/mnras/189.3.433,"[-0.00383683480322361, -0.04373485594987869, -..."
0,Extended powerful radio sources are typically ...,10.1093/mnras/189.3.433,"[0.005375322885811329, -0.008320756256580353, ..."
0,Classical double radio sources with strong emi...,10.1093/mnras/189.3.433,"[0.00968489795923233, -0.03276760131120682, -0..."


In [71]:
# Get new similarity to target
sample['new_target_similarities'] = None
sample['new_hard_similarities'] = None

def get_vectors_by_doi(doi: str) -> np.ndarray:
    return np.array(new_findings_exploded[new_findings_exploded["doi"] == doi]["vector"].tolist())


for idx, row in sample.iterrows():
    # For each target doi, compute the max similarity wrt the new embeddings
    query_vector = row['vector']
    new_similarities = []
    for target_doi in row['citation_dois']:
        target_vectors = get_vectors_by_doi(target_doi)
        new_similarities.append(np.max(np.dot(query_vector, target_vectors.T)))
    sample.at[idx, 'new_target_similarities'] = new_similarities

    # Collect all the hard vectors, compute the hard similarities
    new_hard_similarities = []
    for doi in row['hard_dois']:
        candidate_vectors = get_vectors_by_doi(doi)
        new_hard_similarities.append(np.max(np.dot(query_vector, candidate_vectors.T)))
    sample.at[idx, 'new_hard_similarities'] = new_hard_similarities

sample.head()

ValueError: shapes (1024,) and (0,) not aligned: 1024 (dim 0) != 0 (dim 0)

In [None]:
compute_margins(sample, target_col="new_target_similarities", hard_col="new_hard_similarities", margin_col_name="new_margins")
sample.head()

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked,vector,target_similarities,hard_dois,hard_similarities,old_margins,new_target_similarities,new_hard_similarities,new_margins
0,10.1146/annurev-astro-081817-051832,LINERS in the SDSS observed through ∼1 kpc ape...,LINER emission can be produced by different em...,715,[10.1111/j.1365-2966.2006.10859.x],20190801,[2006MNRAS.372..961K],LINERS in the SDSS observed through ∼1 kpc ape...,"[0.015041431, -0.037659578, -0.0059942235, 0.0...",[0.6655957113697721],"[10.1051/0004-6361:20042277, 10.1051/0004-6361...","[0.6782451868057251, 0.6752788424491882]",[-0.012649475435952962],[0.6850338315814171],"[0.6877027656716859, 0.6877027656716859]",[-0.002668934090268804]
1,10.1007/s00159-014-0081-z,The first studies to correct for these limitat...,In this review we will refer to Eddington rati...,519,"[10.1088/0004-637X/746/1/90, 10.1088/0004-637X...",20150101,"[2012ApJ...746...90A, 2013ApJ...775...41A]",The first studies to correct for these limitat...,"[-0.020360524, 0.024791028, -0.007710966, 0.03...","[0.7254008411120073, 0.5679474500921671]","[10.1088/0004-637X/739/2/56, 10.1086/505646]","[0.6806527972221375, 0.6702474355697632]","[0.04474804388986986, -0.11270534712997038]","[0.7196323037497792, 0.5789709734441804]","[0.5892201072718748, 0.7183288407544446]","[0.00130346299533457, -0.13935786731026423]"
2,10.1146/annurev.astro.36.1.599,Prior to the COBE measurement of temperature a...,"It is not reviewed extensively here, as accoun...",502,[10.1086/186504],19980101,[1992ApJ...396L...1S],Prior to the COBE measurement of temperature a...,"[-0.013863388, 0.0044795154, -0.005800571, 0.0...",[0.4589954083258022],"[10.1086/304915, 10.1086/304915]","[0.7120750546455383, 0.6774315237998962]",[-0.2530796463197361],[0.4391878881112823],"[0.5856518938040782, 0.5856518938040782]",[-0.14646400569279594]
3,10.1007/s00159-010-0032-2,This data provided the most convincing evidenc...,These are produced during the epoch of inflati...,398,[10.1038/nature01269],20101001,[2002Natur.420..772K],This data provided the most convincing evidenc...,"[-0.015238127, 0.011404553, -0.012461156, 0.04...",[0.6101514438106455],"[10.12942/lrr-2007-4, 10.1086/316325]","[0.6428074836730957, 0.6285284757614136]",[-0.03265603986245025],[0.5999219221811973],"[0.5019649340293122, 0.6676959086157561]",[-0.06777398643455879]
4,10.1146/annurev-astro-021022-043545,"For example, strong outliers from the MZR with...",There are important practical implications for...,494,[10.1088/0004-637X/695/1/259],20220801,[2009ApJ...695..259P],"For example, strong outliers from the MZR with...","[-0.0304536, -0.03546455, -0.008876894, -0.028...",[0.6714103423488335],"[10.1093/mnras/stz243, 10.1093/mnras/stz243]","[0.7405368685722351, 0.7342004776000977]",[-0.06912652622340165],[0.6616163290749043],"[0.7425494546228046, 0.7425494546228046]",[-0.08093312554790033]


In [None]:
def compute_margin_diffs(df: pd.DataFrame, new_col: str, ref_col: str) -> pd.Series:
    new_values = df[new_col].explode().tolist()
    ref_values = df[ref_col].explode().tolist()
    diffs = [new - ref for new, ref in zip(new_values, ref_values)]
    return pd.Series(diffs)

diffs = compute_margin_diffs(sample, new_col="new_margins", ref_col="old_margins")
print(diffs.describe())



count    24.000000
mean      0.021624
std       0.063498
min      -0.066341
25%      -0.013046
50%       0.006099
75%       0.040443
max       0.192169
dtype: float64


## Error analysis

Let's look at where the new margin is still negative (the target document vectors aren't as close to the query as the hard examples)

In [11]:
error_rows = sample[sample['new_margins'].apply(lambda margins: any(margin < 0 for margin in margins))]
print(f"{len(error_rows)} rows have at least one negative new margin")

error_margins = pd.to_numeric(error_rows.explode(column="new_margins")["new_margins"], errors="coerce").dropna()
print(error_margins.describe())

17 rows have at least one negative new margin
count    18.000000
mean     -0.083426
std       0.060875
min      -0.186137
25%      -0.136851
50%      -0.071826
75%      -0.034987
max       0.001303
Name: new_margins, dtype: float64


In [12]:
error_rows

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes,sent_cit_masked,vector,target_similarities,hard_dois,hard_similarities,old_margins,new_target_similarities,new_hard_similarities,new_margins
0,10.1146/annurev-astro-081817-051832,LINERS in the SDSS observed through ∼1 kpc ape...,LINER emission can be produced by different em...,715,[10.1111/j.1365-2966.2006.10859.x],20190801,[2006MNRAS.372..961K],LINERS in the SDSS observed through ∼1 kpc ape...,"[0.015041431, -0.037659578, -0.0059942235, 0.0...",[0.6655957113697721],"[10.1051/0004-6361:20042277, 10.1051/0004-6361...","[0.6782451868057251, 0.6752788424491882]",[-0.012649475435952962],[0.6850338315814171],"[0.6877027656716859, 0.6877027656716859]",[-0.002668934090268804]
1,10.1007/s00159-014-0081-z,The first studies to correct for these limitat...,In this review we will refer to Eddington rati...,519,"[10.1088/0004-637X/746/1/90, 10.1088/0004-637X...",20150101,"[2012ApJ...746...90A, 2013ApJ...775...41A]",The first studies to correct for these limitat...,"[-0.020360524, 0.024791028, -0.007710966, 0.03...","[0.7254008411120073, 0.5679474500921671]","[10.1088/0004-637X/739/2/56, 10.1086/505646]","[0.6806527972221375, 0.6702474355697632]","[0.04474804388986986, -0.11270534712997038]","[0.7196323037497792, 0.5789709734441804]","[0.5892201072718748, 0.7183288407544446]","[0.00130346299533457, -0.13935786731026423]"
2,10.1146/annurev.astro.36.1.599,Prior to the COBE measurement of temperature a...,"It is not reviewed extensively here, as accoun...",502,[10.1086/186504],19980101,[1992ApJ...396L...1S],Prior to the COBE measurement of temperature a...,"[-0.013863388, 0.0044795154, -0.005800571, 0.0...",[0.4589954083258022],"[10.1086/304915, 10.1086/304915]","[0.7120750546455383, 0.6774315237998962]",[-0.2530796463197361],[0.4391878881112823],"[0.5856518938040782, 0.5856518938040782]",[-0.14646400569279594]
3,10.1007/s00159-010-0032-2,This data provided the most convincing evidenc...,These are produced during the epoch of inflati...,398,[10.1038/nature01269],20101001,[2002Natur.420..772K],This data provided the most convincing evidenc...,"[-0.015238127, 0.011404553, -0.012461156, 0.04...",[0.6101514438106455],"[10.12942/lrr-2007-4, 10.1086/316325]","[0.6428074836730957, 0.6285284757614136]",[-0.03265603986245025],[0.5999219221811973],"[0.5019649340293122, 0.6676959086157561]",[-0.06777398643455879]
4,10.1146/annurev-astro-021022-043545,"For example, strong outliers from the MZR with...",There are important practical implications for...,494,[10.1088/0004-637X/695/1/259],20220801,[2009ApJ...695..259P],"For example, strong outliers from the MZR with...","[-0.0304536, -0.03546455, -0.008876894, -0.028...",[0.6714103423488335],"[10.1093/mnras/stz243, 10.1093/mnras/stz243]","[0.7405368685722351, 0.7342004776000977]",[-0.06912652622340165],[0.6616163290749043],"[0.7425494546228046, 0.7425494546228046]",[-0.08093312554790033]
5,10.1146/annurev-astro-081817-051839,In the model envisioned by D'Ercole et al. (20...,This is due to the stellar initial mass functi...,490,[10.1111/j.1365-2966.2008.13915.x],20180901,[2008MNRAS.391..825D],"In the model envisioned by [REF] , the gas com...","[-0.044657625, -0.028853767, -0.006932063, 0.0...",[0.6369456532219859],"[10.1111/j.1365-2966.2007.11606.x, 10.1088/204...","[0.6732667684555054, 0.6654112935066223]",[-0.03632111523351944],[0.5838595407753466],"[0.6083851062728141, 0.5965818764790356]",[-0.024525565497467428]
6,10.1146/annurev-astro-120419-014455,Surveys with JWST or WFIRST will have the abil...,The proposed interferometry mission DECi-hertz...,993,[10.1088/0004-637X/734/2/102],20200801,[2011ApJ...734..102K],Surveys with JWST or WFIRST will have the abil...,"[0.028232649, -0.04315507, -0.010680103, -0.03...",[0.5027499113173651],"[10.1088/1742-6596/840/1/012010, 10.1093/mnras...","[0.6875762939453125, 0.6856400370597839]",[-0.18482638262794737],[0.5126016121120317],"[0.6723167091443054, 0.6496783645014728]",[-0.1597150970322737]
7,10.1146/annurev-astro-081817-051853,Dawson Chiang (2014) found a dynamical configu...,One way to enhance the number of warm Jupiters...,654,[10.1126/science.1256943],20180901,[2014Sci...346..212D],[REF] found a dynamical configuration that can...,"[0.0327862, -0.055534672, -0.009620562, 0.0598...",[0.7039254529010839],"[10.3847/0004-637X/829/2/132, 10.3847/0004-637...","[0.7425232529640198, 0.7328674793243408]",[-0.038597800062935894],[0.7172072006455823],"[0.7465189564489526, 0.7465189564489526]",[-0.02931175580337031]
8,10.1146/annurev-astro-052622-031748,Chon et al. (2022) found an opposite trend: in...,This agreement is despite large differences in...,927,[10.1093/mnras/stac1549],20240901,[2022MNRAS.514.4639C],[REF] found an opposite trend: in their simula...,"[-1.1811993e-05, -0.04160871, -0.00715549, -0....",[0.5759996606220756],"[10.1086/308776, 10.1086/164018]","[0.7245280742645264, 0.7085235714912415]",[-0.14852841364245073],[0.6606710623489045],"[0.7145753567058142, 0.6388411211760789]",[-0.05390429435690969]
9,10.1007/s00159-008-0010-0,"(Shang et al. 1998), do show unusual, faint fe...",Such events 123 196 R. Sancisi et al. are more...,159,[10.1086/311563],20080601,[1998ApJ...504L..23S],"([REF]), do show unusual, faint features in th...","[0.019467777, -0.050828665, -0.009596894, 0.01...",[0.5403238222714388],"[10.1086/147890, 10.48550/arXiv.astro-ph/9902227]","[0.6086273193359375, 0.6064630746841431]",[-0.06830349706449867],[0.4859017889127616],"[0.5709708032259562, 0.5361326245540075]",[-0.08506901431319458]


In [68]:
def analyze_error_row(idx: int) -> None:

    example = error_rows.iloc[idx]
    margins = [round(float(margin), 4) for margin in example['new_margins']]
    print(f"Margins: {margins}")
    print("Original sentence:")
    print(example['sent_original'])
    print("\nExpanded sentence:")
    print(example['sent_no_cit'] + "\n")

    hardest_idx = np.argmax(example['new_hard_similarities'])
    hard_doi = example['hard_dois'][hardest_idx]
    hard_findings = new_findings_exploded[new_findings_exploded['doi'] == hard_doi]
    hard_vectors = np.array(hard_findings['vector'].tolist())
    hard_similarities = np.dot(example['vector'], hard_vectors.T)
    hardest_indices = np.argsort(-hard_similarities)[:3]
    for idx in hardest_indices:
        print(f"Similarity: {hard_similarities[idx]:.4f}, DOI: {hard_findings.iloc[idx]['doi']}")
        pprint(hard_findings.iloc[idx]['findings'])
        print("-----")


def print_target_contributions(idx: int) -> None:
    row = error_rows.iloc[idx]
    print("Original sentence:")
    print(row["sent_original"])

    target_dois = row["citation_dois"]
    print(f"Target DOIs: {target_dois}")
    target_records = {
        doi: new_findings_exploded[new_findings_exploded["doi"] == doi]["findings"] for doi in target_dois
    }
    pprint("Target findings:")
    for doi in target_records:
        print(f"DOI: {doi}")
        for i, finding in enumerate(target_records[doi]):
            print(f"{i}: {finding}")
        print("-----")

idx = 10

print_target_contributions(idx)
analyze_error_row(idx)

Original sentence:
This neutrino-driven mechanism works in spherically symmetric simulations only for low-mass progenitors with an ONeMg core (e.g. Leung et al. 2020 ), also known as electron capture EC supernovae (–).
Target DOIs: ['10.3847/1538-4357/ab5d2f']
'Target findings:'
DOI: 10.3847/1538-4357/ab5d2f
0: Electron-capture supernovae occur in super-asymptotic giant branch stars with masses 8-10 solar masses that form degenerate oxygen-neon-magnesium cores
1: Oxygen-neon-magnesium cores reaching 1.38 solar masses undergo electron capture on neon isotopes, triggering oxygen-neon deflagration
2: Two-dimensional hydrodynamical simulations show bifurcation between electron-capture induced collapse and thermonuclear explosion in oxygen-neon-magnesium cores
3: The final fate of oxygen-neon-magnesium cores depends mainly on central density, with higher densities favoring collapse into neutron stars
4: Oxygen-neon-magnesium cores from stellar evolutionary models have high tendency to colla

In [36]:
import re
text = research[research["doi"] == "10.1086/186504"].iloc[0]['body']
substring = "anisotropy"
locations = [m.start() for m in re.finditer(substring, text)]
for location in locations:
    print(text[location-150:location+100])
    print("-----")

ng, or known Galactic emission. The structure is consistent with a thermal spectrum at 31, 53, and 90 0Hz as expected for cosmic microwave background anisotropy~ The rms sky variation, smoothed to a total 100 FWHM Gaussian, is 30 ± 5 pK (AT/T = 11 x 
-----
 The rms sky variation, smoothed to a total 100 FWHM Gaussian, is 30 ± 5 pK (AT/T = 11 x IO_6) for Galactic latitude I b I > 20° data with the dipole anisotropy removed. The mis cosmic quadrupole amplitude is 13 ± 4 ~iK (AT/T ~ 5 x lo_6). The angular
-----
 of large-scale structure in the universe. The COBE DMR instrument, described by Smoot et al. (1990), is designed to measure the large-angular--scale anisotropy of the CMB. The instrument operates at three frequencies : 31.5, 53, and 90 GHz (waveleng
-----
lity (Wright et al. 1992). These new results are consistent with, and substantially more sensitive than, the previously published large-angular-scale anisotropy measurements, in particular those of Princeton (Fixsen et al. 1983),

In [19]:
error_rows['citation_dois'].iloc[1]

['10.1088/0004-637X/746/1/90', '10.1088/0004-637X/775/1/41']

In [65]:
target_vector = embedder(
    ["Warm Jupiter systems may undergo stalled tidal migration that produces warm Jupiters rather than hot Jupiters"]
)[0]
# query_vector = error_rows.iloc[0]["vector"]
query_vector = embedder(
    [
        "found a dynamical configuration that can further extend the fraction of time spent as a warm Jupiter but did not quantify the likelihood of this configuration or its effect on the observed ratio of warm to hot Jupiters"
    ]
)[0]
print(f"Cosine similarity: {query_vector.dot(target_vector):.4f}")

Cosine similarity: 0.6367


### Revision 2

In [None]:
with open("../src/citeline/llm/prompts/original_contributions_v2.txt", "r") as f:
    prompt_template = f.read()

NEW_FINDINGS_FILENAME = "new_findings_v2.jsonl"

with open(NEW_FINDINGS_FILENAME, "w") as f:
    for doi in tqdm(dois_to_process):
        paper = doi_to_paper(doi)
        prompt = prompt_template.format(paper=paper)
        try:
            response = deepseek(prompt)
        except Exception as e:
            print(f"Error processing doi {doi}: {e}")
            continue
        try:
            data = json.loads(response)
            data["doi"] = doi
            f.write(json.dumps(data) + "\n")
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for doi {doi}. Response was:\n{response}")
            with open("failed_dois.txt", "a") as f_fail:
                f_fail.write(doi + "\n")
            continue

In [None]:
new_findings = pd.read_json(NEW_FINDINGS_FILENAME, lines=True)
print(f"Loaded {len(new_findings)} new findings")

new_findings_exploded = new_findings.explode("findings")
new_findings_exploded["vector"] = embedder(new_findings_exploded["findings"].tolist()).tolist()
new_findings_exploded.head()

In [None]:
# Save previous iteration and reset df for new results
sample_old = sample.copy()

# Get new similarity to target
sample["new_target_similarities"] = None
sample["new_hard_similarities"] = None

for idx, row in sample.iterrows():
    # For each target doi, compute the max similarity wrt the new embeddings
    query_vector = row["vector"]
    new_similarities = []
    for target_doi in row["citation_dois"]:
        target_vectors = get_vectors_by_doi(target_doi)
        new_similarities.append(np.max(np.dot(query_vector, target_vectors.T)))
    sample.at[idx, "new_target_similarities"] = new_similarities

    # Collect all the hard vectors, compute the hard similarities
    new_hard_similarities = []
    for doi in row["hard_dois"]:
        candidate_vectors = get_vectors_by_doi(doi)
        new_hard_similarities.append(np.max(np.dot(query_vector, candidate_vectors.T)))
    sample.at[idx, "new_hard_similarities"] = new_hard_similarities

compute_margins(
    sample, target_col="new_target_similarities", hard_col="new_hard_similarities", margin_col_name="new_margins"
)
sample.head()

diffs = compute_margin_diffs(sample, new_col="new_margins", ref_col="old_margins")
print(diffs.describe())

In [None]:
error_rows = sample[sample["new_margins"].apply(lambda margins: any(margin < 0 for margin in margins))]
print(f"Number of rows with negative new margins: {len(error_rows)}")
error_rows

In [None]:
# Print the target contributions for an error row
idx = 0
analyze_error_row(idx)

def print_target_contributions(idx: int) -> None:
    row = error_rows.iloc[idx]
    print("Original sentence:")
    print(row["sent_original"])

    target_dois = row["citation_dois"]
    target_records = {doi: new_findings_exploded[new_findings_exploded["doi"] == doi]['findings'] for doi in target_dois}
    pprint("Target findings:")
    for doi in target_records:
        print(f"DOI: {doi}")
        for i, finding in enumerate(target_records[doi]):
            print(f"{i}: {finding}")
        print("-----")
print(f"Sentence in context:\n{error_rows.iloc[idx]['sent_no_cit']}")
print_target_contributions(idx)

In [None]:
error_rows.iloc[idx]['sent_no_cit']

In [None]:
target_vector = embedder(
    [
        "Deep optical images shows a faint elliptical ring structure orbiting the spiral galaxy NGC 5907",
    ]
)[0]
# query_vector = error_rows.iloc[0]["vector"]
query_vector = embedder(
    [
        "However, deep optical images of a number of spiral galaxies, such as NGC 253, M 83, M 104, NGC 2855, (Malin and Hadley 1997) and NGC 5907 (), do show unusual, faint features in their surroundings.",
    ]
)[0]
print(f"Cosine similarity: {query_vector.dot(target_vector):.4f}")

In [None]:
hard_vector = embedder(["Most extended and complete luminosity function obtained for Galactic bulge to date"])[0]
print(f"Cosine similarity: {np.dot(hard_vector, query_vector):.4f}")

In [None]:
for i, row in new_findings_exploded[new_findings_exploded["doi"] == "10.1086/164480"].iterrows():
    print(f"Finding {i}:")
    pprint(row["findings"])
    print("-----")