In [3]:
import json
import random

import numpy as np
import pandas as pd

from embedders import Embedder

# Embedder applied to document-side text (for_queries=False); the settings
# below are echoed by the print so the cell output records what was used.
document_embedder = Embedder.create(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    device="mps",
    normalize=True,
    for_queries=False,
)
print(document_embedder)

Qwen/Qwen3-Embedding-0.6B, device=mps, normalize=True, for_queries=False, dim=1024


In [6]:
# Load the DOIs this experiment targets. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open("target_dois.json") as dois_file:
    target_dois = json.load(dois_file)

research = pd.read_json("../data/research_used.jsonl", lines=True)
# To save memory, filter the research df to only include target DOIs
research = research[research["doi"].isin(target_dois)]
# The two counts should match when every target DOI has exactly one research record.
print(len(target_dois))
print(len(research))

122
122


In [14]:
from LLMFunction import ChatResponse
from models import Findings

# LLM and prompt template used to extract findings from a paper.
MODEL_NAME = "mistral-nemo"
PROMPT_PATH = "prompts/original_contributions.txt"

# Callable LLM wrapper configured with the constants above; `Findings` is
# passed as the output model (presumably the schema replies are parsed into).
chat_response = ChatResponse(
    model_name=MODEL_NAME,
    prompt_path=PROMPT_PATH,
    output_model=Findings,
)


def record_to_paper(record: pd.Series) -> str:
    """
    Build the plain-text paper that is sent to the LLM.

    The record's title, abstract and body are joined, in that order, with a
    blank line between consecutive sections.

    Args:
        record: Any object exposing `title`, `abstract` and `body` attributes,
            e.g. a DataFrame row as a `pd.Series` or an `itertuples()` row.

    Returns:
        The joined text. Sections that are not strings (None / NaN, i.e. a
        missing value) are left out rather than making `str.join` raise
        TypeError; string sections, including empty ones, are kept as-is.
    """
    sections = (record.title, record.abstract, record.body)
    return "\n\n".join(section for section in sections if isinstance(section, str))

def record_to_findings(record) -> list[str]:
    """
    Ask the LLM for the findings of the paper described by `record`.

    The record is rendered to text with `record_to_paper`, passed to
    `chat_response` under the "paper" key, and the `findings` attribute of the
    reply is returned.
    """
    llm_reply = chat_response({"paper": record_to_paper(record)})
    return llm_reply.findings

In [19]:
from tqdm import tqdm

# Collect one record per (paper, finding) pair and build the DataFrame once at
# the end: pd.concat inside the loop re-copies every accumulated row on each
# iteration, and concatenating with an empty frame emits a FutureWarning in
# recent pandas versions.
finding_rows = []
# itertuples() is a generator without a length, so pass `total` explicitly to
# get a real progress bar (percentage and ETA) instead of a bare counter.
for example in tqdm(
    research.itertuples(), total=len(research), desc="Processing research records"
):
    doi = example.doi
    findings = record_to_findings(example)
    print(f"DOI: {doi}, {len(findings)} findings generated")
    embeddings = document_embedder(findings)
    # Papers with zero findings simply contribute no rows.
    finding_rows.extend(
        {"doi": doi, "finding": finding, "embedding": embedding}
        for finding, embedding in zip(findings, embeddings)
    )

# `columns` keeps the expected schema even if no findings were produced at all.
research_df = pd.DataFrame(finding_rows, columns=["doi", "finding", "embedding"])

Processing research records: 0it [00:00, ?it/s]

DOI: 10.1086/305588, 3 findings generated


Processing research records: 1it [00:14, 14.75s/it]

DOI: 10.1086/157753, 2 findings generated


Processing research records: 2it [00:27, 13.54s/it]

DOI: 10.1093/mnras/200.1.115, 4 findings generated


Processing research records: 3it [00:43, 14.86s/it]

DOI: 10.1086/154079, 5 findings generated


Processing research records: 4it [00:59, 15.17s/it]

DOI: 10.1046/j.1365-8711.2001.04006.x, 3 findings generated


Processing research records: 5it [01:11, 14.08s/it]

DOI: 10.1086/147433, 6 findings generated


Processing research records: 6it [01:27, 14.65s/it]

DOI: 10.1086/175121, 4 findings generated


Processing research records: 7it [01:42, 14.95s/it]

DOI: 10.1086/156499, 3 findings generated


Processing research records: 8it [01:55, 14.33s/it]

DOI: 10.1086/164050, 5 findings generated


Processing research records: 9it [02:10, 14.51s/it]

DOI: 10.1046/j.1365-8711.2003.06206.x, 18 findings generated


Processing research records: 11it [02:50, 17.00s/it]

DOI: 10.5303/JKAS.2015.48.5.237, 5 findings generated
DOI: 10.1046/j.1365-8711.2002.05128.x, 2 findings generated


Processing research records: 12it [03:03, 15.68s/it]

DOI: 10.1088/0067-0049/220/1/11, 3 findings generated


Processing research records: 13it [03:17, 15.22s/it]

DOI: 10.1093/mnras/193.3.439, 2 findings generated


Processing research records: 15it [03:40, 13.27s/it]

DOI: 10.1088/0004-637X/810/1/25, 0 findings generated
DOI: 10.1086/422899, 5 findings generated


Processing research records: 16it [03:59, 14.83s/it]

DOI: 10.1093/mnras/285.2.403, 2 findings generated


Processing research records: 17it [04:12, 14.22s/it]

DOI: 10.1086/176805, 4 findings generated


Processing research records: 18it [04:27, 14.53s/it]

DOI: 10.1088/0004-637X/690/1/231, 5 findings generated


Processing research records: 19it [04:42, 14.69s/it]

DOI: 10.1086/172786, 5 findings generated


Processing research records: 20it [04:58, 15.02s/it]

DOI: 10.1051/0004-6361:20053838, 2 findings generated


Processing research records: 21it [05:10, 14.22s/it]

DOI: 10.1086/339181, 6 findings generated


Processing research records: 22it [05:28, 15.30s/it]

DOI: 10.1093/mnras/203.1.1, 3 findings generated


Processing research records: 23it [05:42, 14.80s/it]

DOI: 10.1126/science.aar7480, 3 findings generated


Processing research records: 24it [05:56, 14.54s/it]

DOI: 10.1086/587434, 4 findings generated


Processing research records: 25it [06:11, 14.91s/it]

DOI: 10.1051/0004-6361/201321397, 7 findings generated


Processing research records: 26it [06:30, 16.02s/it]

DOI: 10.1051/0004-6361/201425222, 4 findings generated


Processing research records: 28it [06:59, 15.29s/it]

DOI: 10.1086/376724, 4 findings generated
DOI: 10.1086/424585, 5 findings generated


Processing research records: 29it [07:14, 15.10s/it]

DOI: 10.1086/320852, 8 findings generated


Processing research records: 30it [07:34, 16.76s/it]

DOI: 10.1086/322995, 7 findings generated


Processing research records: 31it [07:54, 17.50s/it]

DOI: 10.1051/0004-6361:20034469, 2 findings generated


Processing research records: 32it [08:05, 15.62s/it]

DOI: 10.1086/324186, 5 findings generated


Processing research records: 33it [08:23, 16.53s/it]

DOI: 10.1086/157992, 5 findings generated


Processing research records: 34it [08:39, 16.20s/it]

DOI: 10.1086/505417, 4 findings generated


Processing research records: 36it [09:08, 15.40s/it]

DOI: 10.1038/s41550-017-0088, 5 findings generated
DOI: 10.1086/305523, 5 findings generated


Processing research records: 37it [09:29, 16.91s/it]

DOI: 10.1093/mnras/stu2726, 5 findings generated


Processing research records: 38it [09:45, 16.73s/it]

DOI: 10.1086/430500, 4 findings generated


Processing research records: 40it [10:17, 16.33s/it]

DOI: 10.1093/mnras/stu987, 4 findings generated


Processing research records: 41it [10:31, 15.67s/it]

DOI: 10.1016/0019-1035(80)90076-7, 3 findings generated
DOI: 10.1088/0004-637X/783/2/114, 1 findings generated


Processing research records: 42it [10:48, 16.04s/it]

DOI: 10.3847/0004-637X/832/1/41, 3 findings generated


Processing research records: 44it [11:18, 15.59s/it]

DOI: 10.1126/science.aaf0714, 5 findings generated
DOI: 10.1086/368016, 1 findings generated


Processing research records: 45it [11:31, 14.80s/it]

DOI: 10.1086/156310, 2 findings generated


Processing research records: 46it [11:45, 14.49s/it]

DOI: 10.1086/163350, 3 findings generated


Processing research records: 47it [12:00, 14.56s/it]

DOI: 10.3847/2041-8213/ac6674, 5 findings generated


Processing research records: 49it [12:30, 14.86s/it]

DOI: 10.1046/j.1365-8711.2002.05041.x, 3 findings generated
DOI: 10.1029/2005GL024484, 5 findings generated


Processing research records: 50it [12:46, 15.01s/it]

DOI: 10.1086/115504, 5 findings generated


Processing research records: 51it [13:00, 14.83s/it]

DOI: 10.1093/mnras/213.1.1, 4 findings generated


Processing research records: 52it [13:15, 14.79s/it]

DOI: 10.1086/190360, 3 findings generated


Processing research records: 53it [13:29, 14.70s/it]

DOI: 10.1126/science.1251053, 4 findings generated


Processing research records: 54it [13:43, 14.54s/it]

DOI: 10.1088/0004-6256/136/6/2782, 2 findings generated


Processing research records: 56it [14:07, 13.01s/it]

DOI: 10.1086/117477, 0 findings generated
DOI: 10.1051/0004-6361:20041471, 9 findings generated


Processing research records: 57it [14:29, 15.61s/it]

DOI: 10.1051/0004-6361:20021660, 6 findings generated


Processing research records: 58it [14:46, 16.05s/it]

DOI: 10.1038/s41550-017-0184, 5 findings generated


Processing research records: 59it [15:00, 15.64s/it]

DOI: 10.1086/190455, 1 findings generated


Processing research records: 60it [15:12, 14.42s/it]

DOI: 10.1093/mnras/sts448, 3 findings generated


Processing research records: 61it [15:30, 15.44s/it]

DOI: 10.1088/0004-637X/754/2/105, 7 findings generated


Processing research records: 63it [16:11, 17.11s/it]

DOI: 10.1086/175072, 1 findings generated
DOI: 10.1086/177166, 3 findings generated


Processing research records: 64it [16:23, 15.69s/it]

DOI: 10.1103/PhysRevD.81.062002, 5 findings generated


Processing research records: 65it [16:40, 16.07s/it]

DOI: 10.1088/0004-637X/723/2/1019, 7 findings generated


Processing research records: 66it [16:59, 16.89s/it]

DOI: 10.1086/303510, 7 findings generated


Processing research records: 67it [17:20, 18.29s/it]

DOI: 10.1086/161573, 2 findings generated


Processing research records: 68it [17:34, 16.80s/it]

DOI: 10.1051/0004-6361/201833650, 6 findings generated


Processing research records: 69it [17:53, 17.48s/it]

DOI: 10.1086/308231, 6 findings generated


Processing research records: 70it [18:13, 18.34s/it]

DOI: 10.1093/mnras/stv2400, 7 findings generated


Processing research records: 71it [18:32, 18.51s/it]

DOI: 10.1093/mnras/stx2463, 4 findings generated


Processing research records: 72it [18:49, 18.17s/it]

DOI: 10.1126/science.1223344, 3 findings generated


Processing research records: 73it [19:04, 17.14s/it]

DOI: 10.1093/mnras/sts250, 9 findings generated


Processing research records: 75it [19:31, 14.99s/it]

DOI: 10.1088/0004-637X/778/1/58, 0 findings generated


Processing research records: 76it [19:47, 15.18s/it]

DOI: 10.1086/307608, 5 findings generated
DOI: 10.1111/j.1365-2966.2009.14960.x, 3 findings generated


Processing research records: 77it [20:01, 14.84s/it]

DOI: 10.1093/mnras/stv1194, 4 findings generated


Processing research records: 78it [20:15, 14.63s/it]

DOI: 10.1051/0004-6361/201118158, 6 findings generated


Processing research records: 79it [20:32, 15.24s/it]

DOI: 10.1038/416059a, 3 findings generated


Processing research records: 81it [21:00, 14.97s/it]

DOI: 10.1088/0004-637X/696/1/891, 4 findings generated
DOI: 10.1086/151073, 3 findings generated


Processing research records: 82it [21:14, 14.55s/it]

DOI: 10.1016/0920-5632(93)90087-M, 5 findings generated


Processing research records: 84it [21:46, 15.25s/it]

DOI: 10.1086/306078, 5 findings generated
DOI: 10.1093/mnras/204.2.415, 2 findings generated


Processing research records: 85it [22:00, 14.90s/it]

DOI: 10.1086/301146, 8 findings generated


Processing research records: 87it [22:31, 14.74s/it]

DOI: 10.1086/191534, 1 findings generated
DOI: 10.1086/341326, 6 findings generated


Processing research records: 88it [22:48, 15.39s/it]

DOI: 10.1086/177696, 2 findings generated


Processing research records: 89it [23:01, 14.60s/it]

DOI: 10.1086/173325, 3 findings generated


Processing research records: 90it [23:15, 14.53s/it]

DOI: 10.1088/2041-8205/748/2/L29, 4 findings generated


Processing research records: 92it [23:47, 15.10s/it]

DOI: 10.1093/mnras/271.4.993, 4 findings generated
DOI: 10.1093/mnras/185.1.77P, 3 findings generated


Processing research records: 93it [23:58, 13.91s/it]

DOI: 10.1103/PhysRevLett.81.2858, 4 findings generated


Processing research records: 95it [24:25, 13.79s/it]

DOI: 10.1086/152165, 3 findings generated


Processing research records: 96it [24:39, 13.82s/it]

DOI: 10.1086/312014, 4 findings generated


Processing research records: 97it [24:53, 13.87s/it]

DOI: 10.1093/mnras/staa1158, 3 findings generated
DOI: 10.1038/33874, 5 findings generated


Processing research records: 98it [25:07, 13.90s/it]

DOI: 10.48550/arXiv.astro-ph/9712293, 3 findings generated


Processing research records: 99it [25:19, 13.30s/it]

DOI: 10.1086/346149, 5 findings generated


Processing research records: 101it [25:51, 14.62s/it]

DOI: 10.3847/1538-4357/aa8ee9, 4 findings generated
DOI: 10.1093/mnras/289.2.490, 2 findings generated


Processing research records: 102it [26:03, 13.90s/it]

DOI: 10.1088/0004-637X/699/2/1092, 5 findings generated


Processing research records: 103it [26:20, 14.73s/it]

DOI: 10.1029/2008RS003997, 4 findings generated


Processing research records: 104it [26:36, 15.31s/it]

DOI: 10.1088/2041-8205/719/2/L140, 3 findings generated


Processing research records: 105it [26:53, 15.61s/it]

DOI: 10.3847/0004-637X/820/2/89, 3 findings generated


Processing research records: 106it [27:08, 15.44s/it]

DOI: 10.1088/0004-6256/136/4/1510, 6 findings generated


Processing research records: 108it [27:40, 15.88s/it]

DOI: 10.3847/1538-4357/aaecd1, 5 findings generated


Processing research records: 109it [27:56, 16.03s/it]

DOI: 10.1088/0004-637X/795/2/156, 5 findings generated
DOI: 10.48550/arXiv.astro-ph/9902329, 8 findings generated


Processing research records: 110it [28:19, 18.13s/it]

DOI: 10.1086/307363, 6 findings generated


Processing research records: 111it [28:38, 18.21s/it]

DOI: 10.1051/0004-6361:20011035, 3 findings generated


Processing research records: 112it [28:49, 16.18s/it]

DOI: 10.1086/309688, 5 findings generated


Processing research records: 113it [29:07, 16.72s/it]

DOI: 10.1086/374687, 6 findings generated


Processing research records: 115it [29:39, 16.20s/it]

DOI: 10.1086/159439, 4 findings generated
DOI: 10.1086/368111, 5 findings generated


Processing research records: 116it [29:59, 17.19s/it]

DOI: 10.1086/342159, 3 findings generated


Processing research records: 117it [30:11, 15.87s/it]

DOI: 10.1088/0004-6256/138/5/1271, 7 findings generated


Processing research records: 118it [30:32, 17.38s/it]

DOI: 10.1086/168518, 5 findings generated


Processing research records: 120it [31:05, 16.85s/it]

DOI: 10.1093/mnras/stu949, 5 findings generated
DOI: 10.48550/arXiv.astro-ph/0703642, 7 findings generated


Processing research records: 121it [31:21, 16.50s/it]

DOI: 10.1086/523960, 11 findings generated


Processing research records: 122it [31:50, 15.66s/it]


In [20]:
# Write the (doi, finding, embedding) table to a Parquet file in the working directory.
research_df.to_parquet("research_findings.parquet")

In [None]:
def similarity(query_vector: np.ndarray, ref_vectors: np.ndarray) -> np.float32:
    """
    Return the highest similarity between a query vector and reference vectors.

    The score is the plain dot product, which equals cosine similarity only
    when all vectors are unit-normalized (`document_embedder` is created with
    normalize=True; the query embedder must be configured the same way).

    Args:
        query_vector: 1-D array of shape (dim,).
        ref_vectors: 2-D array of shape (n_refs, dim). A single 1-D vector of
            shape (dim,) is also accepted and treated as one reference.

    Returns:
        The maximum dot product over all reference vectors.

    Raises:
        ValueError: If `ref_vectors` is empty (e.g. a paper for which no
            findings were generated), because a maximum is undefined.
    """
    refs = np.asarray(ref_vectors)
    if refs.size == 0:
        raise ValueError("similarity() needs at least one reference vector")
    # atleast_2d lets a lone reference vector go through the same code path.
    scores = np.dot(np.atleast_2d(refs), query_vector)
    # Vectorized reduction instead of Python's builtin max over array elements.
    return scores.max()


def target_similarity(example: pd.Series, research: pd.DataFrame) -> float:
    """
    Score how well a citing sentence matches the findings of one paper it cites.

    Args:
        example: Record exposing `citation_dois` (a non-empty sequence of DOIs)
            and `sent_no_cit` (presumably the sentence text with the citation
            removed; confirm against the dataset).
        research: DataFrame of papers with a `doi` column plus the `title`,
            `abstract` and `body` fields read by `record_to_paper`.

    Returns:
        The maximum similarity between the sentence embedding and the
        embeddings of the chosen paper's findings.

    Raises:
        IndexError: If `example.citation_dois` is empty.
        KeyError: If the chosen DOI is not present in `research`.
    """
    # Choose a target paper from the cited papers (if more than one). This uses
    # the global `random` state; call random.seed(...) first for reproducible runs.
    target_doi = random.choice(example.citation_dois)

    matches = research.loc[research["doi"] == target_doi]
    if matches.empty:
        raise KeyError(f"DOI {target_doi!r} not found in research")
    # Take the first match explicitly: .squeeze() only yields a row when exactly
    # one record matches and silently returns a DataFrame otherwise.
    target_record = matches.iloc[0]

    # Generate the target paper's contributions with the shared helper.
    target_contributions = record_to_findings(target_record)

    # NOTE(review): `query_embedder` is not defined in the visible cells;
    # presumably an Embedder created with for_queries=True. Confirm it exists.
    query_vector = query_embedder([example.sent_no_cit])[0]
    ref_vectors = document_embedder(target_contributions)
    return similarity(query_vector, ref_vectors)