In [1]:
import json
import logging
from datetime import datetime
import os
import pandas as pd
from tqdm import tqdm

from citeline.apis.deepseek import get_deepseek_formatted_response
from citeline.database.milvusdb import MilvusDB
from citeline.embedders import Embedder
from citeline.llm.models import Findings

# Input corpus location (a .jsonl file).
PATH_TO_RESEARCH_USED = "../data/research_used.jsonl"

# Route log records to a file; filemode="w" truncates that file on every run.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    filename="../logs/contribution_processor.log",
    filemode="w",
)
logger = logging.getLogger(__name__)

# Enable tqdm's pandas integration.
tqdm.pandas()


def reconstruct_paper(row: pd.Series) -> str:
    """Assemble a paper's title, abstract, and body into one labelled text block.

    Args:
        row: A record exposing 'title', 'abstract', and 'body' by key.

    Returns:
        The three fields, each prefixed with its label, joined by newlines.
    """
    sections = (
        f"Title: {row['title']}",
        f"Abstract: {row['abstract']}",
        f"Body: {row['body']}",
    )
    return "\n".join(sections)


# Read the corpus (one JSON object per line) and turn 'pubdate' strings such as
# "1974-05-01" into integers such as 19740501 by stripping the dashes.
research = pd.read_json(PATH_TO_RESEARCH_USED, lines=True).assign(
    pubdate=lambda frame: frame["pubdate"].str.replace("-", "").astype("int")
)
research.head(2)

Unnamed: 0,bibcode,abstract,aff,author,bibstem,doctype,doi,id,pubdate,title,read_count,reference,data,citation_count,citation,body,dois,keywords,loaded_from,body_sentences
0,1974MNRAS.167P..31F,The relative positions of the high and low bri...,"[-, -]","[Fanaroff, B. L., Riley, J. M.]","[MNRAS, MNRAS.167]",article,10.1093/mnras/167.1.31P,1307050,19740501,The morphology of extragalactic radio sources ...,32,"[1968MNRAS.138..259M, 1969MNRAS.145...31M, 196...","[NED:80, SIMBAD:81]",2681,"[1974MNRAS.169..395B, 1974Natur.250..625W, 197...","Mon. Not. R. astr. Soc. (i~y~.) 167, Short Com...",[10.1093/mnras/167.1.31P],,data/json/Astro_Research.json,"[Mon. Not. R. astr. Soc., (i~y~.) 167, Shor..."
1,1955ApJ...121..161S,The evolutionary significance of the observed ...,[-],"[Salpeter, Edwin E.]","[ApJ, ApJ...121]",article,10.1086/145971,1123816,19550101,The Luminosity Function and Stellar Evolution.,121,"[1938ApJ....88..429K, 1939POMin...7....1L, 194...",,8170,"[1956ApJS....2..365W, 1957ApJ...125..422S, 195...",1955ApJ. . .121. .161S THE LUMINOSITY FUNCTION...,[10.1086/145971],,data/json/Astro_Research.json,"[1955ApJ. . .121., .161S THE LUMINOSITY FUNC..."


In [2]:
# Load prompt template.
# The encoding is pinned so the prompt text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(
    "../src/citeline/llm/prompts/original_contributions_v3.txt", "r", encoding="utf-8"
) as file:
    prompt_template = file.read()

# Preview only the beginning of the template to keep the cell output short.
print(prompt_template[:500])

You are given the full text of a single scientific research paper. Extract all factual statements asserted or utilized in this paper, including both novel contributions and descriptions of methods/instruments/data used.

Hard requirements:

1. Output ONLY a single JSON object: {{"findings":[ ... ]}}. No extra text, no commentary, no additional keys.
2. Each array item must be a single, self-contained declarative sentence (one idea per sentence). Avoid pronouns and author-phrases (no "we", "autho


In [3]:
# Location of the small JSON file that stores the index of the next paper to process.
PROGRESS_PATH = '../data/progress_research_contributions.json'

if os.path.exists(PROGRESS_PATH):
    # A previous run left a checkpoint: pick up where it stopped.
    print("Progress file found. Resuming from last saved index...")
    with open(PROGRESS_PATH, 'r') as progress_file:
        progress = json.load(progress_file)
    print(progress)
else:
    # First run: start from the top and persist the initial checkpoint.
    print("Processing will begin at index 0. Creating progress file...")
    progress = {'next_index': 0}
    with open(PROGRESS_PATH, 'w') as progress_file:
        json.dump(progress, progress_file)

Progress file found. Resuming from last saved index...
{'next_index': 7856}


In [4]:
OUTFILE = '../data/research_contributions_v3.jsonl'
print(f"Output will be written to {OUTFILE}")
# Append mode: a resumed run adds to the findings already written by earlier runs.
with open(OUTFILE, "a") as f:
    # Walk the corpus starting at the index stored in the progress file.
    for i in tqdm(range(progress['next_index'], len(research))):
        row = research.iloc[i]
        paper = reconstruct_paper(row)
        prompt = prompt_template.format(paper=paper)
        response = get_deepseek_formatted_response(prompt, model=Findings)

        if not response or "findings" not in response:
            # Only logged: the progress file is not updated for this paper, and
            # once a later paper succeeds a resumed run will not revisit it.
            logger.warning(f"No findings for DOI {row['doi']}")
            continue

        findings = response["findings"]

        for finding in findings:
            # Use a separate name for the output record so `row` keeps
            # referring to the DataFrame row for every finding.
            record = {
                "text": finding,
                "doi": row["doi"],
                "citation_count": int(row["citation_count"]),
                "pubdate": int(row["pubdate"])
            }
            f.write(json.dumps(record) + "\n")

        # Hand the buffered findings to the OS before marking the paper as done;
        # otherwise a crash could advance the checkpoint past findings that
        # never reached the output file.
        f.flush()

        # Update progress
        progress['next_index'] = i + 1
        with open(PROGRESS_PATH, 'w') as f_prog:
            json.dump(progress, f_prog)


Output will be written to ../data/research_contributions_v3.jsonl


100%|██████████| 2133/2133 [20:06:56<00:00, 33.95s/it]   


In [7]:
def find_unprocessed_dois() -> set:
    """Return the DOIs in `research` that have no rows in the output file.

    OUTFILE is re-read as JSON Lines on every call, so the result reflects
    whatever has been written to it so far.

    Returns:
        The set of DOIs found in `research['doi']` but not in the 'doi'
        column of OUTFILE.
    """
    written = pd.read_json(OUTFILE, lines=True)
    seen_dois = set(written['doi'].unique())
    expected_dois = set(research['doi'].unique())
    return expected_dois.difference(seen_dois)


# DOIs for which no finding has been written yet
unprocessed_dois = find_unprocessed_dois()



In [10]:
# Retry pass: append findings for the DOIs that have no rows in OUTFILE yet.
with open(OUTFILE, "a") as f:
    for doi in tqdm(unprocessed_dois):
        # Take the first research row carrying this DOI.
        row = research[research['doi'] == doi].iloc[0]
        paper = reconstruct_paper(row)
        prompt = prompt_template.format(paper=paper)
        response = get_deepseek_formatted_response(prompt, model=Findings)

        if not response or "findings" not in response:
            logger.warning(f"No findings for DOI {row['doi']}")
            continue

        findings = response["findings"]

        for finding in findings:
            # Use a separate name for the output record so `row` keeps
            # referring to the DataFrame row for every finding.
            record = {
                "text": finding,
                "doi": row["doi"],
                "citation_count": int(row["citation_count"]),
                "pubdate": int(row["pubdate"])
            }
            f.write(json.dumps(record) + "\n")
    

100%|██████████| 1/1 [01:40<00:00, 100.86s/it]


In [11]:
# Re-read the output file and count the DOIs from `research` that still have no findings.
unprocessed_dois = find_unprocessed_dois()
print(f"Unprocessed DOIs remaining: {len(unprocessed_dois)}")

Unprocessed DOIs remaining: 0


In [13]:
# Confirm that the number of findings per paper is reasonable (e.g. > 5).
# Rows are grouped by DOI, so each group's size is the number of findings
# written for that paper; describe() summarises the distribution of those sizes.
contributions = pd.read_json(OUTFILE, lines=True)
contributions_per_paper = contributions.groupby('doi').size()
contributions_per_paper.describe()

count    9989.000000
mean       29.961558
std        12.720767
min        10.000000
25%        21.000000
50%        27.000000
75%        34.000000
max       167.000000
dtype: float64

## Embeddings

In [18]:
# Start from a frame without a 'vector' column so the embedding step can be
# re-run cleanly; errors='ignore' makes this a no-op when the column is absent.
# Reassigning instead of mutating in place keeps the cell's effect explicit.
contributions = contributions.drop(columns=['vector'], errors='ignore')
contributions.head()

Unnamed: 0,text,doi,citation_count,pubdate
0,Extragalactic radio sources in the 3CR complet...,10.1093/mnras/167.1.31P,2681,19740501
1,Radio sources with luminosities below approxim...,10.1093/mnras/167.1.31P,2681,19740501
2,Radio sources with luminosities above approxim...,10.1093/mnras/167.1.31P,2681,19740501
3,Class I morphology is defined as sources where...,10.1093/mnras/167.1.31P,2681,19740501
4,Class II morphology is defined as sources wher...,10.1093/mnras/167.1.31P,2681,19740501


In [None]:
embedder = Embedder.create("Qwen/Qwen3-Embedding-0.6B", device="mps", normalize=True)
batch_size = 16
# Placeholder column; each cell is filled with that row's embedding below.
contributions['vector'] = None

for i in tqdm(range(0, len(contributions), batch_size), desc="Embedding contributions"):
    # Positional slice of at most `batch_size` rows.
    batch = contributions.iloc[i:i + batch_size]
    vectors = embedder(batch['text'])

    # Assign vectors to individual rows.
    # `.at` looks rows up by index label, so pair each vector with the label of
    # the row it was computed from rather than with the positional offset
    # `i + j`, which only coincides with the label for a default 0..n-1 index.
    for label, vec in zip(batch.index, vectors):
        contributions.at[label, "vector"] = vec