In [1]:
from database.database import Database


# Create the project's Database wrapper; later cells use `db.conn` for cursors and commits.
db = Database()
# Sanity-check the connection before doing any work (in the recorded run this
# printed the database/user/host/port table and the PostgreSQL server version).
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


In [2]:
import pandas as pd
import torch
from Embedders import get_embedder

# Tunable inputs for this notebook.
DATASET_PATH = "data/dataset/split/train.jsonl"
EMBEDDER_NAME = "BAAI/bge-large-en-v1.5"

# Pick the best available backend: CUDA GPU first, then Apple-silicon MPS, then CPU.
# torch.backends.mps.is_available() is the documented availability check and is
# present on older torch releases where torch.mps.is_available() may not exist.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# One JSON object per line; each row is a training sample.
samples = pd.read_json(DATASET_PATH, lines=True)
# normalize=True is forwarded to the project's embedder factory.
embedder = get_embedder(EMBEDDER_NAME, device=device, normalize=True)
print(f"Loaded samples: {len(samples)}")
print("Embedder:", embedder)

Loaded samples: 326
Embedder: BAAI/bge-large-en-v1.5, device=mps, normalize=True


In [3]:
# Preprocessed papers, one JSON record per line. Later cells read the
# 'doi', 'title', 'abstract', 'body' and 'pubdate' fields of these rows.
RESEARCH_PATH = "data/preprocessed/research.jsonl"
research = pd.read_json(RESEARCH_PATH, lines=True)

def reconstruct_paper(example: pd.Series) -> str:
    """Rebuild a paper's full text from one row of the research data.

    Args:
        example: A row (or any mapping) providing 'title', 'abstract' and
            'body' entries.

    Returns:
        The title, the abstract prefixed with "Abstract: ", and the body,
        separated by blank lines.
    """
    title = example['title']
    abstract = example['abstract']
    body = example['body']
    sections = (f"{title}", f"Abstract: {abstract}", f"{body}")
    return "\n\n".join(sections)

# Sanity check: show the first 500 characters of the reconstructed text
# for the first row of `research`.
print("Example paper reconstruction:")
print(reconstruct_paper(research.iloc[0])[:500])

Example paper reconstruction:
The morphology of extragalactic radio sources of high and low luminosity

Abstract: The relative positions of the high and low brightness regions in the extragalactic sources in the 3 CR complete sample are found to be correlated with the luminosity of these sources.

Mon. Not. R. astr. Soc. (i~y~.) 167, Short Communication, 3 IP-35P. THE MORPHOLOGY OF EXTRAGALACTIC RADIO SOURCES OF HIGH AND LOW LUMINOSITY B. L. Fanaroff and J. M. Riley (Received 1974 March 6) SUMMARY The relative positions of t


### Establish set of documents to expand 

Since a target paper may appear in any number of training samples, we first collect the DOIs of the document expansions already inserted into the database, then subtract them from the set of target DOIs appearing in the training set.

In [4]:
# DOIs whose expansions are already stored in the `contributions` table.
with db.conn.cursor() as cursor:
    cursor.execute("SELECT DISTINCT doi from contributions")
    existing_dois = {record[0] for record in cursor.fetchall()}
expanded_dois = set(existing_dois)
print(f"Existing DOIs in database: {len(existing_dois)}")

# Every DOI cited by any training sample, minus the ones already expanded.
target_dois = {
    cited for citation_list in samples["citation_dois"] for cited in citation_list
} - expanded_dois
print(f"Target DOIs to expand: {len(target_dois)}")

Existing DOIs in database: 163
Target DOIs to expand: 81


### Iterate over training set

Next we iterate over the target DOIs collected from the training set (each training example has one or more target DOIs). Each target should be present in the `research` data frame, so we use the DOI to retrieve the `pd.Series` for that row. We can then reconstruct the paper (title, abstract, and body) and send it to the LLM for document expansion.

In [6]:
# DeepSeek exposes an OpenAI-compatible API, so we reuse the OpenAI client with a custom base URL
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv() 


def deepseek_client():
    """Build an OpenAI client that talks to the DeepSeek endpoint.

    The API key is read from the DEEPSEEK_API_KEY environment variable.

    Returns:
        An OpenAI client configured with that key and the DeepSeek base URL.

    Raises:
        AssertionError: If DEEPSEEK_API_KEY is not set in the environment.
    """
    key_name = "DEEPSEEK_API_KEY"
    assert key_name in os.environ, "DEEPSEEK_API_KEY must be set in environment variables"
    return OpenAI(
        base_url="https://api.deepseek.com",
        api_key=os.environ[key_name],
    )

# Module-level client; get_deepseek_response() reads this global when it calls the API.
client = deepseek_client()

# System prompt for the findings-extraction request. The request is made in
# JSON-object mode and the caller reads the "findings" key, so the instructions
# and the example both describe a JSON *object*, and the example is valid JSON.
SYSTEM_PROMPT = """You are an expert research assistant. Summarize the original scientific findings of the following research paper. Then write them out as a list of strings.

## Task
- Write the findings as a JSON object with a single key "findings" whose value is an array of strings, each finding being one string element.
- Do not include acknowledgments or references: focus only on the original research contributions made in this paper
- Only write out the JSON object, do not include any other text or formatting.
- The output must be valid JSON: escape any backslashes and double quotes that appear inside the strings.

Example output:
{
  "findings": ["Contribution 1", "Contribution 2", "Contribution 3"]
}

## Paper:
"""

def get_deepseek_response(paper: str):
    """Ask DeepSeek to extract the findings of one paper and return the raw reply.

    Sends SYSTEM_PROMPT as the system message and the paper text as the user
    message through the module-level `client`, requesting JSON-object output.

    Args:
        paper: Full text of the paper.

    Returns:
        The content of the first choice's message with surrounding whitespace
        stripped. The string is not parsed here.

    Raises:
        ValueError: If the API returns a message with no content.
        Exception: Any error raised by the client call propagates unchanged.
    """
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": paper},
        ],
        response_format={'type': 'json_object'},
        stream=False,
    )
    content = response.choices[0].message.content
    if content is None:
        # Fail with a clear message instead of an AttributeError on None.strip().
        raise ValueError("DeepSeek returned a response with no message content")
    return content.strip()



In [7]:
import csv
import json

ERROR_LOG_PATH = "error_log.csv"
INSERT_CONTRIBUTION_SQL = (
    "INSERT INTO contributions (embedding, text, pubdate, doi) VALUES (%s, %s, %s, %s)"
)

# Findings successfully parsed in this run, keyed by DOI.
all_findings = dict()


def log_expansion_error(doi, error, api_response=None):
    """Append one row (doi, error, api_response) to the error log.

    Fields are written through csv.writer so that commas, quotes and newlines
    inside error messages or API responses do not break the CSV structure.
    """
    with open(ERROR_LOG_PATH, "a", newline="", encoding="utf-8") as f:
        csv.writer(f, lineterminator="\n").writerow([doi, str(error), str(api_response)])


# Start each run with a fresh log containing only the header row.
with open(ERROR_LOG_PATH, "w", newline="", encoding="utf-8") as f:
    csv.writer(f, lineterminator="\n").writerow(["doi", "error", "api_response"])

for doi in target_dois:
    print(f"Processing DOI: {doi}")

    # Look the paper up once; a DOI missing from `research` is logged and skipped
    # instead of raising IndexError on .iloc[0].
    matches = research[research["doi"] == doi]
    if matches.empty:
        print(f"Warning: DOI {doi} not found in research data. Skipping.")
        log_expansion_error(doi, "DOI not found in research data")
        continue
    paper = matches.iloc[0]

    # The pubdate is required for the insert, so check it before spending an
    # API call and an embedding pass on findings that could not be stored.
    pubdate = matches["pubdate"].values[0]
    if not pubdate:
        print(f"Warning: No pubdate found for DOI {doi}. Skipping.")
        continue

    try:
        json_response = get_deepseek_response(reconstruct_paper(paper))
    except Exception as e:
        print(f"Error getting response for DOI {doi}: {e}")
        log_expansion_error(doi, e)
        continue

    try:
        # TypeError covers a top-level JSON value that is not an object.
        sentences = json.loads(json_response)['findings']
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print(f"Error parsing JSON response for DOI {doi}: {e}")
        log_expansion_error(doi, e, json_response)
        continue
    if (
        not isinstance(sentences, list)
        or not sentences
        or not all(isinstance(sentence, str) for sentence in sentences)
    ):
        problem = "'findings' must be a non-empty list of strings"
        print(f"Error parsing JSON response for DOI {doi}: {problem}")
        log_expansion_error(doi, problem, json_response)
        continue
    all_findings[doi] = sentences

    print("  Embedding & inserting findings into database...")
    embeddings = embedder(sentences)

    try:
        with db.conn.cursor() as cursor:
            for embedding, text in zip(embeddings, sentences):
                cursor.execute(INSERT_CONTRIBUTION_SQL, (embedding, text, pubdate, doi))
        # One commit per DOI: a paper's findings are stored together or not at all.
        db.conn.commit()
    except Exception:
        # Assumes db.conn is a DB-API connection (it already exposes commit());
        # roll back so the connection stays usable, then surface the error.
        db.conn.rollback()
        raise
    print(f"  Processed DOI {doi}: {len(sentences)} findings")

Processing DOI: 10.1086/430438
  Embedding & inserting findings into database...
  Processed DOI 10.1086/430438: 7 findings
Processing DOI: 10.1051/0004-6361:20054427
  Embedding & inserting findings into database...
  Processed DOI 10.1051/0004-6361:20054427: 16 findings
Processing DOI: 10.1086/301341
  Embedding & inserting findings into database...
  Processed DOI 10.1086/301341: 10 findings
Processing DOI: 10.1111/j.1365-2966.2005.10018.x
  Embedding & inserting findings into database...
  Processed DOI 10.1111/j.1365-2966.2005.10018.x: 9 findings
Processing DOI: 10.1086/421110
  Embedding & inserting findings into database...
  Processed DOI 10.1086/421110: 8 findings
Processing DOI: 10.1086/324269
  Embedding & inserting findings into database...
  Processed DOI 10.1086/324269: 19 findings
Processing DOI: 10.1093/mnras/stx071
  Embedding & inserting findings into database...
  Processed DOI 10.1093/mnras/stx071: 10 findings
Processing DOI: 10.1093/mnras/stae1049
  Embedding & ins

In [8]:
# Manually insert improperly formatted findings
doi = "10.1111/j.1365-2966.2004.07881.x"
# Findings containing LaTeX backslashes (\odot) are raw strings: "\o" is an
# invalid escape sequence in a normal string literal. The text is unchanged.
sentences = [
    "Developed a method for aperture correction using resolved imaging to remove aperture bias in star formation rate (SFR) estimates, enabling accurate total SFR calculations in galaxies.",
    r"Determined the SFR density to be 1.915^{+0.02}_{-0.01} (random)^{+0.14}_{-0.42} (systematic) h_{70}10^{-2} M_{\odot} yr^{-1} Mpc^{-3} at z=0.1 for a Kroupa initial mass function.",
    r"Found that the majority of star formation in the low-redshift Universe occurs in moderately massive galaxies (10^{10}-10^{11} M_{\odot}), typically in high surface brightness disc galaxies.",
    "Approximately 15% of all star formation takes place in galaxies showing signs of an active nucleus, and about 20% occurs in starburst galaxies.",
    r"Showed that the present to past average SFR, the Scalo b-parameter, is almost constant over almost three orders of magnitude in mass, declining only at M_{*} > 10^{10} M_{\odot}.",
    "The volume averaged b parameter is 0.408^{+0.005}_{-0.002} (random)^{+0.029}_{-0.090}h^{-1}_{70}, used to constrain the star formation history of the Universe.",
    "For the concordance cosmology, the present-day Universe is forming stars at at least 1/3 of its past average rate, corresponding to a time-scale of 7^{+0.7}_{-1.5} Gyr for an exponentially declining cosmic star formation history.",
    "Found a correlation between b and morphological type, as well as a tight correlation between the 4000-Å break (D4000) and b, suggesting D4000 can estimate b parameters for high-redshift galaxies.",
]

# Look the paper up once and take its pubdate (None when the DOI is unknown).
matches = research[research["doi"] == doi]
pubdate = matches["pubdate"].values[0] if not matches.empty else None

# Re-running this cell must not insert the same findings twice.
with db.conn.cursor() as cursor:
    cursor.execute("SELECT 1 FROM contributions WHERE doi = %s LIMIT 1", (doi,))
    already_inserted = cursor.fetchone() is not None

if not pubdate:
    print(f"Warning: No pubdate found for DOI {doi}. Skipping.")
elif already_inserted:
    print(f"DOI {doi} already has contributions in the database. Skipping.")
else:
    embeddings = embedder(sentences)
    try:
        with db.conn.cursor() as cursor:
            for embedding, text in zip(embeddings, sentences):
                # Insert into the database
                cursor.execute(
                    "INSERT INTO contributions (embedding, text, pubdate, doi) VALUES (%s, %s, %s, %s)",
                    (embedding, text, pubdate, doi),
                )
        db.conn.commit()
    except Exception:
        # Assumes db.conn is a DB-API connection (it already exposes commit());
        # roll back so the connection stays usable, then surface the error.
        db.conn.rollback()
        raise

In [None]:
# NOTE(review): `long_example` is not defined in any cell visible in this notebook
# (possibly left over from a deleted cell). Unless it is defined elsewhere, this
# raises NameError on a fresh kernel. TODO: define it or remove this cell.
# Prints the first 500 characters of the abstract and the length of the body.
print(f"Abstract: {long_example['abstract'][:500]}")
print(f"Body length: {len(long_example['body'])}")

In [None]:
# NOTE(review): `long_example` is not defined in any cell visible in this notebook
# (possibly left over from a deleted cell). Unless it is defined elsewhere, this
# raises NameError on a fresh kernel. TODO: define it or remove this cell.
# Dump the reconstructed paper to disk. The encoding is explicit because paper
# text contains non-ASCII characters and the platform default may not be UTF-8.
with open('catalog.txt', 'w', encoding='utf-8') as f:
    f.write(reconstruct_paper(long_example))