In [1]:
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_deepseek import ChatDeepSeek
from langchain_ollama import ChatOllama
import os
import pandas as pd
from pydantic import BaseModel, Field

from database.database import Database

# Pull variables from a local .env file (if present) into the process environment,
# then fail fast when the DeepSeek key is missing.
load_dotenv()
assert os.getenv("DEEPSEEK_API_KEY"), "Please set the DEEPSEEK_API_KEY environment variable."

# Project database handle; later cells call `db.insert_document_expansions` and use `db.conn`.
db = Database()
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


### Load data

In [2]:
# `research`: one row per paper; later cells read its 'doi', 'title', 'author',
# 'abstract' and 'body' fields.
research = pd.read_json('data/preprocessed/research.jsonl', lines=True)
# `training_data`: later cells read its 'citation_dois' column, iterating each row's value as a sequence of DOIs.
training_data = pd.read_json('data/dataset/split/small_train.jsonl', lines=True)

### Set up LLM service

In [7]:
# Set up LLM service
class Findings(BaseModel):
    """
    A list of original findings made by the research paper
    """
    # Schema passed to `with_structured_output(...)` when building `llm`.
    # NOTE(review): the docstring and Field description may be included in the schema
    # pydantic generates, so treat their wording as part of the model contract — confirm.
    findings: list[str] = Field(description="List of findings made by the research paper")

# llm = ChatOllama(model="deepseek-r1", temperature=0.0).with_structured_output(schema=Findings)
# `include_raw=True` makes `llm.invoke(...)` return a dict with "raw", "parsed" and
# "parsing_error" keys instead of a bare `Findings` instance.
llm = ChatDeepSeek(
    model="deepseek-chat", temperature=0.0, max_retries=2
).with_structured_output(Findings, method="json_mode", include_raw=True)

# With method="json_mode" the prompt itself has to spell out the output format.
# The reply is parsed into `Findings`, so it must be a JSON object with a "findings"
# key; a bare JSON array cannot be validated against that schema.
prompt = """You are an expert research assistant. Summarize the original scientific findings of the following research paper. Then write them out as a JSON object.

## Task
- Write the findings as a JSON object with a single key "findings" whose value is an array of strings, each finding being one string element.
- Do not include acknowledgments or references: focus only on the original research contributions made in this paper
- Only write out the JSON object, do not include any other text or formatting.

Example output:
{"findings": ["Contribution 1", "Contribution 2", "Contribution 3"]}
"""

# Reusable system message carrying the instructions above.
system_prompt = SystemMessage(content=prompt)


def full_text(record: pd.Series) -> HumanMessage:
    """
    Assemble a paper record into a single human message.

    The message content is laid out as: the title, the authors joined with
    ", " on the next line, a blank line, the abstract prefixed with
    "Abstract: ", another blank line, and finally the body.

    Args:
        record: Row with 'title', 'author' (iterable of strings), 'abstract'
            and 'body' entries.

    Returns:
        A HumanMessage whose content is the assembled text.
    """
    title = record['title']
    authors = ', '.join(record['author'])
    header = f"{title}\n{authors}"
    abstract = f"Abstract: {record['abstract']}"
    body = f"{record['body']}"
    # Sections are separated by exactly one blank line.
    return HumanMessage(content="\n\n".join((header, abstract, body)))

In [None]:
# Trial run on the first paper of the dataset.
paper = full_text(research.iloc[0])
# Send (role, text) tuples built from the system prompt and the paper message.
messages = [
    (role, message.content)
    for role, message in (("system", system_prompt), ("human", paper))
]
response = llm.invoke(messages)
print(response)

In [None]:
# Show the 'raw' entry of the structured-output result (`llm` is built with include_raw=True).
response['raw']

In [4]:
def get_record_from_doi(doi: str) -> pd.Series:
    """
    Look up a paper in the `research` dataset by DOI.

    Args:
        doi: DOI to match exactly against the 'doi' column.

    Returns:
        The first row of `research` whose 'doi' equals `doi`.

    Raises:
        ValueError: If no row carries that DOI.
    """
    matches = research.loc[research['doi'] == doi]
    try:
        return matches.iloc[0]
    except IndexError:
        # An empty selection means the DOI is absent from the dataset.
        raise ValueError(f"DOI '{doi}' not found in the dataset.")

def get_contributions(doi: str) -> list[str]:
    """
    Get the contributions for a given DOI using the LLM.

    Args:
        doi: DOI of the paper to summarize; must exist in the dataset.

    Returns:
        The list of finding strings extracted by the LLM.

    Raises:
        ValueError: If the DOI is unknown (raised by `get_record_from_doi`), if the
            LLM output could not be parsed, or if no list of findings is present.
    """
    record = get_record_from_doi(doi)
    print(f"Found record: {record['title']}")
    record_full_text = full_text(record)
    # Send the assembled message, not the `full_text` function object.
    response = llm.invoke([system_prompt, record_full_text])
    print("Got LLM response")

    # `llm` is built with include_raw=True, so the result is a dict holding the raw
    # message, the parsed object and any parsing error rather than a message object.
    parsed = response
    if isinstance(response, dict) and "parsed" in response:
        parsing_error = response.get("parsing_error")
        if parsing_error is not None:
            raise ValueError(
                f"LLM response is not in the expected format: {parsing_error}"
            )
        parsed = response["parsed"]

    if isinstance(parsed, dict):
        findings = parsed.get("findings")
    else:
        findings = getattr(parsed, "findings", None)
        if findings is None:
            # Fall back to the `.content` list that the previous check accepted.
            findings = getattr(parsed, "content", None)

    if not isinstance(findings, list):
        raise ValueError("LLM response is not in the expected format.")
    return findings

# Use the first cited DOI of the first training row as a sample query DOI.
example = training_data.iloc[0]
query_doi = example['citation_dois'][0]

# contributions = get_contributions(query_doi)
# print(f"Contributions for DOI {query_doi}:")
# for i, contribution in enumerate(contributions):
#     print(f"{i+1}: {contribution}")

# print("Bad example:")
# try:
#     contributions = get_contributions(doi="foobar")
# except Exception as e:
#     print(f"Error: {e}")


In [5]:
# Get all unique DOIs in the training data
# `dict.fromkeys` drops duplicates while keeping first-seen order, so the list (and any
# index into it) is identical on every kernel restart. A set of strings gives no such
# guarantee because string hashing is randomized per process.
training_target_dois = list(
    dict.fromkeys(doi for dois in training_data["citation_dois"] for doi in dois if doi)
)

In [None]:
# One HumanMessage per target DOI, in the same order as `training_target_dois`.
full_texts = [
    full_text(record)
    for record in map(get_record_from_doi, training_target_dois)
]


10.1086/309975


In [20]:
# Spot check: print one target DOI followed by the full text assembled for it.
# `full_texts` was built from `training_target_dois` in order, so the same index applies to both.
i = 6
print(training_target_dois[i])
print(full_texts[i].content)

10.48550/arXiv.2203.02041
Evolution of the Stars and Gas in Galaxies
Tinsley, B. M.

Abstract: 1. Overview 1.1 Stellar populations and chemical compositions of galaxies 1.2 Galaxy formation and evolution 1.3 Plan of this review 2. The Formation and Evolution of Stars 2.1 Basic physical properties of stars 2.2 The initial mass function 2.2.1 The local IMF 2.2.2 The IMF at other times and places 2.3 Rates of star formation 2.3.1 The local SFR 2.3.2 The SFR elsewhere 2.3.3 Factors affecting the SFR 2.4 Stellar evolution beyond the main sequence 2.4.1 Stars near solar mass 2.4.2 Stars of 1-4 M? 2.4.3 Stars above 8 M? 2.4.4 Stars of intermediate mass 2.4.5 Effects of initial composition 3. Aims and Methods of Chemical Evolution 3.1 Basic assumptions and equations 3.2 Analytical approximations 3.2.1 A closed system, initially unenriched gas 3.2.2 A system with infall balanced by star formation 3.2.3 Generalities 3.3 Numerical models 4. Chemical Evolution in the Solar Neighborhood 4.1 Outline

In [None]:
# Findings keyed by DOI: each value is a list of finding statements for that paper.
# NOTE(review): `findings` is not read by any other visible cell — presumably saved
# LLM output kept for reference; confirm provenance.
findings = {
    "10.1086/309975": [
        "The comoving luminosity density of the universe increases with redshift in all three observed wave bands (2800 A, 4400 A, and 1 mu m).",
        "For a (q0 = 0.5, Omega = 1.0) cosmological model, the comoving luminosity density increases as (1 + z)^2.1 +/- 0.5 at 1 mu m, (1 + z)^2.7 +/- 0.5 at 4400 A, and (1 + z)^3.9 +/- 0.75 at 2800 A.",
        "The exponents for luminosity density evolution are reduced by 0.43 and 1.12 for (0.05, 0.1) and (-0.85, 0.1) cosmological models, respectively.",
        "The observed luminosity density evolution can be modeled by an actively evolving stellar population with a Salpeter initial mass function (IMF) extending to 125 M⊙ and a star formation rate declining as tau^-2.5.",
        "A Scalo IMF extending to the same mass limit produces too many long-lived low-mass stars and does not fit the observed data as well.",
        "The rapid evolution of the star formation rate and comoving luminosity density agrees with conclusions from analyses of the evolving metallicity of the universe.",
        "The physical luminosity density at short wavelengths has likely declined by 2 orders of magnitude since z ~ 1.",
        "The global star formation rate at z ~ 1 was 15 times higher for (0.5, 1.0), 11 times higher for (0.05, 0.1), and 7 times higher for (-0.85, 0.1) cosmological models, with uncertainties of about 0.22 in the log.",
        "The physical luminosity density at 2800 Å has declined by a factor of 60–170 since z ~ 1 for a (0.5, 1.0) model, with a factor of 2 reduction in the extreme (-0.85, 0.1) model.",
    ],
    "10.1086/300959": [
        "The study finds a shallower decline in the universal rest-frame ultraviolet (UV) luminosity density from z=1 to the present, approximately (1+z)^1.5 for q_0=0.5, contradicting the steeper evolution reported by Lilly et al.",
        "The results suggest that galaxy formation continues smoothly to the present time rather than peaking at z=1, with much of the current formation occurring in smaller galaxies.",
        "The UV luminosity density at low redshifts is considerably higher than previously estimated by Lilly et al., aligning more closely with the local UV luminosity density determined by Treyer et al.",
        "The analysis uses a deep, multi-color spectroscopic redshift survey to avoid extrapolation issues present in previous studies, providing more accurate rest-frame UV luminosity measurements.",
        "The study compares luminosity functions at different rest-frame wavelengths (1700 Å, 2250 Å, 2750 Å) and finds only weak sensitivity to wavelength choice, supporting the robustness of the results.",
        "The 2500 Å rest-frame luminosity density evolution is well-described by a (1+z)^1.5 power law, indicating a slower decline in star formation activity than previously thought.",
        "Comparisons with the Treyer et al. local UV sample show consistent luminosity densities, further supporting the shallower evolution slope.",
        "The study highlights discrepancies with the Lilly et al. CFRS analysis, attributing them to methodological differences and extrapolation uncertainties in the CFRS data.",
        "The rest-frame red luminosity (8000 Å) shows minimal evolution from z=0.5 to z=1, contrasting with the more significant evolution observed in the UV.",
        "The findings imply that the bulk of star formation has occurred at more recent times, challenging the notion that the peak epoch of galaxy formation was at z=1.",
    ],
}

In [None]:

from Embedders import get_embedder
import torch
from tqdm import tqdm

# Name of the embedding model handed to the database layer for each insert.
embedder_name = "BAAI/bge-large-en-v1.5"
# device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
# embedder = get_embedder(model_name="BAAI/bge-large-en-v1.5", device=device, normalize=True)

# Only the first target DOI is processed here (`[:1]`).
# `target_doi` and `contributions` stay in the kernel namespace and are reused by later cells.
for target_doi in training_target_dois[:1]:
    contributions = get_contributions(target_doi)
    print(f"{len(contributions)} contributions for DOI {target_doi}:")
    # NOTE(review): presumably embeds each contribution with `embedder_name` and stores it
    # against the DOI — confirm against Database.insert_document_expansions.
    db.insert_document_expansions(
        expansions=contributions,
        doi=target_doi,
        embedder_name=embedder_name,
        normalize=True
    )


In [None]:
# Number of items in whatever `contributions` currently holds in the kernel.
print(len(contributions))

In [None]:
# Standalone insert using the current `contributions`, `target_doi` and `embedder_name` globals.
# NOTE(review): same call as the loop cell above; running both may store the same
# expansions twice — confirm how the database layer handles repeats.
db.insert_document_expansions(
    expansions=contributions, doi=target_doi, embedder_name=embedder_name, normalize=True
)

In [None]:
# Trial run on the first target DOI: report the body length, then list the
# contributions returned by `get_contributions` (one LLM call), numbered from 1.
trial_doi = list(training_target_dois)[0]
trial_record = get_record_from_doi(trial_doi)
print(f"Length: {len(trial_record['body'])}")
contributions = get_contributions(trial_doi)
print(f"Contributions for DOI {trial_doi}:")
for i, contribution in enumerate(contributions):
    print(f"{i+1}: {contribution}")

In [None]:
# Display the DOI left in `target_doi` by the insert loop.
target_doi

In [None]:
# Insert one row per (embedding, contribution) pair into `contributions`, all tagged
# with `trial_doi`, and commit once after the loop.
# NOTE(review): `embeddings` is not assigned in any visible cell (the embedder setup is
# commented out), so this cell relies on state from elsewhere — confirm before running.
with db.conn.cursor() as cursor:
    for embedding, contribution in tqdm(zip(embeddings, contributions)):
        # embedding = embedding.cpu().numpy().tolist()
        # Values are bound through placeholders rather than formatted into the SQL string.
        cursor.execute(
            "INSERT INTO contributions (embedding, text, doi) VALUES (%s, %s, %s)",
            (embedding, contribution, trial_doi),
        )
    db.conn.commit()

In [None]:
# Re-run the contribution extraction for `trial_doi` (another LLM call) and list the
# results, numbered from 1. Overwrites the `contributions` global.
print(f"Working on DOI: {trial_doi}")

contributions = get_contributions(trial_doi)
print(f"Contributions for DOI {trial_doi}:")
for i, contribution in enumerate(contributions):
    print(f"{i+1}: {contribution}")

In [None]:
# for doi in tqdm(training_target_dois, desc="Inserting contributions into database"):
#     insert_contribution_embeddings(doi)