In [1]:
from database.database import Database

# Project database handle; `db.conn` is reused further down when contribution
# embeddings are inserted.
db = Database()
# Connectivity check: prints the connection parameters and the server version.
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


In [46]:
# Set up Ollama service
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field

# Structured-output schema handed to `with_structured_output`: callers read the
# extracted findings from the `.content` list of the parsed reply.
# NOTE(review): pydantic includes the class docstring and the Field description
# in the generated JSON schema, so both are presumably shown to the model —
# treat them as prompt text and edit with care.
class Findings(BaseModel):
    """
    A list of original findings made by the research paper
    """
    # One string per finding.
    content: list[str] = Field(description="List of findings made by the research paper")

llm = ChatOllama(model="deepseek-r1", temperature=0.0).with_structured_output(schema=Findings)

In [33]:
import pandas as pd
# The file is JSON Lines: one paper record per line.
research = pd.read_json('data/preprocessed/research.jsonl', lines=True)

# The first record serves as the running example in the cells below.
paper = research.iloc[0]
print("Paper Title:", paper['title'])
print("Paper Abstract:", paper['abstract'])

Paper Title: The morphology of extragalactic radio sources of high and low luminosity
Paper Abstract: The relative positions of the high and low brightness regions in the extragalactic sources in the 3 CR complete sample are found to be correlated with the luminosity of these sources.


### Set up chat messages

In [49]:
from langchain_core.messages import HumanMessage, SystemMessage

# System prompt for the findings-extraction call.
# NOTE(review): the prompt asks for a bare JSON array of strings, while `llm` is
# bound to the `Findings` schema (an object with a `content` list). Confirm the
# two instructions do not pull the model in different directions.
prompt = """You are an expert research assistant. Summarize the original scientific findings of the following research paper. 

## Task
- Write the findings as a JSON array of strings, each finding being one string element.
- Do not include acknowledgments or references: focus only on the original research contributions made in this paper
- Only write out the JSON array, do not include any other text or formatting.

Example output:
["Contribution 1", "Contribution 2", "Contribution 3"]
"""

# Built once and reused as the first message of every LLM invocation.
system_prompt = SystemMessage(content=prompt)

def full_paper_text(paper: pd.Series) -> HumanMessage:
    """
    Construct the full text of the paper from its title, authors, abstract, and body.

    Args:
        paper: A record exposing the 'title', 'author', 'abstract' and 'body' fields.

    Returns:
        A HumanMessage whose content is the title, the comma-separated author
        list, the abstract and the body, in that order.

    The author field is normalised before joining: a missing value (None, or
    the float NaN pandas uses for absent entries) produces an empty author line
    instead of a TypeError, and a single string is used as one author instead
    of being split into characters by ``str.join``.
    """
    authors = paper['author']
    if authors is None or isinstance(authors, float):
        # Missing value: there is nobody to list.
        author_names = []
    elif isinstance(authors, str):
        # A bare string is one author, not an iterable of characters.
        author_names = [authors]
    else:
        author_names = [str(name) for name in authors]

    text = f"{paper['title']}\n{', '.join(author_names)}\n\nAbstract: {paper['abstract']}\n\n{paper['body']}"
    return HumanMessage(content=text)

# Message for the example paper; reused by the extraction cell below.
full_text_message = full_paper_text(paper)
# Preview only the first 500 characters of the assembled message.
print(full_text_message.content[:500])

The morphology of extragalactic radio sources of high and low luminosity
Fanaroff, B. L., Riley, J. M.

Abstract: The relative positions of the high and low brightness regions in the extragalactic sources in the 3 CR complete sample are found to be correlated with the luminosity of these sources.

Mon. Not. R. astr. Soc. (i~y~.) 167, Short Communication, 3 IP-35P. THE MORPHOLOGY OF EXTRAGALACTIC RADIO SOURCES OF HIGH AND LOW LUMINOSITY B. L. Fanaroff and J. M. Riley (Received 1974 March 6) SUMMA


In [50]:
# Run the extraction on the example paper. `full_text_message` is the paper
# message built in the previous cell; it is the only paper message this
# notebook defines, so the cell works on a freshly restarted kernel.
response = llm.invoke([system_prompt, full_text_message])
print(f"LLM identified {len(response.content)} contributions.")
# Number the findings from 1 for display.
for i, contribution in enumerate(response.content, start=1):
    print(f"{i} - {contribution}")

LLM identified 5 contributions.
1 - The relative positions of the high and low brightness regions in the extragalactic sources in the 3CR complete sample are found to be correlated with the luminosity of these sources.
2 - In sources for which we have maps of adequate resolution, this is equivalent to having the 'hot spots' nearer to (Class I) or further away from (Class II) the central bright galaxy or quasar than the regions of diffuse radio emission.
3 - The results are presented in Table I whose arrangement is as follows: (1) The luminosity at 178 MHz in W Hz-' sr-1 (Hubble's constant = 50 km s~ Mpc'); the sources are arranged in order of their luminosity. 3 `P 32P B. L. Fanaroff and J. M. Riley TABLE I Beam Size size Pi78 Type Class (kpc) (kpc) 5X10^21 231 C I I 07 5X10^22 272.1 C(D) I 8 i*9 X 10^24 831 C(D) I 250 12 449 C(D) I? 270 12 300 390 12 254 100 330 204 270 400 ...
4 - Despite these differences in their relative positions, the low brightness regions in both classes show s

In [32]:
training_data = pd.read_json('data/dataset/split/small_train.jsonl', lines=True)

In [51]:
def get_record_from_doi(doi: str) -> pd.Series:
    """
    Get the record from the dataset based on the DOI.

    Args:
        doi: DOI to look up in the 'doi' column of the module-level `research` frame.

    Returns:
        The first row of `research` whose 'doi' equals `doi`.

    Raises:
        IndexError: If no row matches. This is the exception type a bare
            `.iloc[0]` on an empty selection raises, but the message names the
            DOI that could not be found.
    """
    matches = research[research['doi'] == doi]
    if matches.empty:
        raise IndexError(f"No record with DOI {doi!r} found in the research dataset")
    return matches.iloc[0]

def get_contributions(doi: str) -> list[str]:
    """
    Ask the LLM for the contributions of the paper identified by `doi`.

    The paper record is looked up by DOI, rendered into a single human message,
    and sent together with the shared system prompt; the `content` of the
    response is returned.
    """
    paper_message = full_paper_text(get_record_from_doi(doi))
    return llm.invoke([system_prompt, paper_message]).content

# Show the first training example in full.
example = training_data.iloc[0]
print(example)

# Extract the contributions of the first paper that example cites.
query_doi = example['citation_dois'][0]
contributions = get_contributions(query_doi)

print(f"Contributions for DOI {query_doi}:")
for i, contribution in enumerate(contributions, start=1):
    print(f"{i}: {contribution}")


source_doi                      10.1146/annurev.astro.46.060407.145222
sent_original        It is unclear whether the solution can be foun...
sent_no_cit          It is unclear whether the solution can be foun...
sent_idx                                                           541
citation_dois                       [10.1111/j.1365-2966.2009.14750.x]
pubdate                                                     2009-09-01
resolved_bibcodes                                [2009MNRAS.396..203S]
Name: 0, dtype: object
Contributions for DOI 10.1111/j.1365-2966.2009.14750.x:
1: The chemical evolution of galaxies is a complex interplay between stellar nucleosynthesis, galactic outflows, and the mixing of material within the galaxy. In this paper, we focus on the chemical evolution of the Milky Way, specifically examining how oxygen abundance ([O/Fe]) evolves over time due to these processes. The data presented in Figure 9 shows a bimodal distribution of [O/Fe], which suggests two distinct pop

In [54]:
# Every distinct, non-empty DOI cited anywhere in the training data.
training_target_dois = {
    doi
    for dois in training_data['citation_dois']
    for doi in dois
    if doi
}

from Embedders import get_embedder
import torch
from tqdm import tqdm

# Pick the best available accelerator, falling back to the CPU. MPS
# availability is queried through `torch.backends.mps`, the documented entry
# point for that check.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
embedder = get_embedder(model_name="BAAI/bge-large-en-v1.5", device=device, normalize=True)

# Maps each processed DOI to the contributions the LLM returned for it.
contribution_log = {}

def insert_contribution_embeddings(doi: str):
    """
    Extract the contributions of one paper, embed them, and store them.

    Args:
        doi: DOI of the paper to process.

    Side effects:
        Records the extracted contributions in `contribution_log[doi]` and
        inserts one row per contribution into the `contributions` table
        (embedding, text, doi). All rows for the DOI are committed together.
        If an insert or the commit fails, the transaction is rolled back and
        the error is re-raised, so the connection is not left in a failed
        transaction for the next DOI.

    When the LLM returns no contributions the empty result is still logged, but
    nothing is embedded or written.
    """
    contributions = get_contributions(doi)
    contribution_log[doi] = contributions
    if not contributions:
        # Nothing to embed or insert.
        return

    embeddings = embedder(contributions)
    try:
        with db.conn.cursor() as cursor:
            # zip() has no length, so give tqdm the total explicitly.
            for embedding, contribution in tqdm(
                zip(embeddings, contributions), total=len(contributions)
            ):
                # Convert to a plain Python list so the driver can adapt it.
                embedding = embedding.cpu().numpy().tolist()
                cursor.execute(
                    "INSERT INTO contributions (embedding, text, doi) VALUES (%s, %s, %s)",
                    (embedding, contribution, doi)
                )
            db.conn.commit()
    except Exception:
        # Discard the partial batch before propagating the error.
        db.conn.rollback()
        raise

In [None]:
# Process every cited DOI; each call extracts, embeds and commits one paper.
# NOTE(review): the rows are written with plain INSERTs, so re-running this cell
# appears to add duplicate rows for DOIs already processed — confirm against
# the table's constraints before a second run.
for doi in tqdm(training_target_dois, desc="Inserting contributions into database"):
    insert_contribution_embeddings(doi)

Inserting contributions into database:   0%|          | 0/14 [00:00<?, ?it/s]