In [7]:
from langchain_ollama import ChatOllama
from langchain_core.messages import SystemMessage, HumanMessage
from llm.models import Findings
import logging

# Send all log records (DEBUG and up) to a file that is truncated on each
# fresh configuration (filemode="w").
logging.basicConfig(
    filename="logs/long_paper_processing.log",
    filemode="w",
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# Single source of truth for the Ollama settings, so the structured-output
# model and the token-counting model cannot drift apart.
# NOTE(review): num_ctx=2048 is far smaller than a "long paper" prompt; inputs
# beyond the context window are presumably truncated by the server. Confirm
# and raise this if the hardware allows.
# NOTE(review): `max_tokens` may not be a recognised ChatOllama option (the
# documented output cap appears to be `num_predict`). Confirm before relying
# on it to limit generation length.
OLLAMA_KWARGS = dict(
    model="deepseek-r1:70b",
    reasoning=True,
    max_tokens=2048,
    num_ctx=2048,
    temperature=0.0,
)

# Plain chat model: used only for estimating token lengths.
model = ChatOllama(**OLLAMA_KWARGS)

# Same model and settings, constrained to return a `Findings` object.
llm = model.with_structured_output(Findings, method="json_schema")

# Load the system prompt that instructs the model how to report findings.
with open("llm/prompts/original_contributions.txt", "r", encoding="utf-8") as file:
    sys_prompt = file.read()
system_message = SystemMessage(content=sys_prompt)
print(system_message.content[:1000])

# NOTE(review): get_num_tokens is presumably an estimate from a generic
# tokenizer rather than the model's own; treat the count as approximate.
print(f"System message token length: {model.get_num_tokens(system_message.content)}")

You are an expert research assistant. Summarize the original scientific findings of the following research paper. Then write them out as a list of strings.

## Task
- Write the findings as a JSON array of strings, each finding being one string element.
- Do not include acknowledgments or references: focus only on the original research contributions made in this paper
- Only write out the JSON array, do not include any other text or formatting.

Example output:
{
  findings: ["Contribution 1", "Contribution 2", "Contribution 3"]
}

## Paper:


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

System message token length: 125


1. Set up ChatOllama with system prompt to accept a paper and return the findings object
1. Set up db client
1. Read in the long papers dataset
1. Read in the research dataset, filter to long papers only
1. Get the DOIs from the contributions table and filter out any papers in the dataset that are already in the db
1. For every remaining DOI in the dataset, send the paper to ChatOllama to get its findings

In [8]:
# Get the unique dois from the database contribution table
from database.database import Database

# Open the database handle and run its connection check.
db = Database()
db.test_connection()

# Each row's first column is the DOI; gather them into a set for fast
# membership tests later on.
results = db.query("SELECT DISTINCT doi FROM contributions")
existing_dois = set(record[0] for record in results)
print(
    f"Number of unique DOIs in contributions: {len(existing_dois)}"
)

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)
Number of unique DOIs in contributions: 4864


In [9]:
import pandas as pd

# Read both JSON Lines inputs (one record per line): the full preprocessed
# research corpus first, then the list of long papers.
research, long_papers_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "data/preprocessed/research.jsonl",
        "logs/long_papers.jsonl",
    )
)

def reconstruct_paper(example: pd.Series) -> str:
    """Rebuild a paper's full text from its title, abstract and body.

    Sections are separated by blank lines and the abstract is prefixed with
    ``"Abstract: "``. A section whose value is missing (``None``, ``NaN`` or
    ``pd.NA``) or empty is omitted, so the literal text "None"/"nan" never
    ends up in the prompt.

    Args:
        example: Record supporting ``example[key]`` lookup for the keys
            ``"title"``, ``"abstract"`` and ``"body"``.

    Returns:
        The reconstructed text; identical to
        ``f"{title}\\n\\nAbstract: {abstract}\\n\\n{body}"`` when all three
        fields are present and non-empty.
    """

    def _field_text(key: str) -> str:
        value = example[key]
        # NaN is the only float that is not equal to itself.
        is_nan = isinstance(value, float) and value != value
        if value is None or value is pd.NA or is_nan:
            return ""
        return str(value)

    title = _field_text("title")
    abstract = _field_text("abstract")
    body = _field_text("body")

    sections = [title, f"Abstract: {abstract}" if abstract else "", body]
    return "\n\n".join(section for section in sections if section)

# Sanity check: every long paper must also exist in the research corpus,
# otherwise the per-DOI lookup later on would come up empty.
long_paper_dois = set(long_papers_df.doi)
research_dois = set(research.doi)
assert long_paper_dois <= research_dois, "Long papers DOIs must be a subset of research DOIs"

In [None]:
import json
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv) so that settings such
# as OPENAI_API_KEY can be read from os.environ.
load_dotenv()

# Model to request per call, e.g.
# client.chat.completions.create(model=OPENAI_MODEL, ...).
OPENAI_MODEL = "gpt-4-1106-preview"


def openai_client():
    """Build an OpenAI API client from the ``OPENAI_API_KEY`` environment variable.

    Returns:
        An ``OpenAI`` client configured with up to 3 retries and a
        60-second request timeout.

    Raises:
        AssertionError: If ``OPENAI_API_KEY`` is not present in the
            environment.
    """
    assert "OPENAI_API_KEY" in os.environ, "OPENAI_API_KEY must be set in .env file"
    # The model is selected per request, not on the client: the constructor
    # has no `model` parameter, so the name lives in OPENAI_MODEL instead.
    return OpenAI(
        api_key=os.environ["OPENAI_API_KEY"],
        max_retries=3,
        timeout=60,
    )

In [14]:
# Send every long paper that is not yet in the database to the LLM and append
# its findings to long_findings.jsonl. Failures are logged and recorded in
# logs/long_paper_llm_errors.jsonl so the run can continue.
import json

for i, row in long_papers_df.iterrows():

    doi = row["doi"]
    if doi in existing_dois:
        # Contributions for this DOI are already stored in the database.
        continue
    print(f"Processing DOI: {doi}")

    # Get paper from the research df
    matches = research[research["doi"] == doi]
    if matches.empty:
        logging.error(f"DOI {doi} not found in research dataset, skipping.")
        continue
    paper = matches.iloc[0]
    paper_text = reconstruct_paper(paper)
    print(f"Paper {i + 1} length: {len(paper_text)} characters, {model.get_num_tokens(paper_text)} tokens")

    # Reset per paper: if invoke() raises, the handler below must not see an
    # unbound name or the previous paper's response.
    response = None
    try:
        response = llm.invoke(
            [
                system_message,
                HumanMessage(content=paper_text),
            ]
        )
        with open("long_findings.jsonl", "a", encoding="utf-8") as file:
            file.write(json.dumps({"doi": doi, "findings": response.findings}) + "\n")
    except Exception as e:
        # Log the error with its traceback and save whatever response we got
        # for later inspection.
        logging.exception(f"Error processing DOI {doi} (see long_paper_llm_errors.jsonl): {e}")
        with open("logs/long_paper_llm_errors.jsonl", "a", encoding="utf-8") as file:
            # default=str keeps the error record writable even when the
            # response is not JSON-serializable.
            file.write(
                json.dumps(
                    {
                        "doi": doi,
                        "content": json.dumps(response, default=str),
                        "error": str(e),
                    }
                )
                + "\n"
            )



Processing DOI: 10.1088/0004-637X/794/2/156
Paper 1 length: 211187 characters, 50206 tokens
Processing DOI: 10.1088/0004-637X/706/2/1364
Paper 2 length: 259750 characters, 61079 tokens
Processing DOI: 10.1086/191823
Paper 3 length: 307033 characters, 138225 tokens
Processing DOI: 10.1111/j.1365-2966.2008.12991.x
Paper 4 length: 257902 characters, 58074 tokens
Processing DOI: 10.1111/j.1365-2966.2012.20937.x
Paper 5 length: 296922 characters, 67986 tokens
Processing DOI: 10.1111/j.1365-2966.2007.12671.x
Paper 6 length: 428161 characters, 102382 tokens
Processing DOI: 10.12942/lrr-2009-2
Paper 7 length: 338218 characters, 70508 tokens
Processing DOI: 10.1103/RevModPhys.82.3121
Paper 8 length: 287782 characters, 63530 tokens
Processing DOI: 10.1086/192235
Paper 9 length: 203303 characters, 68620 tokens
Processing DOI: 10.1051/0004-6361/202243782
Paper 10 length: 229744 characters, 54499 tokens
Processing DOI: 10.1111/j.1365-2966.2011.20043.x
Paper 11 length: 327951 characters, 81222 token

KeyboardInterrupt: 