In [None]:
import pandas as pd

# Load logs/long_papers.jsonl (JSON-lines format) into a DataFrame.
df = pd.read_json("logs/long_papers.jsonl", lines=True)
print(f"Length of DataFrame: {len(df)}")
# Collapse the "doi" column into a set so that a DOI appearing on several rows
# is only counted once.
dois_long = set(df["doi"].tolist())
print(f"Number of unique DOIs to process: {len(dois_long)}")

Length of DataFrame: 416
Number of unique DOIs to process: 144


In [4]:
# Get the unique DOIs from the database `contributions` table
from database.database import Database

# Create the project Database handle and call its connection test
# (in the recorded run this printed the connection details and server version).
db = Database()
db.test_connection()

# The query selects a single column, so row[0] is the DOI of each result row.
results = db.query("SELECT DISTINCT doi FROM contributions")
existing_dois = {row[0] for row in results}
print(f"Number of unique DOIs in contributions: {len(existing_dois)}")

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)
Number of unique DOIs in contributions: 4864


In [5]:
# Load data/preprocessed/research.jsonl (JSON-lines format) into a DataFrame;
# later cells look papers up in it by their "doi" column.
research = pd.read_json("data/preprocessed/research.jsonl", lines=True)


def reconstruct_paper(example: pd.Series) -> str:
    """Assemble a single text document from one paper record.

    Args:
        example: Record supporting key lookup for "title", "abstract" and "body".

    Returns:
        The title, the abstract (prefixed with "Abstract: ") and the body,
        in that order, separated from each other by one blank line.
    """
    title_section = f"{example['title']}"
    abstract_section = f"Abstract: {example['abstract']}"
    body_section = f"{example['body']}"
    # Two newlines between sections leave exactly one empty line between them.
    return "\n\n".join((title_section, abstract_section, body_section))

In [7]:
# Pick one paper out of `research` by DOI and rebuild its full text as a sample.
long_paper = research[research["doi"] == "10.1088/0004-637X/794/2/156"].iloc[0]
long_paper_text = reconstruct_paper(long_paper)
print(f"Reconstructed paper text:\n{long_paper_text[:1000]}")

# Write explicitly as UTF-8: the paper text contains non-ASCII characters
# (e.g. λ, ⊙, ±, °) that a non-UTF-8 platform default encoding may fail to encode.
with open("long_Paper", "w", encoding="utf-8") as f:
    f.write(long_paper_text)

Reconstructed paper text:
Evidence for Ubiquitous Collimated Galactic-scale Outflows along the Star-forming Sequence at z ~ 0.5

Abstract: We analyze Mg II λλ2796, 2803 and Fe II λλ2586, 2600 absorption profiles in individual spectra of 105 galaxies at 0.3 &lt; z &lt; 1.4. The galaxies, drawn from redshift surveys of the GOODS fields and the Extended Groth Strip, sample the range in star formation rates (SFRs) occupied by the star-forming sequence with stellar masses log M <SUB>*</SUB>/M <SUB>⊙</SUB> &gt;~ 9.6 down to SFR gsim 2 M <SUB>⊙</SUB> yr<SUP>-1</SUP> at 0.3 &lt; z &lt; 0.7. Using the Doppler shifts of Mg II and Fe II absorption as tracers of cool gas kinematics, we detect large-scale winds in 66 ± 5% of the galaxies. Hubble Space Telescope Advanced Camera for Surveys imaging and our spectral analysis indicate that the outflow detection rate depends primarily on galaxy orientation: winds are detected in ~89% of galaxies having inclinations (i) &lt;30° (face-on), while the wind 

In [None]:
import logging

# Configure logging so that records go to a file under logs/.
# NOTE(review): "deeseek" in the file name looks like a typo for "deepseek"; it is
# left unchanged here because it is the actual log path — confirm before renaming.
logging.basicConfig(
    filename="logs/deeseek_long.log",
    filemode="w",  # write mode rather than append
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

In [None]:
# Set up the DeepSeek client; DeepSeek exposes an OpenAI-compatible API, so the OpenAI SDK is used
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
from llm.models import Findings

# Presumably loads variables from a local .env file into the environment;
# DEEPSEEK_API_KEY is read from os.environ further down in this cell.
load_dotenv()
# Presumably a character count (195000 / 65k is about 3 characters per token).
# NOTE(review): this cap is not referenced in the visible cells — confirm where it is applied.
MAX_PAPER_LEN = 195000  # ~65k tokens, leaving ~500 tokens for the response


def deepseek_client():
    """Build an OpenAI-SDK client pointed at the DeepSeek API.

    The API key is read from the ``DEEPSEEK_API_KEY`` environment variable.

    Returns:
        An ``OpenAI`` client configured with the DeepSeek base URL.

    Raises:
        RuntimeError: If ``DEEPSEEK_API_KEY`` is missing or blank.
    """
    # Explicit check instead of `assert`: asserts are stripped under `python -O`,
    # and a key that is present but empty would pass a bare membership test.
    api_key = os.environ.get("DEEPSEEK_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError("DEEPSEEK_API_KEY must be set in environment variables")
    return OpenAI(
        api_key=api_key,
        base_url="https://api.deepseek.com",
    )


# Module-level client used by get_deepseek_response.
client = deepseek_client()

# Read the system prompt once. Decode explicitly as UTF-8 so the prompt text
# does not depend on the platform's default encoding.
with open("llm/prompts/original_contributions.txt", "r", encoding="utf-8") as f:
    SYSTEM_PROMPT = f.read()


def get_deepseek_response(paper: str) -> str:
    """Send one paper to the ``deepseek-chat`` model and return its raw reply text.

    Args:
        paper: Paper text, sent as the user message after ``SYSTEM_PROMPT``.

    Returns:
        The message content of the first choice with surrounding whitespace
        stripped. JSON mode is requested, but the text is not parsed here.

    Raises:
        ValueError: If the reply contains no choices or no message content.
        Exception: Any error raised by the SDK call propagates unchanged.
    """
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": paper},
        ],
        response_format={"type": "json_object"},
        stream=False,
    )
    # The message content can be None; fail with a clear message instead of an
    # AttributeError from calling .strip() on None.
    if not response.choices or response.choices[0].message.content is None:
        raise ValueError("DeepSeek returned no message content")
    return response.choices[0].message.content.strip()


def get_contributions_from_paper(record: pd.Series) -> list[str]:
    """Extract the findings for one paper via the DeepSeek API.

    Args:
        record: Paper record providing "doi" plus the "title", "abstract" and
            "body" fields consumed by ``reconstruct_paper``.

    Returns:
        The ``findings`` attribute of the validated ``Findings`` model, or an
        empty list when the API call fails or the reply cannot be parsed or
        validated. Both kinds of failure are logged rather than raised.
    """
    paper = reconstruct_paper(record)

    # Step 1: call the API. A request failure is logged as an API error so it is
    # not confused with a malformed reply in the log file.
    try:
        json_response = get_deepseek_response(paper)
    except Exception as e:
        logging.error(f"Error calling DeepSeek API for DOI {record['doi']}: {e}")
        return []

    # Step 2: parse the reply and validate it against the Findings model.
    try:
        data = json.loads(json_response)
        findings_obj = Findings.model_validate(data)
        return findings_obj.findings
    except Exception as e:
        logging.error(f"Error parsing JSON for DOI {record['doi']}: {e}")
        print(f"JSON parse error for DOI {record['doi']}")
        return []