In [1]:
import pandas as pd

# Load the citation-sentence dataset (JSON Lines, one record per line) into a DataFrame.
df = pd.read_json('data/dataset/nontrivial_llm.jsonl', lines=True)
print(f"Length of DataFrame: {len(df)}")
# Trailing expression: the notebook renders the first rows as this cell's output.
df.head()

Length of DataFrame: 14995


Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes
0,10.1016/j.newar.2024.101694,"Subsequently, Andrews et al. (2017) selected a...","Subsequently, [REF] selected a wide binary can...",58,[10.1093/mnras/stx2000],2024-06-01,[2017MNRAS.472..675A]
1,10.1016/j.newar.2024.101694,Andrews et al. (2017) investigated how the sep...,[REF] investigated how the separation of their...,61,[10.1093/mnras/stx2000],2024-06-01,[2017MNRAS.472..675A]
2,10.1016/j.newar.2024.101694,This led Andrews et al. (2017) to conclude tha...,This led [REF] to conclude that most of the pa...,64,[10.1093/mnras/stx2000],2024-06-01,[2017MNRAS.472..675A]
3,10.1016/j.newar.2024.101694,It may also owe in part to the mass ratio dist...,It may also owe in part to the mass ratio dist...,90,[10.1093/mnras/stz2480],2024-06-01,[2019MNRAS.489.5822E]
4,10.1016/j.newar.2024.101694,Hwang et al. (2022c) used a related method to ...,[REF] used a related method to study the eccen...,110,[10.3847/2041-8213/ac7c70],2024-06-01,[2022ApJ...933L..32H]


In [2]:
from database.database import Database

# Open the project database and run its connection self-check.
db = Database()
db.test_connection()

# DOIs that already have at least one row in the contributions table.
results = db.query("SELECT DISTINCT doi FROM contributions")
existing_dois = set(row[0] for row in results)
print(f"Number of unique DOIs in contributions: {len(existing_dois)}")

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)
Number of unique DOIs in contributions: 9840


In [3]:
# Paper records (JSON Lines). Other cells read the doi, title, abstract, body and pubdate columns.
research = pd.read_json('data/preprocessed/research.jsonl', lines=True)

def reconstruct_paper(example: pd.Series) -> str:
    """Render one paper record as a single text blob.

    Args:
        example: Row (or any mapping) providing 'title', 'abstract' and 'body'.

    Returns:
        The title, the abstract prefixed with "Abstract: ", and the body,
        separated from each other by a blank line.
    """
    sections = (
        f"{example['title']}",
        f"Abstract: {example['abstract']}",
        f"{example['body']}",
    )
    return "\n\n".join(sections)

In [4]:
# Spot check: rebuild one known paper and print it to eyeball the reconstructed layout.
is_sample = research["doi"] == "10.1088/0067-0049/182/1/216"
paper = research[is_sample].iloc[0]
print(reconstruct_paper(paper))

Structure and Formation of Elliptical and Spheroidal Galaxies

Abstract: New surface photometry of all known elliptical galaxies in the Virgo cluster is combined with published data to derive composite profiles of brightness, ellipticity, position angle, isophote shape, and color over large radius ranges. These provide enough leverage to show that Sérsic log I vprop r <SUP>1/n </SUP> functions fit the brightness profiles I(r) of nearly all ellipticals remarkably well over large dynamic ranges. Therefore, we can confidently identify departures from these profiles that are diagnostic of galaxy formation. Two kinds of departures are seen at small radii. All 10 of our ellipticals with total absolute magnitudes M<SUB>VT</SUB> &lt;= -21.66 have cuspy cores—"missing light"—at small radii. Cores are well known and naturally scoured by binary black holes (BHs) formed in dissipationless ("dry") mergers. All 17 ellipticals with -21.54 &lt;= M<SUB>VT</SUB> &lt;= -15.53 do not have cores. We find a

In [5]:
import logging
from pathlib import Path

# logging.basicConfig opens the log file immediately and fails with FileNotFoundError when
# the directory is missing (e.g. on a fresh checkout), so make sure it exists first.
# get_contributions_from_paper also appends to logs/long_papers.jsonl.
Path("logs").mkdir(parents=True, exist_ok=True)

# filemode="w" starts a fresh log file when this configuration takes effect.
logging.basicConfig(
    filename="logs/deepseek.log",
    filemode="w",
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

In [6]:
MAX_PAPER_LEN = 250000  # ~65k tokens, leaving ~1000 tokens for the response

# Length of each reconstructed paper (title + abstract + body), keyed by DOI and computed
# once. The previous version re-scanned `research` and rebuilt the full paper text for every
# cited DOI of every row. drop_duplicates keeps the first record per DOI, matching the
# `.iloc[0]` choice used elsewhere in this notebook.
paper_lengths = {
    paper_row["doi"]: len(reconstruct_paper(paper_row))
    for _, paper_row in research.drop_duplicates(subset="doi").iterrows()
}


def _cited_papers_fit(citation_dois) -> bool:
    """Return True when every cited paper reconstructs to at most MAX_PAPER_LEN characters.

    A cited DOI that is absent from `research` raises KeyError; the original per-row lookup
    also failed (with IndexError) in that situation.
    """
    return all(paper_lengths[doi] <= MAX_PAPER_LEN for doi in citation_dois)


# Remove rows from df where any cited paper is longer than MAX_PAPER_LEN once reconstructed.
# astype(bool) keeps the mask boolean even when df is empty.
df = df[df["citation_dois"].apply(_cited_papers_fit).astype(bool)]
print(f"Length of DataFrame (papers under max length): {len(df)}")

Length of DataFrame (papers under max length): 14681


In [7]:
# Persist the length-filtered dataset as JSON Lines (one record per line).
df.to_json('data/dataset/nontrivial_filtered.jsonl', orient='records', lines=True)

In [8]:
# Define the deepseek api, which copies the openai api
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
from llm.models import Findings

# Populate os.environ from a .env file, if one is found, so that DEEPSEEK_API_KEY
# (read by deepseek_client below) does not have to be exported in the shell.
load_dotenv()

def deepseek_client():
    """Build an OpenAI-compatible client that talks to the DeepSeek endpoint.

    Returns:
        An `OpenAI` client configured with the DEEPSEEK_API_KEY environment
        variable and the DeepSeek base URL.

    Raises:
        AssertionError: If DEEPSEEK_API_KEY is not present in the environment.
    """
    assert "DEEPSEEK_API_KEY" in os.environ, "DEEPSEEK_API_KEY must be set in environment variables"
    api_key = os.environ["DEEPSEEK_API_KEY"]
    return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

client = deepseek_client()

# Read the system prompt with an explicit encoding so any non-ASCII characters decode the
# same way on every platform instead of depending on the locale default.
# NOTE(review): assumes the prompt file is stored as UTF-8 — confirm.
with open('llm/prompts/original_contributions.txt', 'r', encoding='utf-8') as f:
    SYSTEM_PROMPT = f.read()

def get_deepseek_response(paper: str):
    """Send one paper to the DeepSeek chat endpoint and return the reply text.

    Args:
        paper: Full paper text, sent as the user message after SYSTEM_PROMPT.

    Returns:
        The content of the first choice's message with surrounding whitespace
        stripped. A JSON object is requested via `response_format`.

    Raises:
        ValueError: If the reply carries no message content (content is None).
        Exception: Anything raised by the client call propagates unchanged.
    """
    # No try/except here: the previous `except Exception as e: raise e` only re-raised
    # the same exception, so letting it propagate is equivalent.
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": paper}],
        response_format={"type": "json_object"},
        stream=False,
    )
    content = response.choices[0].message.content
    if content is None:
        # Fail with a descriptive error instead of AttributeError on None.strip().
        raise ValueError("DeepSeek returned a response with no message content")
    return content.strip()

def get_contributions_from_paper(record: pd.Series) -> list[str]:
    """Extract the list of findings for one paper via the DeepSeek API.

    Args:
        record: Paper row. 'doi' is read here for logging; reconstruct_paper
            reads the text fields.

    Returns:
        The `findings` list of the validated response, or [] when the paper is
        longer than MAX_PAPER_LEN, the API call fails, or the response cannot be
        parsed/validated. Failures are logged, never raised.
    """
    paper = reconstruct_paper(record)
    if len(paper) > MAX_PAPER_LEN:
        # Too long to send: note the DOI and length in a side file, then skip.
        with open("logs/long_papers.jsonl", "a") as file:
            file.write(json.dumps({"doi": record['doi'], "length": len(paper)}) + "\n")
        logging.warning(f"Paper for DOI {record['doi']} is too long: {len(paper)} characters, skipping.")
        return []

    # Get the deepseek API response. A failure here is a request error, not a parse
    # error, so it gets its own log message.
    try:
        json_response = get_deepseek_response(paper)
    except Exception as e:
        logging.error(f"Error getting DeepSeek response for DOI {record['doi']}: {e}")
        return []

    # Parse and validate the JSON response, log any error
    try:
        data = json.loads(json_response)
        findings_obj = Findings.model_validate(data)
        return findings_obj.findings  # the list of finding strings
    except Exception as e:
        logging.error(f"Error parsing JSON for DOI {record['doi']}: {e}")
        print(f"JSON parse error for DOI {record['doi']}")
        return []

In [9]:
# Every DOI cited anywhere in the training data set, de-duplicated.
cited_dois = {doi for dois in df['citation_dois'] for doi in dois}
print(f"Number of unique DOIs in training data: {len(cited_dois)}")

# Drop the DOIs that already have contributions stored; only the rest need processing.
dois_to_process = cited_dois.difference(existing_dois)
print(f"Number of DOIs to process: {len(dois_to_process)}")



Number of unique DOIs in training data: 9954
Number of DOIs to process: 151


In [10]:
from datetime import datetime

# Timestamped file name, so each run of this cell writes to its own output file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"findings_{timestamp}.jsonl"


# For each remaining DOI, get the findings
with open(filename, "a") as file:
    for i, doi in enumerate(dois_to_process):
        # Progress counter, wrapped after every 40 entries (same rule as the insert loop).
        print(i, end=", ")
        if (i + 1) % 40 == 0:
            print()

        # Get the record from the research DataFrame. Test for a match BEFORE indexing:
        # `.iloc[0]` on an empty selection raises IndexError, so a check made afterwards
        # on the extracted row could never report a missing DOI.
        matches = research[research["doi"] == doi]
        if matches.empty:
            logging.warning(f"Record for DOI {doi} not found in research DataFrame")
            continue
        record = matches.iloc[0]

        findings = get_contributions_from_paper(record)
        if not findings:
            logging.warning(f"No findings extracted for DOI {doi}")
            continue

        # Add to jsonl
        findings_data = {"doi": doi, "findings": findings}
        file.write(json.dumps(findings_data) + "\n")
        # Each line is the result of an API call; flush so an interrupted kernel
        # does not lose results still sitting in the write buffer.
        file.flush()

0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 
81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, JSON parse error for DOI 10.3847/1538-4365/ac2a48
110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 
121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 

In [11]:
from embedders import get_embedder

# Embedding model applied to each paper's findings before they are inserted.
embedder = get_embedder(model_name='BAAI/bge-large-en-v1.5', device='mps', normalize=True)

# Load the findings written to the jsonl file by the extraction loop.
findings_df = pd.read_json(filename, lines=True)

# Keep only the papers whose DOI is not already present in the database.
existing_dois = set(existing_dois)
already_stored = findings_df['doi'].isin(existing_dois)
new_findings = findings_df[~already_stored]

print(f"Number of new findings to insert: {len(new_findings)}")




Number of new findings to insert: 123


In [12]:
# Insert the findings into the database, one transaction per paper.
# enumerate() supplies a positional counter: the index labels from iterrows() can have
# gaps after the DOI filter, which would throw off the every-40 line wrap.
for i, (_, row) in enumerate(new_findings.iterrows()):
    print(i, end=", ")
    if (i + 1) % 40 == 0:
        print()
    doi = row["doi"]
    findings = row["findings"]
    embeddings = embedder(findings)
    pubdate = research[research["doi"] == doi]["pubdate"].iloc[0]

    try:
        # One cursor per paper instead of one per finding.
        with db.conn.cursor() as cursor:
            for finding, embedding in zip(findings, embeddings):
                # NOTE(review): the embedding is passed through as returned by embedder();
                # this assumes the DB driver can adapt that type — confirm.
                cursor.execute(
                    "INSERT INTO contributions (embedding, text, doi, pubdate) VALUES (%s, %s, %s, %s)",
                    (embedding, finding, doi, pubdate),
                )
        db.conn.commit()
    except Exception:
        # Discard the partial transaction so the connection stays usable and no paper
        # is left half-inserted, then surface the error.
        db.conn.rollback()
        logging.exception(f"Failed to insert findings for DOI {doi}")
        raise

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 
120, 121, 122, 