In [16]:
import pandas as pd

# Load the citation-sentence dataset; lines=True means the file is JSON Lines
# (one JSON record per line).
df = pd.read_json('data/dataset/nontrivial_llm.jsonl', lines=True)
print(f"Length of DataFrame: {len(df)}")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

Length of DataFrame: 8959


Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes
0,10.1016/j.newar.2024.101694,"Subsequently, Andrews et al. (2017) selected a...","Subsequently, [REF] selected a wide binary can...",58,[10.1093/mnras/stx2000],2024-06-01,[2017MNRAS.472..675A]
1,10.1016/j.newar.2024.101694,Andrews et al. (2017) investigated how the sep...,[REF] investigated how the separation of their...,61,[10.1093/mnras/stx2000],2024-06-01,[2017MNRAS.472..675A]
2,10.1016/j.newar.2024.101694,This led Andrews et al. (2017) to conclude tha...,This led [REF] to conclude that most of the pa...,64,[10.1093/mnras/stx2000],2024-06-01,[2017MNRAS.472..675A]
3,10.1016/j.newar.2024.101694,It may also owe in part to the mass ratio dist...,It may also owe in part to the mass ratio dist...,90,[10.1093/mnras/stz2480],2024-06-01,[2019MNRAS.489.5822E]
4,10.1016/j.newar.2024.101694,Hwang et al. (2022c) used a related method to ...,[REF] used a related method to study the eccen...,110,[10.3847/2041-8213/ac7c70],2024-06-01,[2022ApJ...933L..32H]


In [17]:
# Collect every DOI that already has at least one row in the contributions table
from database.database import Database
db = Database()
db.test_connection()

# Each result row is indexed positionally; column 0 is the doi
results = db.query("SELECT DISTINCT doi FROM contributions")
existing_dois = set(row[0] for row in results)
print(f"Number of unique DOIs in contributions: {len(existing_dois)}")

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)
Number of unique DOIs in contributions: 4864


In [18]:
# Load the preprocessed paper records (JSON Lines). Other cells read the
# 'doi', 'title', 'abstract', 'body' and 'pubdate' columns of this frame.
research = pd.read_json('data/preprocessed/research.jsonl', lines=True)

def reconstruct_paper(example: pd.Series) -> str:
    """Assemble a single text document from one paper record.

    Args:
        example: Mapping-like record providing 'title', 'abstract' and 'body'.

    Returns:
        The title, the abstract (prefixed with "Abstract: ") and the body,
        separated from each other by one blank line.
    """
    sections = (
        f"{example['title']}",
        f"Abstract: {example['abstract']}",
        f"{example['body']}",
    )
    return "\n\n".join(sections)

In [19]:
import logging
# Route log records (DEBUG and above) to logs/deepseek.log; filemode="w"
# truncates the file when the handler is created.
# NOTE(review): basicConfig is a no-op if the root logger already has handlers,
# and this assumes the logs/ directory already exists — TODO confirm both.
logging.basicConfig(
    filename="logs/deepseek.log",
    filemode="w",
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

In [20]:
# Define the deepseek api, which copies the openai api
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
from llm.models import Findings

# Presumably loads variables from a local .env file into os.environ;
# DEEPSEEK_API_KEY is read from the environment by deepseek_client below.
load_dotenv()
# Upper bound, in characters, on a reconstructed paper. Papers above this are
# logged and skipped rather than truncated.
# NOTE(review): 195000 chars ~ 65k tokens implies ~3 characters per token —
# confirm against the model's actual context limit.
MAX_PAPER_LEN = 195000 # ~65k tokens, leaving ~500 tokens for the response

def deepseek_client():
    """Build an OpenAI-compatible client that talks to the DeepSeek API.

    The API key is taken from the DEEPSEEK_API_KEY environment variable.

    Returns:
        An ``OpenAI`` client whose base URL is https://api.deepseek.com.

    Raises:
        AssertionError: If DEEPSEEK_API_KEY is not present in the environment.
    """
    assert "DEEPSEEK_API_KEY" in os.environ, "DEEPSEEK_API_KEY must be set in environment variables"
    api_key = os.environ["DEEPSEEK_API_KEY"]
    return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

# Shared API client used by get_deepseek_response
client = deepseek_client()

# System prompt sent with every request. The encoding is given explicitly so
# the prompt is decoded the same way regardless of the platform's locale.
with open('llm/prompts/original_contributions.txt', 'r', encoding='utf-8') as f:
    SYSTEM_PROMPT = f.read()

def get_deepseek_response(paper: str):
    """Send one paper to the DeepSeek chat endpoint and return the reply text.

    Uses the module-level ``client`` and ``SYSTEM_PROMPT`` and requests a
    JSON-object response (``response_format={"type": "json_object"}``).

    Args:
        paper: Full paper text, sent as the single user message.

    Returns:
        The content of the first choice with surrounding whitespace stripped.

    Raises:
        ValueError: If the first choice carries no message content.
        Exception: Any error raised by the API client propagates unchanged.
    """
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": paper}],
        response_format={"type": "json_object"},
        stream=False,
    )
    content = response.choices[0].message.content
    if content is None:
        # Fail with a clear message instead of an AttributeError on None.strip()
        raise ValueError("DeepSeek response contained no message content")
    return content.strip()

def get_contributions_from_paper(record: pd.Series) -> list[str]:
    """Extract the list of findings for one paper via the DeepSeek API.

    Failures are logged rather than raised, so a batch loop can keep going.

    Args:
        record: Paper record providing 'doi' plus the fields read by
            ``reconstruct_paper`` ('title', 'abstract', 'body').

    Returns:
        The ``findings`` list from the validated ``Findings`` model, or an
        empty list when the paper exceeds ``MAX_PAPER_LEN``, the API call
        fails, or the response cannot be parsed or validated.
    """
    paper = reconstruct_paper(record)
    doi = record['doi']

    if len(paper) > MAX_PAPER_LEN:
        # Over-long papers are skipped; record them so they can be handled separately
        with open("logs/long_papers.jsonl", "a") as file:
            file.write(json.dumps({"doi": doi, "length": len(paper)}) + "\n")
        logging.warning(f"Paper for DOI {doi} is too long: {len(paper)} characters, skipping.")
        return []

    # Call the API; log request failures distinctly from parse failures
    try:
        json_response = get_deepseek_response(paper)
    except Exception as e:
        logging.error(f"DeepSeek API request failed for DOI {doi}: {e}")
        return []

    # Parse and validate the JSON response, log any error
    try:
        data = json.loads(json_response)
        findings_obj = Findings.model_validate(data)
        return findings_obj.findings
    except Exception as e:
        logging.error(f"Error parsing JSON for DOI {doi}: {e}")
        print(f"JSON parse error for DOI {doi}")
        return []

In [21]:
# Flatten the per-sentence citation_dois lists into one set of distinct DOIs
cited_dois = {doi for dois in df['citation_dois'] for doi in dois}
print(f"Number of unique DOIs in training data: {len(cited_dois)}")

# Keep only the DOIs that are not already present in the contributions table
dois_to_process = cited_dois.difference(existing_dois)
print(f"Number of DOIs to process: {len(dois_to_process)}")



Number of unique DOIs in training data: 6513
Number of DOIs to process: 1649


In [22]:
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"findings_{timestamp}.jsonl"


# For each remaining DOI, extract the findings and append them to the JSONL file
with open(filename, "a") as file:
    for i, doi in enumerate(dois_to_process):
        # Progress counter, with a line break every 40 DOIs
        print(i, end=", ")
        if i % 40 == 0:
            print()

        # Look the paper up in the research DataFrame. The emptiness check must
        # come before .iloc[0]: indexing an empty selection raises IndexError,
        # and a row Series is never empty, so checking afterwards cannot work.
        matches = research[research["doi"] == doi]
        if matches.empty:
            logging.warning(f"Record for DOI {doi} not found in research DataFrame")
            continue
        record = matches.iloc[0]

        findings = get_contributions_from_paper(record)
        if not findings:
            logging.warning(f"No findings extracted for DOI {doi}")
            continue

        # Add to jsonl; flush so completed results survive an interrupted run
        findings_data = {"doi": doi, "findings": findings}
        file.write(json.dumps(findings_data) + "\n")
        file.flush()

0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 
81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 
121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 
161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 
201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,

In [23]:
from Embedders import get_embedder

# Embedding model used for the findings text
embedder = get_embedder(
    model_name='BAAI/bge-large-en-v1.5',
    device='mps',
    normalize=True,
)

# Load the findings written to the JSONL file named by `filename`
findings_df = pd.read_json(filename, lines=True)

# Drop rows whose DOI is already in the database
existing_dois = set(existing_dois)
new_findings = findings_df.loc[~findings_df['doi'].isin(existing_dois)]

print(f"Number of new findings to insert: {len(new_findings)}")




Number of new findings to insert: 1453


In [24]:
# Embed each paper's findings and insert them into the contributions table,
# one transaction per DOI.
for position, (_, row) in enumerate(new_findings.iterrows()):
    # Count by position: index labels are not contiguous after filtering
    print(position, end=", ")
    if (position + 1) % 40 == 0:
        print()
    doi = row["doi"]
    findings = row["findings"]
    embeddings = embedder(findings)
    pubdate = research[research["doi"] == doi]["pubdate"].iloc[0]

    try:
        # One cursor per DOI instead of one per finding
        with db.conn.cursor() as cursor:
            for finding, embedding in zip(findings, embeddings):
                # NOTE(review): assumes the DB driver can adapt the embedding
                # object to the column type as-is — TODO confirm.
                cursor.execute(
                    "INSERT INTO contributions (embedding, text, doi, pubdate) VALUES (%s, %s, %s, %s)",
                    (embedding, finding, doi, pubdate),
                )
        db.conn.commit()
    except Exception:
        # Leave the connection usable (no aborted transaction), then re-raise
        db.conn.rollback()
        raise

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 
120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 
200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 