In [1]:
import pandas as pd

Establish the control set of DOIs: those cited in the query dataset, keeping only query examples whose cited papers are all small enough for contribution extraction (fewer than ~65k tokens).

In [14]:
# Collect every DOI cited by the query dataset: a flat list (repeats included)
# and the de-duplicated set derived from it.
query_dataset = pd.read_json('data/dataset/nontrivial_filtered.jsonl', lines=True)
cited_dois = [doi for dois in query_dataset["citation_dois"] for doi in dois]
cited_dois_set = set(cited_dois)

print(f"Number of rows: {len(query_dataset)}")
print(f"Number of unique cited DOIs: {len(cited_dois_set)}")

Number of rows: 14815
Number of unique cited DOIs: 10017


In [9]:
# Load the preprocessed research records (JSON Lines, one record per line).
research = pd.read_json("data/preprocessed/research.jsonl", lines=True)


def reconstruct_paper(example: pd.Series) -> str:
    """Return the paper as a single string: title, abstract, then body.

    `example` must support key lookup for 'title', 'abstract' and 'body'.
    The three parts are separated by blank lines and the abstract is
    prefixed with "Abstract: ".
    """
    sections = (example['title'], f"Abstract: {example['abstract']}", example['body'])
    return "\n\n".join(str(section) for section in sections)

In [13]:
MAX_PAPER_LEN = 250_000  # character budget per paper, used as a stand-in for the LLM's token limit

# Measure each paper once, keyed by DOI. The previous version re-scanned the whole
# `research` frame (and rebuilt the paper text) for every citation of every query
# row. Only the first record per DOI is kept, matching the old `.iloc[0]` choice.
paper_len_by_doi = {}
for _, paper in research.iterrows():
    if paper["doi"] not in paper_len_by_doi:
        paper_len_by_doi[paper["doi"]] = len(reconstruct_paper(paper))


def _all_cited_papers_fit(citation_dois) -> bool:
    """Return True when every cited paper is shorter than MAX_PAPER_LEN characters.

    A cited DOI with no record in `research` raises KeyError naming that DOI
    (the old positional lookup failed with an opaque IndexError instead).
    """
    return all(paper_len_by_doi[doi] < MAX_PAPER_LEN for doi in citation_dois)


# `.astype(bool)` keeps the mask boolean even when the query dataset is empty.
query_df_filtered = query_dataset[query_dataset["citation_dois"].apply(_all_cited_papers_fit).astype(bool)]
print(f"Filtered query dataset to {len(query_df_filtered)} rows")

Filtered query dataset to 14815 rows


In [15]:
# Read the contributions extracted so far, then keep only the rows whose paper
# is actually cited by the query dataset.
contributions_df = pd.read_json("data/findings/combined.jsonl", lines=True)
print(f"Contributions contains {len(pd.unique(contributions_df.doi))} unique DOIs.")

is_cited = contributions_df.doi.isin(cited_dois_set)
contributions_df_filtered = contributions_df[is_cited]
print(f"{len(pd.unique(contributions_df_filtered.doi))} unique DOIs")

Contributions contains 9786 unique DOIs.
9750 unique DOIs


In [16]:
# Collapse repeated DOIs: for each DOI only the row that appears first survives.
contributions_df_filtered = contributions_df_filtered.drop_duplicates(subset=['doi'], keep='first')
print(f"{len(contributions_df_filtered)} rows")

9750 rows


In [17]:
# Cited DOIs that have no extracted contributions yet.
dois_to_process = cited_dois_set.difference(contributions_df_filtered.doi)
print(f"DOIs to process: {len(dois_to_process)}")

DOIs to process: 267


## Process the remaining DOIs

1. Set up logging and functions
1. Send each DOI to the LLM for contribution extraction
1. Write the results out to a file

In [18]:
import logging
import os

# The logging file handler does not create missing parent directories, so make
# sure logs/ exists before pointing basicConfig at a file inside it.
os.makedirs("logs", exist_ok=True)

logging.basicConfig(
    filename="logs/deepseek.log",
    filemode="w",
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    # basicConfig silently does nothing when the root logger already has handlers
    # (for instance when this cell is re-run); force=True replaces them so this
    # configuration always takes effect.
    force=True,
)

In [19]:
# Define the deepseek api, which copies the openai api
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
from llm.models import Findings

load_dotenv()


def deepseek_client():
    """Build an OpenAI-style client that targets the DeepSeek endpoint.

    The API key is taken from the DEEPSEEK_API_KEY environment variable; an
    AssertionError is raised when that variable is not set.
    """
    assert "DEEPSEEK_API_KEY" in os.environ, "DEEPSEEK_API_KEY must be set in environment variables"
    return OpenAI(base_url="https://api.deepseek.com", api_key=os.environ["DEEPSEEK_API_KEY"])


# Shared client used by every request below.
client = deepseek_client()

# Read the system prompt as UTF-8 explicitly: without `encoding`, open() decodes
# with the platform's locale encoding, so the same prompt file could load
# differently (or fail to load) on another machine.
# NOTE(review): assumes the prompt file is stored as UTF-8 — confirm.
with open("llm/prompts/original_contributions.txt", "r", encoding="utf-8") as f:
    SYSTEM_PROMPT = f.read()


def get_deepseek_response(paper: str):
    """Send one paper to DeepSeek and return the reply text, stripped.

    Uses the module-level `client` and `SYSTEM_PROMPT`. The request asks for a
    JSON-object response and is not streamed.

    Args:
        paper: Full text of the paper, sent as the user message.

    Returns:
        The message content of the first choice with surrounding whitespace removed.

    Raises:
        ValueError: If the reply has no choices or its message content is None.
        Exception: Any error raised by the client call propagates unchanged.
    """
    # No try/except here: the former `except Exception as e: raise e` merely
    # re-raised the same error, so letting it propagate directly is equivalent.
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": paper}],
        response_format={"type": "json_object"},
        stream=False,
    )
    content = response.choices[0].message.content if response.choices else None
    if content is None:
        # Calling .strip() on None would surface as an unhelpful AttributeError.
        raise ValueError("DeepSeek returned a response with no message content")
    return content.strip()


def get_contributions_from_paper(record: pd.Series) -> list[str]:
    """Extract the list of findings for one paper through the DeepSeek API.

    Args:
        record: A research row providing 'doi', 'title', 'abstract' and 'body'.

    Returns:
        The extracted findings. An empty list is returned (and the reason is
        logged) when the paper is longer than MAX_PAPER_LEN characters, when
        the API call fails, or when the reply cannot be parsed and validated.
    """
    paper = reconstruct_paper(record)
    if len(paper) > MAX_PAPER_LEN:
        # Too long to send: record the DOI and length in a side file, then skip.
        with open("logs/long_papers.jsonl", "a") as file:
            file.write(json.dumps({"doi": record["doi"], "length": len(paper)}) + "\n")
        logging.warning(f"Paper for DOI {record['doi']} is too long: {len(paper)} characters, skipping.")
        return []

    # Get the deepseek API response. A failure here is a request problem, not a
    # parsing problem, so it gets its own log message (it used to be reported
    # as "Error parsing JSON", which made the log misleading).
    try:
        json_response = get_deepseek_response(paper)
    except Exception as e:
        logging.error(f"Error calling DeepSeek API for DOI {record['doi']}: {e}")
        return []

    # Parse the JSON response and validate it against the Findings model; log any error.
    try:
        data = json.loads(json_response)
        findings_obj = Findings.model_validate(data)
        return findings_obj.findings  # list of finding strings
    except Exception as e:
        logging.error(f"Error parsing JSON for DOI {record['doi']}: {e}")
        print(f"JSON parse error for DOI {record['doi']}")
        return []

In [20]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# NOTE(review): this writes into the working directory, while a later cell reads
# data/findings/findings_<timestamp>.jsonl — presumably the file is moved by hand.
filename = f"findings_{timestamp}.jsonl"


# For each remaining DOI, get the findings
with open(filename, "a") as file:
    # Sorted so the processing order (and the progress numbers) are the same on
    # every run; iteration order of a set of strings can differ between sessions.
    for i, doi in enumerate(sorted(dois_to_process)):
        # Progress indicator: running index, with a line break every 40 items.
        print(i, end=", ")
        if i % 40 == 0:
            print()

        # Get the record from the research DataFrame. Check for a miss *before*
        # indexing: `.iloc[0]` on an empty selection raises IndexError, so the
        # former after-the-fact `record.empty` test could never run.
        matches = research[research["doi"] == doi]
        if matches.empty:
            logging.warning(f"Record for DOI {doi} not found in research DataFrame")
            continue
        record = matches.iloc[0]

        findings = get_contributions_from_paper(record)
        if not findings:
            logging.warning(f"No findings extracted for DOI {doi}")
            continue

        # Add to jsonl; flush per record so finished work survives an interrupted run.
        findings_data = {"doi": doi, "findings": findings}
        file.write(json.dumps(findings_data) + "\n")
        file.flush()

0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, JSON parse error for DOI 10.1111/j.1365-2966.2004.07881.x
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 
81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 
121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 
161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 
201, 202, 203, 204, 205, 206, 207, 208, 2

It turns out that string length does not always approximate token length well. Checking the DOI-processing log, we find that several papers, although well below 250,000 characters, still exceeded DeepSeek's 65k-token limit. Since we are limiting our dataset to papers that can be processed by this LLM, we must now change the DOI control group to the DOIs found in our findings.

Once again, let's combine all the findings, remove any duplicates by DOI, then set the unique DOIs from the findings as the DOI control set.

1. Write out this control set as the DOIs used
1. Make sure the query dataset only includes examples that cite these DOIs
1. Write out the contributions dataset for only these DOIs

In [21]:
# Preview the first rows of the existing (already filtered) contributions.
contributions_df_filtered.head()

Unnamed: 0,doi,findings
0,10.1038/42201,[Direct images of a parsec-scale disk of ioniz...
1,10.1051/0004-6361/202038096,[Low H I covering fractions are strongly corre...
2,10.1093/mnras/stx127,[Diffusive cosmic ray (CR) transport results i...
3,10.1086/311810,[The hard X-rays in Cyg X-1 and similar black ...
4,10.1093/mnras/sty2984,[Investigated the formation of extremely metal...


In [22]:
# Load a findings file (JSON Lines) and preview it.
# NOTE(review): the timestamped file name is hard-coded; presumably it is the output
# of one particular run of the extraction loop — a fresh run stamps its output with
# the current time, so this path must be edited by hand to pick it up.
new_df = pd.read_json("data/findings/findings_20250818_123938.jsonl", lines=True)
new_df.head()

Unnamed: 0,doi,findings
0,10.1086/345660,[The intergalactic medium contains less neutra...
1,10.1086/426070,"[Standard solar models with older, higher heav..."
2,10.1088/0004-637X/763/2/148,[The circumgalactic medium (CGM) of late-type ...
3,10.1086/163605,[Monte Carlo computations accurately model X-r...
4,10.1086/300959,[The universal rest-frame ultraviolet luminosi...


In [23]:
# Stack the existing and the new contributions; when a DOI appears in both,
# the row listed first (the existing one) is the one that is kept.
combined_df = pd.concat(
    [contributions_df_filtered, new_df],
    ignore_index=True,
).drop_duplicates(subset=['doi'], keep='first')

print(f"{len(combined_df)} rows")
print(f"{len(pd.unique(combined_df.doi))} unique DOIs in combined DataFrame")

9989 rows
9989 unique DOIs in combined DataFrame


The set of DOIs in the contributions dataset becomes our control group.

In [24]:
import json

# Persist the control set: the distinct DOIs present in combined_df, as a JSON array.
research_dois_used = list(pd.unique(combined_df["doi"]))
with open("data/research_dois_used.json", "w") as f:
    f.write(json.dumps(research_dois_used))


In [None]:
# Select the research records that belong to the control set and write them out.
research = pd.read_json("data/preprocessed/research.jsonl", lines=True)
research_used = research[research.doi.isin(research_dois_used)]

# A bare `len(...)` in the middle of a cell is evaluated and discarded (only the
# cell's last expression is displayed), so print the count instead.
print(f"{len(research_used)} research records selected")

# Equal lengths alone would also pass if one DOI were duplicated while another
# was missing, so uniqueness is checked as well.
assert research_used.doi.is_unique and len(research_used) == len(research_dois_used), "Should be one record per doi"
research_used.to_json("data/research_used.jsonl", lines=True, orient="records")

In [30]:
# Inspect the first few pubdate values before converting them.
research_used.pubdate[:5]

0    1974-05-01
1    1955-01-01
6    1997-12-01
8    1978-05-01
9    2011-08-01
Name: pubdate, dtype: object

In [32]:
# Using research_used df and looking up by 'doi', add citation count and pubdate to combined_df
combined_df['citation_count'] = combined_df['doi'].apply(lambda x: research_used.loc[research_used['doi'] == x, 'citation_count'].values[0])
combined_df['pubdate'] = combined_df['doi'].apply(lambda x: research_used.loc[research_used['doi'] == x, 'pubdate'].values[0])

# Convert pubdate to int YYYYMMDD format
combined_df['pubdate'] = combined_df['pubdate'].apply(lambda x: int(x.replace("-", "")))

In [33]:
# Preview combined_df with the citation_count and pubdate columns added.
combined_df.head()


Unnamed: 0,doi,findings,citation_count,pubdate
0,10.1038/42201,[Direct images of a parsec-scale disk of ioniz...,114,19970801
1,10.1051/0004-6361/202038096,[Low H I covering fractions are strongly corre...,95,20200701
2,10.1093/mnras/stx127,[Diffusive cosmic ray (CR) transport results i...,118,20170501
3,10.1086/311810,[The hard X-rays in Cyg X-1 and similar black ...,355,19990101
4,10.1093/mnras/sty2984,[Investigated the formation of extremely metal...,52,20190101


In [40]:
# Flatten the per-paper 'findings' lists: one output record per finding string,
# with that paper's doi, citation_count and pubdate repeated on each record.
contributions_denormalized = [
    {'doi': doi, 'text': finding, 'citation_count': citation_count, 'pubdate': pubdate}
    for doi, findings, citation_count, pubdate in zip(
        combined_df['doi'],
        combined_df['findings'],
        combined_df['citation_count'],
        combined_df['pubdate'],
    )
    for finding in findings
]

contributions_df = pd.DataFrame(contributions_denormalized)

# Every finding must have produced exactly one row.
assert len(contributions_df) == sum(len(findings) for findings in combined_df['findings'])


In [41]:
# Size of the flattened dataset: number of rows, then number of distinct DOIs.
print(len(contributions_df))
print(len(pd.unique(contributions_df.doi)))

89860
9989


In [42]:
# Preview the flattened contributions (one finding per row).
contributions_df.head()

Unnamed: 0,doi,text,citation_count,pubdate
0,10.1038/42201,Direct images of a parsec-scale disk of ionize...,114,19970801
1,10.1038/42201,"The disk is viewed nearly edge-on, with indivi...",114,19970801
2,10.1038/42201,The projected axes of the disk and AGN are ali...,114,19970801
3,10.1038/42201,Observations using the Very Large Baseline Arr...,114,19970801
4,10.1038/42201,The brightness temperature of the 'hot zone' (...,114,19970801


In [43]:
# Write the flattened contributions as JSON Lines, one record per row.
contributions_df.to_json("data/research_contributions.jsonl", lines=True, orient="records")


In [44]:
# Row count of the contributions frame that was written out.
len(contributions_df)

89860