In [1]:
import pandas as pd
from pprint import pprint

# Load the small training split; lines=True makes pandas parse the file as JSON Lines (one record per line).
df = pd.read_json('data/dataset/split/small_train.jsonl', lines=True)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes
0,10.1146/annurev.astro.46.060407.145222,It is unclear whether the solution can be foun...,It is unclear whether the solution can be foun...,541,[10.1111/j.1365-2966.2009.14750.x],2009-09-01,[2009MNRAS.396..203S]
1,10.1016/j.newar.2024.101694,The gravitational effects of the directly-imag...,The gravitational effects of the directly-imag...,276,[10.1051/0004-6361/201834371],2024-06-01,[2019A&A...623A..72K]
2,10.1146/annurev-astro-081811-125615,Connolly et al. (1997) and Pascarelle et al. (...,[REF] and [REF] combined the optical HST imagi...,477,"[10.1086/310829, 10.1086/311708, 10.1086/30997...",2014-08-01,"[1997ApJ...486L..11C, 1998ApJ...508L...1P, 199..."
3,10.1146/annurev-astro-081811-125615,Cowie et al. (1999) and Wilson et al. (2002) c...,[REF] and [REF] combined Keck spectroscopy in ...,481,"[10.1086/300959, 10.1086/341818, 10.1086/309975]",2014-08-01,"[1999AJ....118..603C, 2002AJ....124.1258W, 199..."
4,10.1146/annurev-astro-091916-055240,Tumlinson et al. (2011 ) found that O vi trace...,[REF] found that O vi traces a warm CGM compon...,348,[10.1126/science.1209840],2017-08-01,[2011Sci...334..948T]


In [2]:
from database.database import Database
# Project database wrapper; later cells go through `db.conn` and `db.vector_search`.
db = Database()
# Prints the connection details and the server version (see the cell output).
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


## Test run: query - top k - deepseek boolean
1. Get the top k=20 results from the contributions database
1. Check if the target is in the results. If not, pick a different example (no need to use up DeepSeek credits)
1. Get the unique DOIs (20 or fewer) from the top 20 results
1. Query the database to reconstruct each of those papers from its title, abstract, and body
1. For each of those papers, send the query together with the paper text to DeepSeek to get a 0/1 score
1. Keep only the papers with a 1 score

In [3]:
# Pick one training example: its citation-stripped sentence is the retrieval query
query_record = df.iloc[1]
query = query_record['sent_no_cit']
pprint(query)
print(query_record['citation_dois'])  # the DOI(s) this sentence actually cites

from Embedders import get_embedder

# NOTE(review): device="mps" is hard-coded; presumably Apple-GPU only, confirm before running elsewhere
embedder = get_embedder(
    model_name="BAAI/bge-large-en-v1.5",
    device="mps",
    normalize=True,
)
# The embedder is called with a list of texts, so keep only the single row that comes back
query_batch = embedder([query])
query_vector = query_batch[0]
print(query_vector.shape)

('The gravitational effects of the directly-imaged planet Pic b on its host '
 'star, for example, were detected by [REF] with high significance.')
['10.1051/0004-6361/201834371']
(1024,)


In [4]:
# Roll back the current transaction on the shared connection.
# NOTE(review): presumably here to clear a transaction left aborted by an earlier failed query; confirm.
db.conn.rollback()

In [5]:
# Fetch the top 20 matches for the query vector from the `contributions` embeddings
# NOTE(review): `pubdate` presumably restricts candidates by publication date; confirm in Database.vector_search
search_kwargs = dict(
    query_vector=query_vector,
    table_name="contributions",
    target_column="embedding",
    pubdate=query_record['pubdate'],
    top_k=20,
    probes=16,
)
results = db.vector_search(**search_kwargs)
print(f"Got {len(results)} results")

Got 20 results


In [6]:
# Collapse the hits to unique DOIs; dict keys drop duplicates while keeping first-seen order
dois = list(dict.fromkeys(result.doi for result in results))
print(f"Unique DOIs: {len(dois)}")
print(dois)

# A query sentence can cite several papers, so check every target DOI rather than
# only the first one (this also avoids an IndexError when the list is empty).
target_dois = list(query_record['citation_dois'])
found_targets = [target for target in target_dois if target in dois]
print(f"Target in results: {bool(found_targets)}")
print(f"Targets found: {len(found_targets)}/{len(target_dois)}")

Unique DOIs: 17
['10.3847/1538-4357/acbb5f', '10.1086/508143', '10.1093/mnras/stx1301', '10.1051/0004-6361/201834371', '10.1086/324269', '10.3847/0004-637X/823/2/102', '10.1086/147433', '10.1093/mnras/staa1522', '10.1086/426733', '10.1088/0004-637X/788/2/119', '10.1093/mnras/staa3854', '10.1088/0004-637X/763/2/113', '10.1111/j.1365-2966.2008.13979.x', '10.1086/312838', '10.1111/j.1365-2966.2007.12610.x', '10.1111/j.1365-2966.2011.20189.x', '10.1051/0004-6361:20077525']
Target in results: True


In [7]:
def reconstruct_paper(doi: str) -> str:
    """Rebuild a paper's text from its row in the `papers` table.

    Args:
        doi: DOI used to look the paper up.

    Returns:
        The title, then the abstract prefixed with "Abstract: ", then the
        body, separated by blank lines.

    Raises:
        ValueError: If the lookup returns no row for `doi`.
    """
    row = db.conn.execute(
        "SELECT title, abstract, body FROM papers WHERE doi = %s",
        (doi,),
    ).fetchone()

    # Guard clause: nothing to rebuild when the DOI is unknown
    if not row:
        raise ValueError(f"No paper found for DOI: {doi}")

    title, abstract, body = row
    return f"{title}\n\nAbstract: {abstract}\n\n{body}"

In [25]:
from Rerankers import get_deepseek_boolean

import json
import logging
import os

# basicConfig opens the log file right away and raises FileNotFoundError when the
# directory is missing, so create it first (needed for a clean run on a fresh checkout).
LOG_PATH = "logs/deepseek.log"
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
logging.basicConfig(
    filename=LOG_PATH,
    filemode="w",  # start a fresh log file each time the handler is created
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# Callable used below as deepseek_boolean(query=..., candidate=...)
deepseek_boolean = get_deepseek_boolean()

MAX_PAPER_LEN = 210_000  # Limit to 210,000 characters ~ 63k tokens

# Reconstruct and truncate this paper, then ask DeepSeek for a cite / don't-cite verdict
def should_paper_be_cited(doi: str, query: str) -> dict:
    """Ask the DeepSeek boolean reranker whether paper `doi` should be cited for `query`.

    The paper text is rebuilt with `reconstruct_paper` and cut to
    `MAX_PAPER_LEN` characters before it is sent as the candidate.

    Args:
        doi: DOI of the candidate paper.
        query: Sentence the candidate is judged against.

    Returns:
        A dict with two keys: "should_cite" (the value from the model's JSON
        reply) and "reasoning" (the reply's reasoning, or a placeholder string
        when the reply has no such key).

    Raises:
        Exception: Whatever `deepseek_boolean` raised, re-raised after logging.
        AttributeError, IndexError, TypeError: The response object does not
            expose `choices[0].message.content`, or the decoded JSON is not a
            mapping.
        json.JSONDecodeError: The reply content is not valid JSON.
        KeyError: The decoded JSON has no 'should_cite' key.
    """
    paper = ""
    try:
        full_paper = reconstruct_paper(doi)
    except ValueError as e:
        # Best effort, as before: a paper that cannot be reconstructed is logged
        # and judged on empty text.
        logging.error(f"Error reconstructing paper for DOI {doi}: {e}")
    else:
        paper = full_paper[:MAX_PAPER_LEN]
        if len(full_paper) > MAX_PAPER_LEN:
            logging.debug(f"Truncated paper for DOI {doi} from {len(full_paper)} to {MAX_PAPER_LEN} characters")

    # Try getting the DeepSeek response. Every failure below is logged and then
    # re-raised: carrying on would only hit a None response or an unbound local.
    try:
        response = deepseek_boolean(query=query, candidate=paper)
    except Exception as e:
        logging.error(f"Error calling DeepSeek API for DOI {doi}: {e}")
        raise

    try:
        content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError) as e:
        logging.error(f"Unexpected DeepSeek response for DOI {doi}: {e}. Raw response: {response}")
        raise

    # Parse the response
    try:
        json_content = json.loads(content)
    except (json.JSONDecodeError, TypeError) as e:
        logging.error(f"Error parsing JSON response for DOI {doi}: {e}. Response content: {content}")
        raise

    try:
        should_cite = json_content['should_cite']
    except (KeyError, TypeError) as e:
        logging.error(f"Error extracting 'should_cite' from JSON response for DOI {doi}: {e}. Response content: {content}")
        raise

    reasoning = json_content.get('reasoning', 'No reasoning key in response')
    logging.info(f"DOI: {doi}, Should cite: {should_cite}, Reasoning: {reasoning}")
    logging.info(f"Raw response: {response}")

    return {
        "should_cite": should_cite,
        "reasoning": reasoning,
    }

In [26]:
# Try the reranker on the first candidate DOI before looping over all of them.
should_cite = should_paper_be_cited(dois[0], query)
pprint(should_cite)

{'reasoning': 'The candidate paper discusses the detection of gravitational '
              'effects on stars by unseen companions, including black holes, '
              "using Gaia's astrometric measurements. However, the draft text "
              'specifically mentions the gravitational effects of the '
              'directly-imaged planet Pic b on its host star, which is a '
              'different context and not directly related to the candidate '
              "paper's focus on astrometric measurements of partial orbits "
              'with Gaia for detecting unseen companions like black holes. '
              'Therefore, the candidate paper does not provide relevant '
              'information or support for the specific example given in the '
              'draft text.',
 'should_cite': False}


In [27]:
# Run the reranker over every candidate DOI and keep the ones it says to cite
dois_to_cite = []
all_results = []
for doi in dois:
    try:
        # Named `verdict`, not `results`, so the vector-search `results` list that
        # the unique-DOI cell reads is not overwritten by this loop.
        verdict = should_paper_be_cited(doi, query)
    except Exception as e:
        # logging.exception also records the traceback; this DOI is then skipped
        logging.exception(f"Error processing DOI {doi}: {e}")
        continue

    # Failed DOIs are skipped, so list position no longer identifies the paper:
    # store the DOI alongside each verdict.
    all_results.append({"doi": doi, **verdict})
    if verdict['should_cite']:
        dois_to_cite.append(doi)



In [29]:
# Show the DOIs the reranker kept, one per line
for cited_doi in dois_to_cite:
    print(cited_doi)

10.1051/0004-6361/201834371
10.3847/0004-637X/823/2/102
10.1093/mnras/staa1522


IoU = 1/3 ≈ 0.33 (the one target DOI is among the three DOIs kept): relatively good!

In [34]:
# Preview the remaining query sentences.
# The loop variable is deliberately not `query` / `query_record`: those notebook
# globals feed the search and rerank cells and must not be overwritten by a preview.
for sentence in df['sent_no_cit'].iloc[1:]:
    print(sentence)

The gravitational effects of the directly-imaged planet Pic b on its host star, for example, were detected by [REF] with high significance.
[REF] and [REF] combined the optical HST imaging of the HDF with ground-based NIR data to improve photometric redshift analyses in the redshift “desert” at 1 z 2, between the regime of [REF] and that of [REF] .
[REF] and [REF] combined Keck spectroscopy in several fields with deep U -band imaging to measure shorter rest-frame UV wavelengths (2,000–2,500 Å) at z 1 than were probed in the CFRS analysis of [REF] and derived a shallower rate of decline in the SFRD.
[REF] found that O vi traces a warm CGM component that contributes >2×10 9 M ⊙ of gas to the L * baryon budget.
The rates of accretion onto galaxies and of outflow out of galaxies are crucial parameters in most models of galaxy evolution ( [REF] ).
The EUV ion Ne viii redshifts into the COS band at z >0.5, where a few detections ( [REF] ) hint that it may be present in halos out to 100–200 k

In [None]:
# Assuming the db has a 'contributions' table already with embedding(1024), text, pubdate, and doi columns
# (the INSERT further down writes all four).

from test_findings import findings
print(f"Imported findings with {len(findings)} entries")

In [None]:
from Embedders import get_embedder
# Same model / device / normalize settings as the embedder built for the query earlier in this notebook;
# running this cell rebinds `embedder`.
embedder = get_embedder(model_name='BAAI/bge-large-en-v1.5', device='mps', normalize=True)

In [None]:
from tqdm import tqdm

# For each entry in findings, embed its list of sentences and insert one row per
# sentence into the `contributions` table.
# NOTE(review): `research` is not defined anywhere in the visible notebook. It is
# presumably a DataFrame with 'doi' and 'pubdate' columns loaded elsewhere; define
# it before running this cell.
for doi, sentences in tqdm(findings.items(), desc="Embedding findings"):
    # tqdm.write keeps messages from breaking the progress bar
    tqdm.write(f"Processing DOI: {doi} with {len(sentences)} sentences")

    # Get the associated pubdate: filter once, and before embedding, so DOIs that
    # end up skipped cost no model time.
    pubdate_matches = research.loc[research['doi'] == doi, 'pubdate']
    pubdate = pubdate_matches.values[0] if not pubdate_matches.empty else None
    if not pubdate:
        tqdm.write(f"Warning: No pubdate found for DOI {doi}. Skipping.")
        continue

    embeddings = embedder(sentences)

    try:
        with db.conn.cursor() as cursor:
            for embedding, text in zip(embeddings, sentences):
                # Insert into the database
                cursor.execute(
                    "INSERT INTO contributions (embedding, text, pubdate, doi) VALUES (%s, %s, %s, %s)",
                    (embedding, text, pubdate, doi)
                )
        # One commit per DOI: a paper's sentences are stored together or not at all
        db.conn.commit()
    except Exception:
        # Undo the partial insert so the shared connection is not left mid-transaction
        db.conn.rollback()
        raise