In [1]:
from database.database import Database
import pandas as pd
import numpy as np
from embedders import get_embedder

embedder = get_embedder(model_name="BAAI/bge-large-en-v1.5", device="mps", normalize=True)
db = Database()
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


In [4]:
# Sample 20 examples from the training data
examples = pd.read_json("data/dataset/nontrivial_filtered.jsonl", lines=True).sample(20, random_state=42)
examples.head()

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes
10804,10.1007/s00159-014-0081-z,The increase in SFR with redshift for star-for...,The increase in SFR with redshift for star-for...,563,"[10.1088/0004-637X/713/1/686, 10.1111/j.1365-2...",2015-01-01,"[2010ApJ...713..686D, 2010MNRAS.407.2091G, 201..."
4759,10.1146/annurev.astro.42.053102.134034,"The second constraint, which is difficult to a...","The second constraint, which is difficult to a...",521,[10.1046/j.1365-8711.2003.06976.x],2004-09-01,[2003MNRAS.344L...7C]
3842,10.1007/s00159-020-00125-0,"For example, Lauer et al. ( 2005 ) catalogs a ...","For example, [REF] catalogs a number of nuclea...",174,"[10.1086/429565, 10.1086/504042]",2020-07-01,"[2005AJ....129.2138L, 2006ApJS..165...57C]"
3544,10.1146/annurev-astro-032620-021910,This is seen for nearby populations but also i...,This is seen for nearby populations but also i...,822,"[10.1051/0004-6361/201117239, 10.1088/0004-637...",2020-08-01,"[2011A&A...533A.119E, 2011ApJ...742...96W]"
2150,10.1146/annurev-astro-081817-051853,One possibility is that tidal heating reinflat...,One possibility is that tidal heating reinflat...,335,[10.1088/0004-637X/727/2/75],2018-09-01,[2011ApJ...727...75I]


In [15]:
def get_search_results(example: pd.Series, table: str = "chunks") -> pd.DataFrame:
    embedding = embedder([example["sent_no_cit"]])[0]
    
    # Perform vector search
    chunk_results = db.vector_search(
        query_vector=embedding,
        target_table=table,
        target_column="embedding",
        pubdate=example.get("pubdate"),
        top_k=1000,  # Adjust as needed
    )
    
    assert chunk_results['distance'].is_monotonic_increasing, "Distances are not in ascending order"
    return chunk_results

results = get_search_results(examples.iloc[0])
print(len(results))

1000


In [16]:
example = examples.iloc[0]
example['citation_dois']

['10.1088/0004-637X/713/1/686',
 '10.1111/j.1365-2966.2010.16969.x',
 '10.1088/0004-637X/768/1/74']

In [17]:
# for each doi in example['citation_dois'] get the lowest index in the results where it appears, or -1 if not found
doi_indices = {doi: results[results['doi'] == doi].index.min() if not results[results['doi'] == doi].empty else -1 for doi in example['citation_dois']}
print(doi_indices)

{'10.1088/0004-637X/713/1/686': -1, '10.1111/j.1365-2966.2010.16969.x': np.int64(89), '10.1088/0004-637X/768/1/74': np.int64(6)}


In [22]:
# We see doi 10.1088/0004-637X/713/1/686 didn't appear in the top 1000 results. Check it is in the database

count = db.query("SELECT COUNT(id) FROM chunks WHERE doi = '10.1088/0004-637X/713/1/686'")
print(len(count))

1


In [24]:
# Why didn't this DOI appear in the database? is it too long?
research = pd.read_json('data/preprocessed/research.jsonl', lines=True)

In [29]:
paper = research[research['doi'] == '10.1088/0004-637X/713/1/686']
# get the text of the 'body' field
body_text = paper['body'].values[0]
print(len(body_text), body_text[:500])  # Print length and first 500 characters

88784 1. INTRODUCTION Over the last decade, deep and wide multiwavelength galaxy surveys have been key in addressing critical issues regarding galaxy evolution. By using a variety of color selection techniques, and deriving photometric and/or spectroscopic redshifts, different galaxy populations have now been probed to unprecedented detail up to at least z ∼ 3. Also, the history of cosmic star formation as well as the corresponding build-up of stellar mass has now been constrained using different obse
