In [1]:
import pandas as pd

df = pd.read_json('data/dataset/split/small_train.jsonl', lines=True)
df.head()

Unnamed: 0,source_doi,sent_original,sent_no_cit,sent_idx,citation_dois,pubdate,resolved_bibcodes
0,10.1146/annurev.astro.46.060407.145222,It is unclear whether the solution can be foun...,It is unclear whether the solution can be foun...,541,[10.1111/j.1365-2966.2009.14750.x],2009-09-01,[2009MNRAS.396..203S]
1,10.1016/j.newar.2024.101694,The gravitational effects of the directly-imag...,The gravitational effects of the directly-imag...,276,[10.1051/0004-6361/201834371],2024-06-01,[2019A&A...623A..72K]
2,10.1146/annurev-astro-081811-125615,Connolly et al. (1997) and Pascarelle et al. (...,[REF] and [REF] combined the optical HST imagi...,477,"[10.1086/310829, 10.1086/311708, 10.1086/30997...",2014-08-01,"[1997ApJ...486L..11C, 1998ApJ...508L...1P, 199..."
3,10.1146/annurev-astro-081811-125615,Cowie et al. (1999) and Wilson et al. (2002) c...,[REF] and [REF] combined Keck spectroscopy in ...,481,"[10.1086/300959, 10.1086/341818, 10.1086/309975]",2014-08-01,"[1999AJ....118..603C, 2002AJ....124.1258W, 199..."
4,10.1146/annurev-astro-091916-055240,Tumlinson et al. (2011 ) found that O vi trace...,[REF] found that O vi traces a warm CGM compon...,348,[10.1126/science.1209840],2017-08-01,[2011Sci...334..948T]


In [2]:
research = pd.read_json('data/preprocessed/research.jsonl', lines=True)

In [3]:
# Get the unique target DOIs from the dataset
dois = list(set(doi for dois in df.citation_dois.tolist() for doi in dois))
dois


['10.1086/311708',
 '10.1086/310829',
 '10.1086/300959',
 '10.1088/0004-637X/767/1/49',
 '10.1093/mnras/283.4.1388',
 '10.1086/341818',
 '10.1111/j.1365-2966.2009.14750.x',
 '10.1126/science.1209840',
 '10.48550/arXiv.2203.02041',
 '10.1086/163605',
 '10.3847/1538-4357/aa6007',
 '10.1086/309975',
 '10.1051/0004-6361/201834371',
 '10.3847/1538-4357/aa9ced']

In [4]:
def reconstruct_paper(example: pd.Series) -> str:
    return f"{example['title']}\n\nAbstract: {example['abstract']}\n\n{example['body']}"

In [5]:
from database.database import Database
db = Database()
db.test_connection()

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)


In [6]:
# Assuming the db has a 'contributions' table already with embedding(1024), text, and doi columns

from test_findings import findings
print(f"Imported findings with {len(findings)} entries")

Imported findings with 14 entries


In [7]:
from Embedders import get_embedder
embedder = get_embedder(model_name='BAAI/bge-large-en-v1.5', device='mps', normalize=True)

In [10]:
from tqdm import tqdm
# For each entry in findings, embed the string list
# insert these into the database
for doi, sentences in findings.items():
    print(f"Processing DOI: {doi} with {len(sentences)} sentences")
    embeddings = embedder(sentences)

    # Get the associated pubdate
    pubdate = research[research['doi'] == doi]['pubdate'].values[0] if not research[research['doi'] == doi].empty else None
    if not pubdate:
        print(f"Warning: No pubdate found for DOI {doi}. Skipping.")
        continue
    with db.conn.cursor() as cursor:
        for embedding, text in zip(embeddings, sentences):
            # Insert into the database
            cursor.execute(
                "INSERT INTO contributions (embedding, text, pubdate, doi) VALUES (%s, %s, %s, %s)",
                (embedding, text, pubdate, doi)
            )
    db.conn.commit()

Processing DOI: 10.1088/0004-637X/767/1/49 with 7 sentences
Processing DOI: 10.1086/163605 with 10 sentences
Processing DOI: 10.1111/j.1365-2966.2009.14750.x with 16 sentences
Processing DOI: 10.1086/309975 with 9 sentences
Processing DOI: 10.1086/300959 with 10 sentences
Processing DOI: 10.1086/311708 with 8 sentences
Processing DOI: 10.1051/0004-6361/201834371 with 16 sentences
Processing DOI: 10.3847/1538-4357/aa6007 with 10 sentences
Processing DOI: 10.1093/mnras/283.4.1388 with 12 sentences
Processing DOI: 10.48550/arXiv.2203.02041 with 15 sentences
Processing DOI: 10.1086/310829 with 8 sentences
Processing DOI: 10.1086/341818 with 11 sentences
Processing DOI: 10.3847/1538-4357/aa9ced with 12 sentences
Processing DOI: 10.1126/science.1209840 with 10 sentences
