# Hydrate the `chunks` table

This notebook chunks and inserts research papers into the database, for use in the baseline model. To make model comparisons apples-to-apples, we need to make sure that the papers represented in `chunks` are the same as those in `contributions`. So we:

1. Load the research dataframe, which contains the full text of all papers
1. Get the set of DOIs in the `contributions` table to use as a control list
1. Get the set of DOIs in the `chunks` table, and compute the set difference (`contributions` minus `chunks`)
1. The remaining DOIs are in `contributions` but not yet in `chunks`, so look each one up in the research dataframe, chunk its full text, and insert the chunks into the `chunks` table

In [1]:
import pandas as pd

# Research corpus with the full text of every paper, stored as JSON Lines
research_path = 'data/preprocessed/research.jsonl'
research_df = pd.read_json(research_path, lines=True)

print(f"Research len: {len(research_df)}")


Research len: 52618


In [2]:
from database.database import Database

# Open the project database and run its connection self-check
db = Database()
db.test_connection()

# DOIs that already have rows in `chunks`
results = db.query("SELECT DISTINCT doi FROM chunks")
existing_dois = set(row[0] for row in results)
print(f"Number of unique DOIs in chunks: {len(existing_dois)}")

# DOIs in `contributions`: the control list that `chunks` must end up matching
results = db.query("SELECT DISTINCT doi FROM contributions")
contribution_dois = set(row[0] for row in results)
print(f"Number of unique DOIs in contributions: {len(contribution_dois)}")

# Whatever is in the control list but not yet chunked is the work still to do
dois_to_chunk = contribution_dois.difference(existing_dois)
print(f"Number of DOIs to chunk: {len(dois_to_chunk)}")

Database         User             Host                             Port            
citelinedb       bbasseri         localhost                        5432            
Database version: ('PostgreSQL 17.5 (Homebrew) on aarch64-apple-darwin24.4.0, compiled by Apple clang version 17.0.0 (clang-1700.0.13.3), 64-bit',)
Number of unique DOIs in chunks: 4864
Number of unique DOIs in contributions: 6317
Number of DOIs to chunk: 1453


In [3]:
# Set up splitter and embedder

import torch
from semantic_text_splitter import TextSplitter
from Embedders import get_embedder

# Chunking and embedding settings, named so they are easy to find and tune
CHUNK_CAPACITY = 1500
CHUNK_OVERLAP = 150
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"

splitter = TextSplitter(capacity=CHUNK_CAPACITY, overlap=CHUNK_OVERLAP)

# Prefer CUDA, then Apple-silicon MPS, and fall back to CPU.
# torch.backends.mps.is_available() is the documented availability check;
# torch.mps.is_available() appears to exist only in recent PyTorch releases and
# would raise AttributeError on older ones whenever CUDA is absent.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

embedder = get_embedder(EMBEDDING_MODEL, device=device, normalize=True)

In [4]:
import logging

from pathlib import Path

LOG_FILE = Path("logs/chunk_hydration.log")

# The file handler does not create missing parent directories, so make sure
# `logs/` exists before configuring logging (otherwise FileNotFoundError).
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    filename=str(LOG_FILE),
    filemode="w",
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    # basicConfig is a no-op when the root logger already has handlers (for
    # example after re-running this cell); force=True replaces them so the
    # file handler is always installed and the log starts fresh.
    force=True,
)

In [5]:
from tqdm import tqdm
def reconstruct_paper(example: pd.Series) -> str:
    """Rebuild a paper's text from one research record.

    Args:
        example: A record providing 'title', 'abstract' and 'body' entries.

    Returns:
        The title, the abstract prefixed with "Abstract: ", and the body,
        separated from one another by blank lines.
    """
    sections = (
        f"{example['title']}",
        f"Abstract: {example['abstract']}",
        f"{example['body']}",
    )
    return "\n\n".join(sections)

# Parameterized insert: values are bound by the database driver, never interpolated
INSERT_CHUNK_SQL = "INSERT INTO chunks (embedding, text, doi, pubdate) VALUES (%s, %s, %s, %s)"

# Index the research records by DOI once, instead of scanning the whole
# dataframe with a boolean mask for every DOI. keep="first" mirrors the previous
# `.iloc[0]` choice when a DOI occurs more than once; rows without a DOI can
# never match a lookup, so they are dropped up front.
research_by_doi = (
    research_df
    .dropna(subset=["doi"])
    .drop_duplicates(subset="doi", keep="first")
    .set_index("doi")
)

for doi in tqdm(dois_to_chunk, desc="processing dois"):
    # Get the doi's full record from research df
    if doi not in research_by_doi.index:
        logging.error("DOI %s not found in research_df", doi)
        continue
    record = research_by_doi.loc[doi]

    paper = reconstruct_paper(record)

    # Strip null characters BEFORE testing for emptiness: a chunk made only of
    # "\x00" is truthy after strip() and would otherwise be stored as an empty string.
    cleaned = (chunk.replace("\x00", "").strip() for chunk in splitter.chunks(paper))
    chunks = [chunk for chunk in cleaned if chunk]
    if not chunks:
        # Nothing to embed or insert; avoid handing the embedder an empty batch.
        logging.warning("DOI %s produced no non-empty chunks; skipping", doi)
        continue

    embeddings = embedder(chunks)

    # One cursor and one commit per paper, so a paper's chunks land atomically.
    try:
        with db.conn.cursor() as cursor:
            for chunk, embedding in zip(chunks, embeddings):
                cursor.execute(
                    INSERT_CHUNK_SQL,
                    (embedding, chunk, doi, record['pubdate'])
                )
        db.conn.commit()
    except Exception:
        # Roll back so the connection is not left in an aborted transaction with
        # a partially inserted paper, then surface the original error.
        db.conn.rollback()
        logging.exception("Failed to insert chunks for DOI %s; transaction rolled back", doi)
        raise
        

processing dois: 100%|██████████| 1453/1453 [52:51<00:00,  2.18s/it] 


In [6]:
# Verify the hydration: `chunks` should now hold exactly the DOIs found in `contributions`
results = db.query("SELECT DISTINCT doi FROM chunks")
chunk_dois = set(row[0] for row in results)
dois_match = chunk_dois == contribution_dois

print(f"Number of unique DOIs in chunks after hydration: {len(chunk_dois)}")
print(f"Number of unique DOIs in contributions: {len(contribution_dois)}")
print(f"Same set of DOIS: {dois_match}")
assert dois_match, "DOIs in chunks do not match those in contributions"

Number of unique DOIs in chunks after hydration: 6317
Number of unique DOIs in contributions: 6317
Same set of DOIS: True


In [7]:
def cites_only_known_dois(citation_dois) -> bool:
    """Return True when every cited DOI is a member of contribution_dois."""
    return contribution_dois.issuperset(citation_dois)


examples = pd.read_json('data/dataset/nontrivial_llm.jsonl', lines=True)
print(f"Examples len: {len(examples)}")

# Keep only the examples whose citation_dois all appear in contribution_dois
has_only_known_citations = examples['citation_dois'].apply(cites_only_known_dois)
examples = examples[has_only_known_citations]
print(f"Examples after filtering: {len(examples)}")

Examples len: 8959
Examples after filtering: 8540


In [8]:
# Write the filtered examples back out as JSON Lines (one record per line)
filtered_examples_path = 'data/dataset/nontrivial_no_longpapers.jsonl'
examples.to_json(filtered_examples_path, orient='records', lines=True)