In [17]:
import numpy as np
import pandas as pd
from semantic_text_splitter import TextSplitter
import torch
from tqdm import tqdm
from citeline.embedders import Embedder
import itertools

# Model identifier handed to Embedder.create() when the embedder is built.
EMBEDDER = "Qwen/Qwen3-Embedding-0.6B"


def reconstruct_paper(example: pd.Series) -> str:
    """Rebuild one paper's full text from its ``title``, ``abstract`` and ``body`` fields.

    The three parts are separated by blank lines and the abstract is prefixed
    with ``"Abstract: "``.
    """
    title = example["title"]
    abstract = example["abstract"]
    body = example["body"]
    sections = [f"{title}", f"Abstract: {abstract}", f"{body}"]
    return "\n\n".join(sections)


# Draw a fixed, seeded sample of 10 examples and gather every DOI they cite.
sample_df = pd.read_json("../data/dataset/nontrivial_checked.jsonl", lines=True).sample(n=10, random_state=42)
cited_dois = sample_df["citation_dois"].explode()
target_dois = set(cited_dois.unique())
print(f"Testing text splitter grid search on {len(target_dois)} unique DOIs")

# Load the research corpus, keep only the cited papers, and derive two columns:
#   paper   - full text rebuilt from title, abstract and body
#   pubdate - the date string with dashes removed, converted to an int
research = pd.read_json("../data/preprocessed/research.jsonl", lines=True)
is_target = research["doi"].isin(target_dois)
research = research[is_target]
research["paper"] = research.apply(reconstruct_paper, axis=1)
research["pubdate"] = research["pubdate"].apply(lambda date_str: int(date_str.replace("-", "")))

research.head()

Testing text splitter grid search on 13 unique DOIs


Unnamed: 0,bibcode,abstract,aff,author,bibstem,doctype,doi,id,pubdate,title,...,reference,data,citation_count,citation,body,dois,keywords,loaded_from,body_sentences,paper
589,1986ApJ...303...39D,"The formation of dwarf, diffuse, metal-poor ga...","[Yale University, New Haven, CT; Weizmann Inst...","[Dekel, A., Silk, J.]","[ApJ, ApJ...303]",article,10.1086/164050,1621141,19860401,"The Origin of Dwarf Galaxies, Cold Dark Matter...",...,"[1961AJ.....66..249H, 1961AJ.....66..384H, 196...","[NED:10, SIMBAD:10]",1975,"[1986A&A...168...81C, 1986AJ.....92..580A, 198...",O'Q! 00 00 o oo KO 00 O'! ^ The Astrophysical ...,[10.1086/164050],"[Abundance, Cold Plasmas, Dark Matter, Dwarf G...",data/json/Astro_Research.json,"[O'Q! 00 00 o oo KO 00 O'!, ^ The Astrophysic...","The Origin of Dwarf Galaxies, Cold Dark Matter..."
7901,1993ApJ...410..696O,We have reduced and analyzed a set of narrow-b...,"[Rice Univ., Houston, TX, Rice Univ., Houston,...","[O'Dell, C. R., Wen, Zheng, Hu, Xihai]","[ApJ, ApJ...410]",article,10.1086/172786,1876038,19930601,Discovery of New Objects in the Orion Nebula o...,...,"[1954TrSht..25....1P, 1958AnDea...7...67S, 195...","[ESA:1, SIMBAD:26, hst:1]",366,"[1993RMxAA..27...55O, 1993RMxAA..27..153H, 199...","Ti~w ASTROPHYSICAL JOURNAL, 410:696-700, 1993 ...",[10.1086/172786],"[Herbig-Haro Objects, Hubble Space Telescope, ...",data/json/salvaged_articles.json,"[Ti~w ASTROPHYSICAL JOURNAL, 410:696-700, 1993...",Discovery of New Objects in the Orion Nebula o...
8463,2002AJ....123.1454B,Nearby dwarf irregular galaxies were searched ...,"[Lowell Observatory, 1400 West Mars Hill Road,...","[Billett, Olivia H., Hunter, Deidre A., Elmegr...","[AJ, AJ....123]",article,10.1086/339181,11740864,20020301,Compact Star Clusters in Nearby Dwarf Irregula...,...,"[1952PASP...64..196G, 1955ApJ...121..161S, 195...","[ESA:1, NED:91, SIMBAD:100, hst:1]",126,"[2002A&A...390..481V, 2002AJ....124.1393L, 200...",1. INTRODUCTION The properties of super–star c...,"[10.1086/339181, 10.48550/arXiv.astro-ph/0112260]","[Galaxies: Formation, Galaxies: Irregular, Gal...",data/json/salvaged_articles.json,[1. INTRODUCTION The properties of super–star ...,Compact Star Clusters in Nearby Dwarf Irregula...
18980,1985ApJ...295...73K,Core radii and central surface brightnesses of...,"[Dominion Astrophysical Observatory, Victoria,...","[Kormendy, J.]","[ApJ, ApJ...295]",article,10.1086/163350,1589567,19850801,Families of ellipsoidal stellar systems and th...,...,"[1966AJ.....71...64K, 1970ApJ...160..811F, 197...","[NED:30, SIMBAD:35]",553,"[1985AJ.....90.2221O, 1985ApJ...299..881T, 198...",1985ApJ. . .295. . .73K The Astrophysical Jour...,[10.1086/163350],"[Disk Galaxies, Dwarf Galaxies, Elliptical Gal...",data/json/salvaged_articles.json,"[1985ApJ. . .295. ., .73K The Astrophysical...",Families of ellipsoidal stellar systems and th...
24992,1977ApJS...34..405B,Techniques are described for constructing comp...,"[Minnesota, University, Minneapolis, Minn., Ha...","[Black, J. H., Dalgarno, A.]","[ApJS, ApJS...34]",article,10.1086/190455,1365218,19770701,Models of interstellar clouds. I. The Zeta Oph...,...,"[1949MNRAS.109..698B, 1951ApJ...113..441B, 195...",,445,"[1977ApJ...217L.109C, 1977ApJS...35..281T, 197...",197 7 Ap J S ... 34. .405B The Astrophysical J...,[10.1086/190455],"[Abundance, Astronomical Models, Hydrogen Clou...",data/json/salvaged_articles.json,[197 7 Ap J S ... 34. .405B The Astrophysical ...,Models of interstellar clouds. I. The Zeta Oph...


In [18]:
# Set up embedder
# Pick the first available backend in order of preference: CUDA, then MPS, then CPU.
# torch.backends.mps.is_available() is the documented MPS availability check and
# also exists on PyTorch releases that do not provide torch.mps.is_available().
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
embedder = Embedder.create(EMBEDDER, device=device, normalize=True)

In [19]:
def chunk_text(text: str, splitter: TextSplitter) -> list[str]:
    """Split ``text`` with ``splitter`` and return the cleaned, non-empty chunks.

    Each chunk produced by ``splitter.chunks(text)`` has its NUL (``"\\x00"``)
    characters removed and is then stripped of surrounding whitespace; chunks
    that end up empty are dropped.

    Args:
        text: The document text to split.
        splitter: Object exposing ``chunks(text)`` that yields string chunks.

    Returns:
        The cleaned chunks, in the order the splitter produced them.
    """
    # Remove NULs *before* stripping and filtering: a chunk consisting only of
    # NULs (or of NULs surrounded by whitespace) would otherwise pass the
    # emptiness check and come back as "" or with stray leading whitespace.
    cleaned = (chunk.replace("\x00", "").strip() for chunk in splitter.chunks(text))
    return [chunk for chunk in cleaned if chunk]


# Enable pandas' progress_apply via tqdm, then chunk every paper with one
# baseline splitter configuration (capacity range 200-400, overlap 50).
tqdm.pandas()
splitter = TextSplitter((200, 400), overlap=50, trim=True)
paper_chunks = research["paper"].progress_apply(lambda paper_text: chunk_text(paper_text, splitter))
research["chunks"] = paper_chunks

100%|██████████| 13/13 [00:00<00:00, 720.54it/s]


In [20]:
# Report sizes before and after turning the per-paper chunk lists into one row per chunk.
n_papers = len(research)
n_chunks = research["chunks"].str.len().sum()
print(f"The dataset is {n_papers} rows long, one for each paper.")
print(f"Total number of chunks: {n_chunks}")

# NOTE: this rebinds `research` to the exploded frame (one row per chunk).
research = research.explode("chunks")
n_rows_exploded = len(research)
print(f"After exploding, the dataset is {n_rows_exploded} rows long, one for each chunk.")

The dataset is 13 rows long, one for each paper.
Total number of chunks: 3122
After exploding, the dataset is 3122 rows long, one for each chunk.


In [10]:
# Row count of the current `research` frame.
research.shape[0]

3122

In [None]:
# Grid of splitter settings to sweep:
#   min_lengths - lower bound of the chunk capacity range
#   increments  - amount added to the lower bound to get the upper bound
#   overlaps    - requested overlap between consecutive chunks
min_lengths = np.arange(50, 1001, 50)
increments = np.arange(50, 1001, 50)
overlaps = np.arange(0, 201, 25)
# One slot per configuration. NOTE(review): the axes are ordered
# (overlap, min_length, increment), which differs from the loop order below
# (min_len, increment, overlap) - keep that in mind when indexing into `results`.
results = np.zeros((len(overlaps), len(min_lengths), len(increments)))
for min_len, increment, overlap in itertools.product(min_lengths, increments, overlaps):
    # Clamp so the overlap is always strictly less than min_len.
    # NOTE(review): for small min_len several requested overlaps collapse to
    # the same clamped value, so some configurations are evaluated repeatedly.
    overlap = min(min_len - 1, overlap)
    print(f"Processing min_length={min_len}, increment={increment}, overlap={overlap}...")
    splitter = TextSplitter((min_len, min_len + increment), overlap=overlap, trim=True)
    tqdm.pandas(desc=f"Chunking papers (min_len={min_len}, max_len={min_len + increment}, overlap={overlap})")
    # Use chunk_text, the chunking helper defined in this notebook; the previous
    # call to `chunk_paper` referenced a name that is never defined.
    # NOTE(review): `research` was exploded to one row per chunk in an earlier
    # cell, so this re-chunks each paper once per existing chunk row - confirm
    # that is intended, or chunk a per-paper frame instead.
    research["chunks"] = research["paper"].progress_apply(lambda x: chunk_text(x, splitter))