In [1]:
import pandas as pd
import json

# Read the research corpus from a JSON Lines file (one JSON record per line)
# and report how many records were loaded.
research_df = pd.read_json(
    'data/research_used.jsonl',
    lines=True,
)
print(f"Research len: {research_df.shape[0]}")




Research len: 10027


In [2]:
# Build the splitter that later cells use to cut each paper into chunks
from semantic_text_splitter import TextSplitter

# Chunking parameters, named so they are easy to find and tune.
# NOTE(review): units are presumably characters — confirm against the
# semantic_text_splitter documentation.
CHUNK_CAPACITY = 1500
CHUNK_OVERLAP = 150

splitter = TextSplitter(capacity=CHUNK_CAPACITY, overlap=CHUNK_OVERLAP)

In [7]:
def reconstruct_paper(example: pd.Series) -> str:
    """Assemble the full text of one paper from its parts.

    Args:
        example: A record exposing ``title``, ``abstract`` and ``body`` as
            attributes. Only attribute access is used, so any object with
            those three attributes works.

    Returns:
        The title, then the abstract prefixed with ``"Abstract: "``, then the
        body, with a blank line between consecutive sections.
    """
    sections = (
        f"{example.title}",
        f"Abstract: {example.abstract}",
        f"{example.body}",
    )
    return "\n\n".join(sections)

In [11]:
from tqdm import tqdm

# Split every paper into chunks and write one JSON line per chunk.
#
# The output file is opened once for the whole run rather than once per paper.
# NOTE(review): append mode is kept from the original, so re-running this cell
# adds duplicate lines to the file — delete the file first for a clean rebuild.
with open('data/research_chunks.jsonl', 'a') as file:
    # Bind each record to `row`. The loop variable used to be named `doi` while
    # the body read `row.*`, which fails on a fresh kernel and silently reuses
    # a stale `row` (the same paper every iteration) on a dirty one.
    for row in tqdm(research_df.itertuples(), total=len(research_df), desc="processing"):
        doi = row.doi
        # Convert pubdate from 'YYYY-MM-DD' to int YYYYMMDD
        pubdate = int(row.pubdate.replace("-", ""))
        paper = reconstruct_paper(row)

        for raw_chunk in splitter.chunks(paper):
            # Remove null chars BEFORE stripping, so a chunk made only of nulls
            # and whitespace is recognised as empty and whitespace next to a
            # removed null is trimmed as well.
            chunk = raw_chunk.replace("\x00", "").strip()
            if not chunk:
                continue  # Skip empty chunks
            file.write(json.dumps({"text": chunk, "doi": doi, "pubdate": pubdate}) + "\n")


processing: 10027it [00:02, 4761.84it/s]


In [6]:
# Sanity check: the DOIs stored in the `chunks` table must be exactly the DOIs
# in `contribution_dois`.
# NOTE(review): `db` and `contribution_dois` are not defined in the visible
# cells — they must already exist in the kernel before this cell runs.
results = db.query("SELECT DISTINCT doi FROM chunks")
chunk_dois = set()
for record in results:
    chunk_dois.add(record[0])  # first column of each result row is the DOI

dois_match = chunk_dois == contribution_dois
print(f"Number of unique DOIs in chunks after hydration: {len(chunk_dois)}")
print(f"Number of unique DOIs in contributions: {len(contribution_dois)}")
print(f"Same set of DOIS: {dois_match}")
assert dois_match, "DOIs in chunks do not match those in contributions"

Number of unique DOIs in chunks after hydration: 10027
Number of unique DOIs in contributions: 10027
Same set of DOIS: True


In [7]:
examples = pd.read_json('data/dataset/nontrivial_llm.jsonl', lines=True)
print(f"Examples len: {len(examples)}")


def _cites_only_known_dois(cited_dois) -> bool:
    """Return True when every DOI in ``cited_dois`` is in ``contribution_dois``."""
    for doi in cited_dois:
        if doi not in contribution_dois:
            return False
    return True


# Keep only the examples whose citations all point at known contributions
keep_mask = examples['citation_dois'].apply(_cites_only_known_dois)
examples = examples[keep_mask]
print(f"Examples after filtering: {len(examples)}")

Examples len: 15133
Examples after filtering: 14739
