In [8]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
# Load the PubMed export from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="PubMed_resultsx.csv")

In [10]:
# Text splitter configuration.
# Separators are listed from coarsest to finest: paragraph break, line break,
# sentence end, single space, and finally the empty string.
# Sizes are measured in characters; 1200 characters is roughly ~250-350 tokens
# depending on the text, and consecutive chunks overlap by 200 characters.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_overlap=200,
    chunk_size=1200,
)

In [11]:
# Accumulator for the per-chunk records (one dict per chunk).
rows = list()

In [12]:
def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None/NaN/NA) to ""."""
    # pandas represents an empty CSV cell as float NaN, which is truthy, so a
    # plain `value or ""` lets it through and str() turns it into "nan".
    if value is None or pd.isna(value):
        return ""
    return str(value)


# Start from an empty list so that re-running this cell does not append a
# second copy of every chunk.
rows = []

# Split each article (title + abstract) into overlapping chunks and collect one
# record per chunk, carrying the article-level metadata along with it.
for _, r in df.iterrows():
    pmid_value = r["PMID"]
    # A numeric PMID can arrive as a float (e.g. when the column contains a
    # blank cell); convert integral floats back to int so the id is "123"
    # rather than "123.0".
    if isinstance(pmid_value, float) and pmid_value.is_integer():
        pmid_value = int(pmid_value)
    pmid = str(pmid_value)

    title = _as_text(r.get("Title", ""))
    abstract = _as_text(r.get("Abstract", ""))
    text = (title + "\n\n" + abstract).strip()

    # Nothing to chunk when both the title and the abstract are missing/empty.
    if not text:
        continue

    chunks = splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        rows.append({
            "chunk_id": f"{pmid}::c{i}",
            "pmid": pmid,
            "chunk_index": i,
            "text": chunk,
            "title": title,
            "journal": r.get("Journal", ""),
            "mesh_terms": r.get("Mesh Terms", ""),
            "url": r.get("URL", ""),
            "affiliations": r.get("Affiliations", "")
        })

In [13]:
# Turn the accumulated chunk records into a DataFrame (one row per chunk).
chunks_df = pd.DataFrame(data=rows)

In [14]:
# Write the chunk table to disk; the DataFrame index is not written.
chunks_df.to_csv(path_or_buf="PubMed_chunks.csv", index=False)