# Simple RAG for Scientific Papers

In [None]:
#uv pip install chromadb sentence-transformers

Note: you may need to restart the kernel to use updated packages.


[2mUsing Python 3.12.5 environment at: C:\Users\balog\AppData\Local\Programs\Python\Python312[0m
  [31m×[0m Failed to read `websocket-client==1.8.0`
[31m  ├─▶ [0mFailed to read metadata from installed package `websocket-client==1.8.0`
[31m  ╰─▶ [0mfailed to open file
[31m      [0m`C:\Users\balog\AppData\Local\Programs\Python\Python312\Lib\site-packages\websocket_client-1.8.0.dist-info\METADATA`:
[31m      [0mThe system cannot find the file specified. (os error 2)
[36m  help: [0m`[36mwebsocket-client[39m` ([36mv1.8.0[39m) was included because `[36mchromadb[39m`
        ([36mv1.4.1[39m) depends on `[36mkubernetes[39m[36m>=28.1.0[39m` ([36mv35.0.0[39m) which depends
        on `[36mwebsocket-client[39m[36m>=0.32.0, <0.40.0 | >0.40.0, <0.41.dev0 |
        >=0.43.dev0[39m`


In [1]:
# --- 1. Load scientific papers from JSON ---
import json
import os

papers_dir = "../papers_json_3/papers_json_3"

corpus = []
files = sorted([f for f in os.listdir(papers_dir) if f.endswith('.json')])[:200]

for filename in files:
    with open(os.path.join(papers_dir, filename), 'r', encoding='utf-8') as f:
        paper = json.load(f)
    corpus.append({
        "article_id": paper.get("article_id", filename.replace(".json", "")),
        "text": paper.get("abstract", "") + "\n\n" + paper.get("article", "")
    })

print(f"Loaded {len(corpus)} papers")

Loaded 200 papers


In [None]:
# --- 2. Overlapping chunking (window 150, overlap 50, only if > 100 tokens) ---

def chunk_text(text, chunk_size=150, overlap=50):
    words = text.split()
    if len(words) <= 100:
        return [text] if words else []
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks

# Build chunks with metadata
chunk_texts, metadatas, ids = [], [], []

for paper in corpus:
    chunks = chunk_text(paper["text"])
    for idx, ch in enumerate(chunks):
        chunk_texts.append(ch)
        metadatas.append({"article_id": paper["article_id"], "chunk_idx": idx})
        ids.append(f'{paper["article_id"]}_chunk_{idx}')

print(f"Total chunks: {len(chunk_texts)}")

Total chunks: 13141


In [3]:
# --- 3. Initialize ChromaDB ---
import chromadb

client = chromadb.PersistentClient(path="scientific_rag_db")
collection = client.get_or_create_collection("scientific_papers")

In [None]:
# --- 4. Embed and index chunks ---
from sentence_transformers import SentenceTransformer

# Using SBERT (Sentence-BERT) model
embedder = SentenceTransformer("all-mpnet-base-v2")

if collection.count() == 0:
    embs = embedder.encode(chunk_texts, batch_size=32).tolist()
    collection.add(
        documents=chunk_texts,
        embeddings=embs,
        metadatas=metadatas,
        ids=ids
    )
    print(f"Indexed {collection.count()} chunks")
else:
    print(f"Collection already has {collection.count()} chunks")




KeyboardInterrupt: 

In [None]:
# --- 5. Retrieval function ---
def retrieve(query, k=3):
    q_emb = embedder.encode([query]).tolist()[0]
    results = collection.query(query_embeddings=[q_emb], n_results=k)
    return results["documents"][0], results["metadatas"][0]

# Test retrieval
docs, metas = retrieve("random walk on networks")
for i, (doc, meta) in enumerate(zip(docs, metas)):
    print(f"[{i+1}] {meta['article_id']} (chunk {meta['chunk_idx']})")
    print(f"    {doc[:150]}...\n")

: 