# Simple RAG for Scientific Papers

In [None]:
#uv pip install chromadb sentence-transformers

Note: you may need to restart the kernel to use updated packages.


[2mUsing Python 3.12.5 environment at: C:\Users\balog\AppData\Local\Programs\Python\Python312[0m
  [31m×[0m Failed to read `websocket-client==1.8.0`
[31m  ├─▶ [0mFailed to read metadata from installed package `websocket-client==1.8.0`
[31m  ╰─▶ [0mfailed to open file
[31m      [0m`C:\Users\balog\AppData\Local\Programs\Python\Python312\Lib\site-packages\websocket_client-1.8.0.dist-info\METADATA`:
[31m      [0mThe system cannot find the file specified. (os error 2)
[36m  help: [0m`[36mwebsocket-client[39m` ([36mv1.8.0[39m) was included because `[36mchromadb[39m`
        ([36mv1.4.1[39m) depends on `[36mkubernetes[39m[36m>=28.1.0[39m` ([36mv35.0.0[39m) which depends
        on `[36mwebsocket-client[39m[36m>=0.32.0, <0.40.0 | >0.40.0, <0.41.dev0 |
        >=0.43.dev0[39m`


In [1]:
# --- 1. Load scientific papers from JSON ---
import json
import os

papers_dir = "../papers_json_3/papers_json_3"

corpus = []
files = sorted([f for f in os.listdir(papers_dir) if f.endswith('.json')])[:200]

for filename in files:
    with open(os.path.join(papers_dir, filename), 'r', encoding='utf-8') as f:
        paper = json.load(f)
    corpus.append({
        "article_id": paper.get("article_id", filename.replace(".json", "")),
        "text": paper.get("abstract", "") + "\n\n" + paper.get("article", "")
    })

print(f"Loaded {len(corpus)} papers")

Loaded 200 papers


In [None]:
# --- 2. Overlapping chunking (window 150, overlap 50, only if > 100 tokens) ---

def chunk_text(text, chunk_size=150, overlap=50):
    words = text.split()
    if len(words) <= 100:
        return [text] if words else []
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks

# Build chunks with metadata
chunk_texts, metadatas, ids = [], [], []

for paper in corpus:
    chunks = chunk_text(paper["text"])
    for idx, ch in enumerate(chunks):
        chunk_texts.append(ch)
        metadatas.append({"article_id": paper["article_id"], "chunk_idx": idx})
        ids.append(f'{paper["article_id"]}_chunk_{idx}')

print(f"Total chunks: {len(chunk_texts)}")

Total chunks: 13141


In [3]:
# --- 3. Initialize ChromaDB ---
import chromadb

client = chromadb.PersistentClient(path="scientific_rag_db")
collection = client.get_or_create_collection("scientific_papers")

In [8]:
# --- 4. Embed and index chunks ---
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Using smaller, faster SBERT model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

if collection.count() == 0:
    print(f"Embedding and indexing {len(chunk_texts)} chunks...")
    
    # Embed and add in batches (ChromaDB has max batch size ~5000)
    batch_size = 500
    for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Indexing"):
        batch_texts = chunk_texts[i:i + batch_size]
        batch_metas = metadatas[i:i + batch_size]
        batch_ids = ids[i:i + batch_size]
        
        batch_embs = embedder.encode(batch_texts, show_progress_bar=False).tolist()
        
        collection.add(
            documents=batch_texts,
            embeddings=batch_embs,
            metadatas=batch_metas,
            ids=batch_ids
        )
    
    print(f"Indexed {collection.count()} chunks")
else:
    print(f"Collection already has {collection.count()} chunks")

Embedding and indexing 13141 chunks...


Indexing: 100%|██████████| 27/27 [16:32<00:00, 36.76s/it]

Indexed 13141 chunks





In [None]:
# --- RELOAD: Use this cell to reload everything without re-indexing if default saving is not working---
# Run this cell alone next time instead of cells 3-6

import chromadb
from sentence_transformers import SentenceTransformer

# Reload ChromaDB
client = chromadb.PersistentClient(path="scientific_rag_db")
collection = client.get_collection("scientific_papers")
print(f"Loaded {collection.count()} chunks from disk")

# Reload embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedder loaded")

In [9]:
# --- 5. Retrieval function ---
def retrieve(query, k=3):
    q_emb = embedder.encode([query]).tolist()[0]
    results = collection.query(query_embeddings=[q_emb], n_results=k)
    return results["documents"][0], results["metadatas"][0]

# Test retrieval
docs, metas = retrieve("random walk on networks")
for i, (doc, meta) in enumerate(zip(docs, metas)):
    print(f"[{i+1}] {meta['article_id']} (chunk {meta['chunk_idx']})")
    print(f"    {doc[:150]}...\n")

[1] article_1 (chunk 6)
    occupation probability , and coverage , have tight relationships with the structure of the graph upon which the walk takes place @xcite . for this rea...

[2] article_1 (chunk 9)
    walkers on the topological properties of the nodes at each layer in order to perform an efficient exploration of such systems . we notice that random ...

[3] article_1 (chunk 8)
    communities @xcite , and provide optimal exploration of a network using only local information @xcite . it has also been found that the dynamics of de...



In [None]:
# --- 6. Load LLM ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
else:
    # CPU: load without device_map
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = model.to(device)

print(f"Model loaded: {model_name}")

Using device: cpu


`torch_dtype` is deprecated! Use `dtype` instead!


ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [None]:
# --- 7. RAG answer generation ---
def rag_answer(query):
    docs, metas = retrieve(query)
    
    # Build context with source references
    context_parts = []
    for i, (doc, meta) in enumerate(zip(docs, metas), 1):
        context_parts.append(f"[{i}] Source: {meta['article_id']}\n{doc}")
    context = "\n\n".join(context_parts)

    prompt = f"""Use the following context to answer the question. Cite sources using [1], [2], etc.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""
    tokens = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**tokens, max_new_tokens=200, do_sample=False)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract just the answer part
    if "ANSWER:" in answer:
        answer = answer.split("ANSWER:")[-1].strip()
    
    # Return answer and sources
    sources = [meta['article_id'] for meta in metas]
    return answer, sources

: 

: 

: 

In [None]:
# --- 8. Test queries ---
test_queries = [
    "What is a random walk on a network?",
    "How do biased random walks work?",
    "What is entropy rate?"
]

for q in test_queries:
    print(f"QUESTION: {q}")
    answer, sources = rag_answer(q)
    print(f"ANSWER: {answer.split('ANSWER:')[-1].strip()}")
    print(f"SOURCES: {', '.join(sources)}")
    print("=" * 60)

: 

: 

: 