# Simple RAG for Scientific Papers

In [11]:
# import tensorflow as tf

# print("TF:", tf.__version__)
# print("Devices:", tf.config.list_physical_devices())
# print("GPUs:", tf.config.list_physical_devices("GPU"))


In [12]:
#uv pip install chromadb sentence-transformers

In [13]:
# --- 1. Load scientific papers from JSON ---
import json
import os

papers_dir = "../papers_json_3"

corpus = []
files = sorted([f for f in os.listdir(papers_dir) if f.endswith('.json')])[:200]

for filename in files:
    with open(os.path.join(papers_dir, filename), 'r', encoding='utf-8') as f:
        paper = json.load(f)
    corpus.append({
        "article_id": paper.get("article_id", filename.replace(".json", "")),
        "text": paper.get("abstract", "") + "\n\n" + paper.get("article", "")
    })

print(f"Loaded {len(corpus)} papers")

Loaded 200 papers


In [14]:
# --- 2. Overlapping chunking (window 150, overlap 50, only if > 100 tokens) ---
'''
chunk size in this case refers to number of words (tokens)
'''
def chunk_text(text, chunk_size=150, overlap=50):
    words = text.split()
    if len(words) <= 100:
        return [text] if words else []
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks

# Build chunks with metadata
chunk_texts, metadatas, ids = [], [], []

for paper in corpus:
    chunks = chunk_text(paper["text"])
    for idx, ch in enumerate(chunks):
        chunk_texts.append(ch)
        metadatas.append({"article_id": paper["article_id"], "chunk_idx": idx})
        ids.append(f'{paper["article_id"]}_chunk_{idx}')

print(f"Total chunks: {len(chunk_texts)}")

Total chunks: 13141


In [15]:
# --- 3. Initialize ChromaDB ---
import chromadb

client = chromadb.PersistentClient(path="scientific_rag_db")
collection = client.get_or_create_collection("scientific_papers")

In [16]:
# --- 4. Embed and index chunks ---
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Using smaller, faster SBERT model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

if collection.count() == 0:
    print(f"Embedding and indexing {len(chunk_texts)} chunks...")
    
    # Embed and add in batches (ChromaDB has max batch size ~5000)
    batch_size = 500
    for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Indexing"):
        batch_texts = chunk_texts[i:i + batch_size]
        batch_metas = metadatas[i:i + batch_size]
        batch_ids = ids[i:i + batch_size]
        
        batch_embs = embedder.encode(batch_texts, show_progress_bar=False).tolist()
        
        collection.add(
            documents=batch_texts,
            embeddings=batch_embs,
            metadatas=batch_metas,
            ids=batch_ids
        )
    
    print(f"Indexed {collection.count()} chunks")
else:
    print(f"Collection already has {collection.count()} chunks")

Collection already has 13141 chunks


In [17]:
# --- 5. Retrieval function ---
def retrieve(query, k=3):
    q_emb = embedder.encode([query]).tolist()[0]
    results = collection.query(query_embeddings=[q_emb], n_results=k)
    return results["documents"][0], results["metadatas"][0]

# Test retrieval
docs, metas = retrieve("random walk on networks")
for i, (doc, meta) in enumerate(zip(docs, metas)):
    print(f"[{i+1}] {meta['article_id']} (chunk {meta['chunk_idx']})")
    print(f"    {doc[:150]}...\n")

[1] article_1 (chunk 6)
    occupation probability , and coverage , have tight relationships with the structure of the graph upon which the walk takes place @xcite . for this rea...

[2] article_1 (chunk 9)
    walkers on the topological properties of the nodes at each layer in order to perform an efficient exploration of such systems . we notice that random ...

[3] article_1 (chunk 8)
    communities @xcite , and provide optimal exploration of a network using only local information @xcite . it has also been found that the dynamics of de...



In [18]:
# --- 6. Load LLM ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use GPU if available, otherwise CPU
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
    
print(f"Using device: {device}")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype="auto"
)
print(f"Model loaded: {model_name}")

Using device: mps
Model loaded: Qwen/Qwen2.5-1.5B-Instruct


In [19]:
# --- 7. RAG answer generation ---
def rag_answer(query):
    docs, metas = retrieve(query)
    
    # Build context with source references
    context_parts = []
    for i, (doc, meta) in enumerate(zip(docs, metas), 1):
        context_parts.append(f"[{i}] Source: {meta['article_id']}\n{doc}")
    context = "\n\n".join(context_parts)

    prompt = f"""Use the following context to answer the question. Cite sources using [1], [2], etc.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""
    tokens = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**tokens, max_new_tokens=200, do_sample=False)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract just the answer part
    if "ANSWER:" in answer:
        answer = answer.split("ANSWER:")[-1].strip()
    
    # Return answer and sources
    sources = [meta['article_id'] for meta in metas]
    return answer, sources

In [20]:
# --- 8. Test queries ---
test_queries = [
"Where is the jet energy dissipated relative to the cool core in the system discussed?",
"What determines the actual sensitivity of the observational setup mentioned?",
"What are the two possible mechanisms by which photons may be produced in the described process?",
"How are sources and destinations paired in the network model?",
"What conclusion is reached about the consistency of the constraint algebra for quantum Bohmian trajectories?",
"On what observational sample size and survey are the presented galaxy results based?",
"Which theoretical framework motivates the idea that the universe contains many light scalar fields?",
"Why is the mentioned gamma-ray binary system considered particularly interesting?",
"What key assumption about turbulence underlies the basis of the present work on local equilibrium patches?",
"How does the mass-ratio distribution of stellar companions to hot Jupiter systems compare to that of field star binaries?",
"Which numerical model is employed for microswimmers, and how are the swimmers represented in this model?",
"Why is the magnetic moment considered a fundamental property of the nucleon?",
"Under what restriction does the algebra of constraints remain closed, avoiding inconsistency in quantum geometrodynamics and quantum field theory?",
"What do clusters of galaxies trace in the context of large-scale cosmic structure?",
"Why are radiative decays of heavy quarkonia useful for studying color-singlet two-gluon systems?"
]

for q in test_queries:
    print(f"QUESTION: {q}")
    answer, sources = rag_answer(q)
    print(f"\nANSWER: {answer.split('ANSWER:')[-1].strip()}")
    print(f"\nSOURCES: {', '.join(sources)}")
    print("=" * 60)

QUESTION: Where is the jet energy dissipated relative to the cool core in the system discussed?

ANSWER: The jet energy is dissipated outside the cool core, according to the information provided in the text.
Sources:
[1]
[2]
[3]

SOURCES: article_4, article_4, article_4
QUESTION: What determines the actual sensitivity of the observational setup mentioned?

ANSWER: The actual sensitivity of the observational setup, as determined by the source material provided, depends on several factors including the type of experiment being conducted, the specific instruments used, and the theoretical models employed to predict the observable signals. The text mentions that "a near detector is also considered in the analysis," suggesting that this additional component could provide valuable information about the system's performance. Furthermore, it notes that "the assumed true values of the oscillation parameters are @xmath129 in the analysis," indicating that these parameters play a crucial role in 