# Simple RAG for Scientific Papers

In [4]:
# import tensorflow as tf

# print("TF:", tf.__version__)
# print("Devices:", tf.config.list_physical_devices())
# print("GPUs:", tf.config.list_physical_devices("GPU"))


In [5]:
#uv pip install chromadb sentence-transformers

In [6]:
# --- 1. Load scientific papers from JSON ---
import json
import os

papers_dir = "papers_json_3"

corpus = []
files = sorted([f for f in os.listdir(papers_dir) if f.endswith('.json')])[:200]

for filename in files:
    with open(os.path.join(papers_dir, filename), 'r', encoding='utf-8') as f:
        paper = json.load(f)
    corpus.append({
        "article_id": paper.get("article_id", filename.replace(".json", "")),
        "text": paper.get("abstract", "") + "\n\n" + paper.get("article", "")
    })

print(f"Loaded {len(corpus)} papers")

Loaded 200 papers


In [7]:
# --- 2. Overlapping chunking (window 150, overlap 50, only if > 100 tokens) ---
'''
chunk size in this case refers to number of words (tokens)
'''
def chunk_text(text, chunk_size=150, overlap=50):
    words = text.split()
    if len(words) <= 100:
        return [text] if words else []
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks

# Build chunks with metadata
chunk_texts, metadatas, ids = [], [], []

for paper in corpus:
    chunks = chunk_text(paper["text"])
    for idx, ch in enumerate(chunks):
        chunk_texts.append(ch)
        metadatas.append({"article_id": paper["article_id"], "chunk_idx": idx})
        ids.append(f'{paper["article_id"]}_chunk_{idx}')

print(f"Total chunks: {len(chunk_texts)}")

Total chunks: 13141


In [8]:
# --- 3. Initialize ChromaDB ---
import chromadb

client = chromadb.PersistentClient(path="scientific_rag_db")
collection = client.get_or_create_collection("scientific_papers")

In [9]:
# --- 4. Embed and index chunks ---
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Using smaller, faster SBERT model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

if collection.count() == 0:
    print(f"Embedding and indexing {len(chunk_texts)} chunks...")
    
    # Embed and add in batches (ChromaDB has max batch size ~5000)
    batch_size = 500
    for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Indexing"):
        batch_texts = chunk_texts[i:i + batch_size]
        batch_metas = metadatas[i:i + batch_size]
        batch_ids = ids[i:i + batch_size]
        
        batch_embs = embedder.encode(batch_texts, show_progress_bar=False).tolist()
        
        collection.add(
            documents=batch_texts,
            embeddings=batch_embs,
            metadatas=batch_metas,
            ids=batch_ids
        )
    
    print(f"Indexed {collection.count()} chunks")
else:
    print(f"Collection already has {collection.count()} chunks")

  from .autonotebook import tqdm as notebook_tqdm


Collection already has 13141 chunks


In [10]:
# --- 5. Retrieval function ---
def retrieve(query, k=3):
    q_emb = embedder.encode([query]).tolist()[0]
    results = collection.query(query_embeddings=[q_emb], n_results=k)
    return results["documents"][0], results["metadatas"][0]

# Test retrieval
docs, metas = retrieve("random walk on networks")
for i, (doc, meta) in enumerate(zip(docs, metas)):
    print(f"[{i+1}] {meta['article_id']} (chunk {meta['chunk_idx']})")
    print(f"    {doc[:150]}...\n")

[1] article_1 (chunk 6)
    occupation probability , and coverage , have tight relationships with the structure of the graph upon which the walk takes place @xcite . for this rea...

[2] article_1 (chunk 9)
    walkers on the topological properties of the nodes at each layer in order to perform an efficient exploration of such systems . we notice that random ...

[3] article_1 (chunk 8)
    communities @xcite , and provide optimal exploration of a network using only local information @xcite . it has also been found that the dynamics of de...



In [11]:
# --- 6. Load LLM ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use GPU if available, otherwise CPU
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
    
print(f"Using device: {device}")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype="auto"
)
print(f"Model loaded: {model_name}")

`torch_dtype` is deprecated! Use `dtype` instead!


Using device: mps
Model loaded: Qwen/Qwen2.5-1.5B-Instruct


In [12]:
# --- 7. RAG answer generation ---
def rag_answer(query):
    docs, metas = retrieve(query)
    
    # Build context with source references
    context_parts = []
    for i, (doc, meta) in enumerate(zip(docs, metas), 1):
        context_parts.append(f"[{i}] Source: {meta['article_id']}\n{doc}")
    context = "\n\n".join(context_parts)

    prompt = f"""Use the following context to answer the question. Cite sources using [1], [2], etc.

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""
    tokens = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**tokens, max_new_tokens=200, do_sample=False)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract just the answer part
    if "ANSWER:" in answer:
        answer = answer.split("ANSWER:")[-1].strip()
    
    # Return answer and sources
    sources = [meta['article_id'] for meta in metas]
    return answer, sources

In [None]:
# --- 8. Test queries ---
test_queries = [
"What is a random walk in the context of a network?",
"What is meant by a multiplex (multi-layer) network?",
"What does the term stationary probability distribution refer to?",
"What are scalar perturbations in cosmology?",
"What physical system do the airline transportation networks represent in the corpus?",
"How does a biased random walk differ from an unbiased random walk?",
"What is the role of entropy rate in characterizing a random walk?",
"Why are multi-layer networks considered more realistic than single-layer networks?",
"What does gauge invariance ensure in cosmological perturbation theory?",
"How does edge overlap affect diffusion on multiplex networks?",
"How are extensive and intensive bias functions defined, and why do they differ in parameter scaling?",
"In what way does the overlapping adjacency matrix differ from a simple aggregated network representation?",
"How does inter-layer degree correlation influence the dispersiveness of biased random walks?",
"Why does the extended electromagnetic vector field introduce additional scalar modes in cosmology?",
"How do real-world multiplex airline networks demonstrate a trade-off between diffusion efficiency and robustness?"
]

for q in test_queries:
    print(f"QUESTION: {q}")
    answer, sources = rag_answer(q)
    print(f"\nANSWER: {answer.split('ANSWER:')[-1].strip()}")
    print(f"\nSOURCES: {', '.join(sources)}")
    print("=" * 60)

QUESTION:  What is a random walk in the context of a network? 


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


ANSWER: A random walk in the context of a network involves exploring the network using only local information, where the movement of a walker depends on the topological properties of the nodes it encounters. The steady-state properties of a random walk, such as characteristic times, limiting occupation probabilities, and coverage, are closely tied to the structure of the graph on which the walk occurs. This allows researchers to use random walks as tools to study various aspects of network dynamics and properties. [1][2]
SOURCES: article_1, article_1, article_1
QUESTION:  What is meant by a multiplex (multi-layer) network? 
ANSWER: A multiplex (multi-layer) network, according to the provided text, refers to a system where "elementary units" can interact through various types of connections. This concept is particularly relevant for modeling systems like social networks, where individuals may have multiple types of relationships such as kinship, friendship, collaboration, competition, o