# Simple RAG for Scientific Papers

This notebook implements a Simple (Naive) RAG Model for answering questions based on scientific literature.

**Task:** Retrieval-Augmented Generation  
**Dataset:** 200 Scientific Papers (JSON format)  

**Features:**
- ChromaDB for vector storage
- Overlapping chunking (window of 150 whitespace-separated words, overlap of 50 words)
- Sentence Transformers for embeddings
- Source document references for transparency

In [None]:
# --- 1. Install dependencies ---
!pip install chromadb sentence-transformers transformers torch

In [None]:
# --- 2. Load scientific papers from JSON files ---
import json
import os
from tqdm import tqdm

# Directory that holds the paper JSON files (one file per paper); the relative
# path is resolved against the notebook's current working directory.
papers_dir = "../papers_json_3/papers_json_3"

def load_papers(papers_dir, max_papers=200):
    """Load scientific papers from the JSON files in a directory.

    Files are processed in sorted filename order and at most ``max_papers``
    files are read. A file that cannot be read, is not valid JSON, or whose
    top-level value is not a JSON object is reported and skipped.

    Args:
        papers_dir: Directory containing the ``*.json`` paper files.
        max_papers: Maximum number of files to read.

    Returns:
        A list of dicts with the keys ``article_id``, ``abstract``,
        ``article``, ``section_names`` and ``filename``. Missing or ``null``
        fields fall back to the filename stem, ``""`` or ``[]``.

    Raises:
        OSError: If ``papers_dir`` itself cannot be listed.
    """
    suffix = ".json"
    corpus = []
    files = sorted(f for f in os.listdir(papers_dir) if f.endswith(suffix))[:max_papers]

    for filename in tqdm(files, desc="Loading papers"):
        filepath = os.path.join(papers_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                paper = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {filename}: {e}")
            continue

        if not isinstance(paper, dict):
            print(f"Error loading {filename}: expected a JSON object, got {type(paper).__name__}")
            continue

        # `or` also replaces explicit nulls, which `dict.get(key, default)`
        # would pass through as None and break later string concatenation.
        corpus.append({
            "article_id": str(paper.get("article_id") or filename[:-len(suffix)]),
            "abstract": paper.get("abstract") or "",
            "article": paper.get("article") or "",
            "section_names": paper.get("section_names") or [],
            "filename": filename,
        })

    return corpus

papers_corpus = load_papers(papers_dir)
print(f"Loaded {len(papers_corpus)} papers")
# Guard the sample lookup: a directory without loadable JSON files yields an
# empty corpus, and indexing element 0 would raise IndexError.
if papers_corpus:
    print(f"Sample paper: {papers_corpus[0]['article_id']}")
else:
    print(f"No papers were loaded from {papers_dir}")

In [None]:
# --- 3. Overlapping chunking function ---
# Apply overlapping chunking: window size 150 tokens, overlap 50 tokens
# Only chunk text segments longer than 100 tokens

def chunk_text(text, chunk_size=150, overlap=50, min_length=100):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Window size in words.
        overlap: Number of words shared by two consecutive windows.
        min_length: Texts with at most this many words are not chunked.

    Returns:
        A list of chunk strings. Whitespace-only text gives ``[]``; text with
        at most ``min_length`` words is returned unchanged as a single chunk.
        Otherwise each chunk is its words re-joined with single spaces, and
        the last chunk is the first window that reaches the end of the text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)``; such values would stall or skip words.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()

    # If text is shorter than min_length, return as single chunk
    if len(words) <= min_length:
        return [text] if words else []

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the end: any further window would only
        # repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks

def build_chunks(corpus, chunk_size=150, overlap=50):
    """Build chunks from the paper corpus with metadata.

    Each paper's abstract and article are joined with a blank line and split
    with ``chunk_text``.

    Args:
        corpus: Iterable of paper dicts with the keys ``article_id``,
            ``abstract``, ``article`` and ``filename``.
        chunk_size: Window size in words, forwarded to ``chunk_text``.
        overlap: Overlap in words, forwarded to ``chunk_text``.

    Returns:
        A tuple ``(chunk_texts, metadatas, ids)`` of three parallel lists.
        Ids have the form ``"<article_id>_chunk_<idx>"``, so they are unique
        only if ``article_id`` values are unique across the corpus.
    """
    chunk_texts = []
    metadatas = []
    ids = []

    for paper in tqdm(corpus, desc="Chunking papers"):
        article_id = paper["article_id"]

        # Combine abstract and article for chunking; `or ""` tolerates records
        # whose abstract or article is None.
        full_text = (paper["abstract"] or "") + "\n\n" + (paper["article"] or "")
        chunks = chunk_text(full_text, chunk_size=chunk_size, overlap=overlap)

        for idx, chunk in enumerate(chunks):
            chunk_id = f"{article_id}_chunk_{idx}"
            chunk_texts.append(chunk)
            metadatas.append({
                "article_id": article_id,
                "chunk_idx": idx,
                "source": chunk_id,
                "filename": paper["filename"],
            })
            ids.append(chunk_id)

    return chunk_texts, metadatas, ids

chunk_texts, metadatas, ids = build_chunks(papers_corpus)
print(f"Total chunks created: {len(chunk_texts)}")
# Guard the sample lookup: an empty corpus produces no chunks, and indexing
# element 0 would raise IndexError.
if metadatas:
    print(f"Sample chunk metadata: {metadatas[0]}")
else:
    print("No chunks were created")

In [None]:
# --- 4. Initialize embedding model ---
from sentence_transformers import SentenceTransformer

# Sentence-embedding model (all-MiniLM-L6-v2, same as the DSK821 example).
# The same instance embeds the chunks at indexing time and the queries at
# retrieval time.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedding model loaded: all-MiniLM-L6-v2")

In [None]:
# --- 5. Initialize ChromaDB and create collection ---
import chromadb

# Directory handed to the persistent ChromaDB client
persist_dir = "scientific_papers_rag_db"
client = chromadb.PersistentClient(path=persist_dir)

# Reuse the collection if it already exists, otherwise create it
collection = client.get_or_create_collection(name="scientific_papers_rag")
print(f"ChromaDB collection created/loaded: {collection.name}")

In [None]:
# --- 6. Embed chunks and add to ChromaDB ---
# Index when the collection is empty or holds fewer chunks than were built
# (for example because an earlier run was interrupted). upsert() is keyed by
# chunk id, so re-sending chunks that are already stored does not duplicate them.

indexed_count = collection.count()
if indexed_count == 0 or indexed_count < len(chunk_texts):
    print("Embedding and indexing chunks...")

    # Embed in batches to avoid memory issues
    batch_size = 100
    for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Indexing"):
        batch_texts = chunk_texts[i:i + batch_size]
        batch_metas = metadatas[i:i + batch_size]
        batch_ids = ids[i:i + batch_size]

        # Compute embeddings
        batch_embeddings = embedder.encode(batch_texts, show_progress_bar=False).tolist()

        # Insert new chunks / overwrite existing ones with the same id
        collection.upsert(
            documents=batch_texts,
            embeddings=batch_embeddings,
            metadatas=batch_metas,
            ids=batch_ids
        )

    print(f"Indexed {collection.count()} chunks")
else:
    print(f"Collection already has {indexed_count} chunks")

In [None]:
# --- 7. Simple retrieval function ---
def retrieve(query, k=3):
    """
    Fetch the k chunks that ChromaDB returns for the embedded query.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level ``collection``. The result is a dict with the keys
    "documents", "metadatas" and "ids", each holding the list for this query.
    """
    query_vector = embedder.encode([query])[0].tolist()
    hits = collection.query(query_embeddings=[query_vector], n_results=k)

    # Exactly one query was sent, so each field's first entry is the one we want.
    return {field: hits[field][0] for field in ("documents", "metadatas", "ids")}

# Smoke-test the retriever and preview the first two hits
test_results = retrieve("random walk on networks")
print("Test query: 'random walk on networks'")
print(f"Retrieved {len(test_results['documents'])} chunks")
preview = zip(test_results['documents'][:2], test_results['metadatas'][:2])
for rank, (doc, meta) in enumerate(preview, start=1):
    print(f"\n[{rank}] Source: {meta['article_id']} (chunk {meta['chunk_idx']})")
    print(f"Text preview: {doc[:200]}...")

In [None]:
# --- 8. Load LLM (Phi-3 Mini for generation) ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "microsoft/Phi-3-mini-4k-instruct"
print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer the GPU when CUDA is available; fall back to the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# NOTE: passing device_map makes transformers rely on the `accelerate`
# package, so it has to be installed alongside transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=device,
)
print("Model loaded successfully!")

In [None]:
# --- 9. Build context with citations ---
def build_context_with_citations(retrieved_docs, limit_chars=3500, max_chunk_chars=900):
    """Build the context string for the prompt, with numbered source labels.

    Each chunk becomes a block ``"[i] Source: <article_id> (chunk <idx>)"``
    followed by the chunk text. Blocks are added in retrieval order until the
    next one would push the result past ``limit_chars``.

    Args:
        retrieved_docs: Dict with parallel lists under ``"documents"`` and
            ``"metadatas"``; each metadata dict needs ``article_id`` and
            ``chunk_idx``.
        limit_chars: Maximum length of the returned string, including the
            newline separators between blocks.
        max_chunk_chars: Maximum number of characters kept from each chunk.

    Returns:
        The blocks joined by a newline; ``""`` if no block fits.
    """
    separator = "\n"
    context = []
    total = 0

    pairs = zip(retrieved_docs["documents"], retrieved_docs["metadatas"])
    for i, (doc, meta) in enumerate(pairs, 1):
        header = f"[{i}] Source: {meta['article_id']} (chunk {meta['chunk_idx']})"
        block = f"{header}\n{doc[:max_chunk_chars]}\n"

        # Count the separator that join() inserts before every block but the
        # first, so the returned string never exceeds limit_chars.
        added = len(block) + (len(separator) if context else 0)
        if total + added > limit_chars:
            break

        context.append(block)
        total += added

    return separator.join(context)

In [None]:
# --- 10. Simple RAG answer generation ---
def rag_answer(query, k=3):
    """
    Generate an answer using RAG.

    Steps: retrieve the top-k chunks with ``retrieve``, build a cited context
    with ``build_context_with_citations``, and run the module-level ``model``
    and ``tokenizer`` on the resulting prompt with greedy decoding.

    Args:
        query: The question to answer.
        k: Number of chunks to retrieve.

    Returns:
        Dict with "answer" (the generated text without the prompt),
        "sources" (article ids of all retrieved chunks) and "retrieved_docs"
        (the raw output of ``retrieve``).
    """
    # Retrieve relevant chunks
    retrieved = retrieve(query, k=k)
    context = build_context_with_citations(retrieved)

    # Build prompt
    prompt = f"""You are a helpful assistant answering questions based ONLY on the following scientific paper excerpts.
Use citations like [1], [2] referring to the sources provided.
If the context doesn't contain enough information to answer, say so.

==================== SOURCES ====================
{context}
=================================================

Question: {query}

Answer with citations:
"""

    # Generate answer
    tokens = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**tokens, max_new_tokens=250, do_sample=False)

    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them instead of splitting the decoded text on a marker string,
    # which breaks if the marker appears again in the context or the output.
    prompt_length = tokens["input_ids"].shape[1]
    answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    # Sources cover every retrieved chunk, including any that the context
    # builder dropped for exceeding its character limit.
    sources = [meta['article_id'] for meta in retrieved['metadatas']]

    return {
        "answer": answer,
        "sources": sources,
        "retrieved_docs": retrieved
    }

In [None]:
# --- 11. Test the Simple RAG system ---
print("Testing Simple RAG System")
print("=" * 80)

# Three sample questions used as an end-to-end smoke test
test_queries = [
    "What is a random walk on a network?",
    "How do biased random walks work?",
    "What is entropy rate in the context of random walks?",
]

for query in test_queries:
    # The question is printed before generation starts, the answer after
    print(f"\nQUESTION: {query}", "-" * 40, sep="\n")
    result = rag_answer(query)
    print(
        f"ANSWER: {result['answer']}",
        f"SOURCES: {', '.join(result['sources'])}",
        "=" * 80,
        sep="\n",
    )

## 15 Evaluation Questions

Below are 15 questions for evaluating the RAG system:
- **5 Simple Questions** (basic concepts)
- **5 Medium Questions** (require understanding of relationships)
- **5 Domain-Specific Detailed Questions** (technical depth)

In [None]:
# --- 12. Define 15 Evaluation Questions ---

# Evaluation set grouped by difficulty tier. Each tier maps to a list of dicts
# with a "question" and an "expected_answer" (the reference answer that
# run_evaluation prints next to the RAG answer and stores in its results).
evaluation_questions = {
    # Tier 1: basic concepts
    "simple": [
        {
            "question": "What is a random walk on a graph?",
            "expected_answer": "A random walk is a stochastic process where a walker moves between nodes of a graph by randomly selecting one of the edges connected to the current node."
        },
        {
            "question": "What is a multiplex network?",
            "expected_answer": "A multiplex network is a multi-layer network where nodes can be connected through different types of relationships organized in distinct and interacting layers."
        },
        {
            "question": "What is the overlapping adjacency matrix?",
            "expected_answer": "The overlapping adjacency matrix accounts for the total number of connections between two nodes across all layers of a multiplex network."
        },
        {
            "question": "What is an embedding in machine learning?",
            "expected_answer": "An embedding is a dense vector representation of data (like words, nodes, or documents) in a continuous vector space that captures semantic similarities."
        },
        {
            "question": "What is ChromaDB used for?",
            "expected_answer": "ChromaDB is a vector database used for storing embeddings and performing similarity search for retrieval-augmented generation systems."
        }
    ],
    # Tier 2: questions about relationships between concepts
    "medium": [
        {
            "question": "How does the structure of a network affect random walk dynamics?",
            "expected_answer": "Network structure affects random walks through degree distributions, correlations between nodes, and the presence of hubs. Heterogeneous degree distributions and degree-degree correlations impact the stationary probability distribution and mixing properties."
        },
        {
            "question": "What is the relationship between entropy rate and walk dispersiveness?",
            "expected_answer": "Entropy rate measures the mixedness or dispersiveness of a walk. Higher entropy rate means the walk can explore remote regions of a graph within fewer steps, with all trajectories being more equiprobable."
        },
        {
            "question": "How do inter-layer degree correlations affect multiplex random walks?",
            "expected_answer": "Inter-layer degree correlations determine whether hubs at one layer are also hubs at other layers. Positive correlations (similar degree sequences across layers) affect the achievable entropy rate and stationary probability distribution."
        },
        {
            "question": "What is the difference between extensive and intensive bias functions?",
            "expected_answer": "Extensive bias functions depend on node degrees at each layer (parameters scale with layer count), while intensive bias functions depend on intrinsically multiplex properties like overlapping degree and participation coefficient (fixed parameters regardless of layers)."
        },
        {
            "question": "How does edge overlap affect network exploration efficiency?",
            "expected_answer": "High edge overlap (redundant connections across layers) reduces walk dispersiveness and increases heterogeneity in visiting probability. Lower overlap allows more efficient exploration but may reduce robustness."
        }
    ],
    # Tier 3: domain-specific questions with technical depth (formulas, bounds)
    "detailed": [
        {
            "question": "What is the mathematical expression for the stationary probability distribution of biased random walks on multiplex networks?",
            "expected_answer": "The stationary probability is π_i* = (o_i * f(i)) / Σ_j(o_j * f(j)), where o_i is the overlapping degree and f(i) is the biasing function value for node i."
        },
        {
            "question": "How is the participation coefficient defined and what does it measure in multiplex networks?",
            "expected_answer": "The participation coefficient P_i = M/(M-1) * [1 - Σ_α(k_i^[α]/o_i)²] measures how homogeneously a node's edges are distributed across layers. P_i ≈ 1 for truly multiplex nodes with equal degree across layers, P_i ≈ 0 for focused nodes."
        },
        {
            "question": "What is the theoretical upper bound of entropy rate for random walks on multiplex networks?",
            "expected_answer": "The maximum entropy rate is log(λ₁), where λ₁ is the maximum eigenvalue of the overlapping adjacency matrix. This represents the case where all trajectories of the same length have equal probability."
        },
        {
            "question": "How do additive and multiplicative degree-biased walks differ in their transition probabilities?",
            "expected_answer": "Additive walks use transition probabilities based on Σ_α(k_j^[α])^b_α while multiplicative walks use Π_α(k_j^[α])^b_α, where k_j^[α] is the degree at layer α and b_α are bias exponents."
        },
        {
            "question": "What trade-off exists between dispersiveness and robustness in airline transportation multiplex networks?",
            "expected_answer": "Real-world airline networks show higher edge overlap than random models, sacrificing dispersiveness for robustness to link failures. This results in lower maximum entropy rate and more heterogeneous visiting probability compared to randomized networks."
        }
    ]
}

# Report how many questions each tier holds, then the overall total
print("Evaluation Questions Summary:")
for label, tier in (("Simple", "simple"), ("Medium", "medium"), ("Detailed", "detailed")):
    print(f"- {label} questions: {len(evaluation_questions[tier])}")
total_questions = sum(len(group) for group in evaluation_questions.values())
print(f"- Total: {total_questions} questions")

In [None]:
# --- 13. Run evaluation on all 15 questions ---
def run_evaluation(questions_dict, rag_func, k=3):
    """
    Run the RAG system on all evaluation questions.

    Args:
        questions_dict: Maps a difficulty label to a list of dicts with the
            keys "question" and "expected_answer".
        rag_func: Callable invoked as ``rag_func(question, k=k)``; it must
            return a dict with "answer" and "sources" (a list of strings).
        k: Number of chunks to retrieve per question, forwarded to rag_func.

    Returns:
        Dict mapping each difficulty label to a list of result dicts with the
        keys "question", "expected_answer", "rag_answer" and "sources".
        Progress is also printed for every question.
    """
    results = {}

    for difficulty, questions in questions_dict.items():
        results[difficulty] = []
        print(f"\n{'='*80}")
        print(f"  {difficulty.upper()} QUESTIONS")
        print(f"{'='*80}")

        for i, q in enumerate(questions, 1):
            print(f"\n[{difficulty.upper()} Q{i}] {q['question']}")
            print("-" * 60)

            # Get RAG answer
            result = rag_func(q['question'], k=k)

            # Add the ellipsis only when the expected answer is actually cut.
            expected = q['expected_answer']
            preview = expected if len(expected) <= 200 else f"{expected[:200]}..."

            print(f"RAG ANSWER: {result['answer']}")
            print(f"SOURCES: {', '.join(result['sources'])}")
            print(f"EXPECTED: {preview}")

            results[difficulty].append({
                "question": q['question'],
                "expected_answer": expected,
                "rag_answer": result['answer'],
                "sources": result['sources']
            })

    return results

# Run evaluation (uncomment to execute - may take time)
# evaluation_results = run_evaluation(evaluation_questions, rag_answer)

## Utility Functions

Helper functions for reloading the database and interactive querying.

In [None]:
# --- 14. Reload database (for subsequent runs) ---
# Use this cell to reload the existing database without re-indexing

def reload_database(persist_dir="scientific_papers_rag_db",
                    collection_name="scientific_papers_rag",
                    model_name="all-MiniLM-L6-v2"):
    """Reload the existing ChromaDB database.

    The defaults match the values used by the indexing cells of this notebook.

    Args:
        persist_dir: Directory of the persistent ChromaDB store.
        collection_name: Name of the collection to open. It is opened with
            ``get_collection``, i.e. it is not created here.
        model_name: Sentence-Transformers model used to embed queries.

    Returns:
        A tuple ``(client, collection, embedder)``. The caller has to rebind
        the notebook-level names, since the names inside this function are
        local.
    """
    # Imported locally so the cell also works in a fresh kernel
    import chromadb
    from sentence_transformers import SentenceTransformer

    client = chromadb.PersistentClient(path=persist_dir)
    collection = client.get_collection(collection_name)
    embedder = SentenceTransformer(model_name)

    print(f"Loaded collection with {collection.count()} chunks")
    return client, collection, embedder

# Uncomment to reload:
# client, collection, embedder = reload_database()

In [None]:
# --- 15. Interactive query function ---
def ask(question, k=3, show_sources=True):
    """
    Ask the RAG system a question and print the outcome.

    Args:
        question: The question to ask
        k: Number of chunks to retrieve
        show_sources: When true, list the source of every retrieved chunk

    Returns the result dict produced by rag_answer.
    """
    result = rag_answer(question, k=k)

    print(f"\nQuestion: {question}")
    print("-" * 60)
    print(f"Answer: {result['answer']}")

    if not show_sources:
        return result

    print("\nSources:")
    chunk_metas = result['retrieved_docs']['metadatas']
    for position, (source, meta) in enumerate(zip(result['sources'], chunk_metas), start=1):
        print(f"  [{position}] {source} (chunk {meta['chunk_idx']})")

    return result

# Example usage:
# ask("What are biased random walks?")
# ask("How does entropy rate relate to network exploration?")

In [None]:
# --- 16. Display corpus statistics ---
def display_corpus_stats():
    """Display statistics about the indexed corpus.

    Reads the module-level ``papers_corpus`` and ``collection`` and prints the
    paper count, the chunk count, the average number of chunks per paper and
    the fixed pipeline settings. The average is reported as "n/a" when no
    papers are loaded, to avoid a division by zero.
    """
    paper_count = len(papers_corpus)
    # Query the collection once and reuse the value for both lines below.
    chunk_count = collection.count()

    print("=" * 60)
    print("CORPUS STATISTICS")
    print("=" * 60)
    print(f"Total papers loaded: {paper_count}")
    print(f"Total chunks indexed: {chunk_count}")
    if paper_count:
        print(f"Average chunks per paper: {chunk_count / paper_count:.1f}")
    else:
        print("Average chunks per paper: n/a")
    print("Chunking parameters: window=150 tokens, overlap=50 tokens")
    print("Embedding model: all-MiniLM-L6-v2")
    print("LLM model: microsoft/Phi-3-mini-4k-instruct")
    print("=" * 60)

# Print the statistics for the corpus and collection loaded in this session
display_corpus_stats()