In [1]:
import json
from urllib.parse import quote

import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
class SimpleRAG:
    """Minimal retrieval-only RAG pipeline over Wikipedia page summaries.

    Documents are fetched from the Wikipedia REST summary endpoint, embedded
    with a SentenceTransformer model and ranked against a query by cosine
    similarity. No text generation happens here: ``answer_query`` only
    assembles the retrieved context.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the embedding model and start with an empty knowledge base.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # Parallel containers: embeddings[i] is the vector for documents[i].
        self.documents = []
        self.embeddings = []

    def add_wikipedia_docs(self, topics, timeout=10):
        """Fetch summaries for ``topics`` and (re)build all embeddings.

        Ingestion is best-effort: a topic that cannot be fetched is reported
        on stdout and skipped, it never aborts the remaining topics.

        Args:
            topics: Iterable of raw (not percent-encoded) Wikipedia page
                titles, e.g. ``"Machine_learning"``.
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a stalled connection blocks forever.
        """
        for topic in topics:
            try:
                # Percent-encode the title so characters such as '/', '?' or
                # '#' stay inside the single path segment the endpoint expects.
                encoded_title = quote(topic, safe='')
                url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{encoded_title}"
                response = requests.get(url, timeout=timeout)
                if response.status_code != 200:
                    # Surface the failure instead of silently dropping the topic.
                    print(f"Skipped {topic}: HTTP {response.status_code}")
                    continue

                data = response.json()
                content = data.get('extract', '')
                if not content:
                    print(f"Skipped {topic}: no summary text returned")
                    continue

                title = data.get('title', topic)
                self.documents.append({
                    'title': title,
                    'content': content,
                    'source': f"Wikipedia: {topic}"
                })
                print(f"Added: {title}")
            except Exception as e:
                # Deliberately broad: one bad topic must not stop the others.
                print(f"Error fetching {topic}: {e}")

        # Re-encode the whole corpus so self.embeddings stays aligned with
        # self.documents, including documents added by earlier calls.
        if self.documents:
            texts = [doc['content'] for doc in self.documents]
            self.embeddings = self.model.encode(texts)
            print(f"Created embeddings for {len(self.documents)} documents")

    def retrieve(self, query, top_k=3):
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            query: Query text to embed and compare against the corpus.
            top_k: Maximum number of hits. ``None`` returns every document;
                zero or a negative value returns an empty list.

        Returns:
            A list of ``{'document': <doc dict>, 'similarity': <float>}``
            entries ordered from most to least similar. Empty when the
            knowledge base is empty.
        """
        if not self.documents:
            return []
        # A negative top_k would make the slice below drop items from the
        # wrong end and return a misleading result; treat it like zero.
        if top_k is not None and top_k <= 0:
            return []

        query_embedding = self.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # argsort is ascending, so reverse before taking the best top_k.
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [
            {
                'document': self.documents[idx],
                # Plain float so results can be serialized (e.g. with json).
                'similarity': float(similarities[idx])
            }
            for idx in top_indices
        ]

    def answer_query(self, query, top_k=3):
        """Retrieve context for ``query`` and package it for inspection.

        Args:
            query: The user question.
            top_k: Number of documents to retrieve (forwarded to ``retrieve``).

        Returns:
            When nothing is retrieved, the plain string
            ``"No documents found in the knowledge base."``. Otherwise a dict
            with the keys ``'query'``, ``'retrieved_docs'``, ``'context'``
            and ``'similarity_scores'``. Callers must check the type before
            indexing the result.
        """
        retrieved_docs = self.retrieve(query, top_k=top_k)

        if not retrieved_docs:
            return "No documents found in the knowledge base."

        # One "Source: ...\n<content>" section per hit, blank-line separated.
        context = "\n\n".join([
            f"Source: {doc['document']['source']}\n{doc['document']['content']}"
            for doc in retrieved_docs
        ])

        return {
            'query': query,
            'retrieved_docs': retrieved_docs,
            'context': context,
            'similarity_scores': [doc['similarity'] for doc in retrieved_docs]
        }


In [5]:
# Initialize RAG system
rag = SimpleRAG()

# Deliberately small, AI/ML-only corpus: the questions below are chosen so
# that none of them can be answered from it.
wikipedia_topics = [
    "Artificial_intelligence",
    "Machine_learning",
    "Python_(programming_language)",
    "Natural_language_processing",
    "Deep_learning"
]

print("Building RAG knowledge base...")
rag.add_wikipedia_docs(wikipedia_topics)

# Test questions where answers DON'T exist in our limited Wikipedia corpus
test_questions = [
    "What is the weather like in Tokyo today?",
    "What did I have for breakfast this morning?",
    "What are the latest stock prices for Tesla?",
    "What is the recipe for my grandmother's secret cake?",
    "What happened in the last episode of a TV show that aired yesterday?"
]

print("\n" + "="*60)
print("TESTING RAG WITH QUESTIONS NOT IN CORPUS")
print("="*60)

# Only successful (dict) retrieval results are collected here.
results = []
for question in test_questions:
    print(f"\nQUESTION: {question}")
    print("-" * 40)

    result = rag.answer_query(question)

    # answer_query() returns a plain message string instead of a dict when
    # the knowledge base is empty (e.g. every Wikipedia fetch failed).
    # Indexing that string with a key raises TypeError, so report the message
    # and move on to the next question.
    if not isinstance(result, dict):
        print(result)
        continue

    results.append(result)

    # Hits are ordered best-first, so index 0 is the most similar document.
    top_hit = result['retrieved_docs'][0]
    print(f"Top similarity score: {max(result['similarity_scores']):.3f}")
    print(f"Retrieved from: {top_hit['document']['title']}")
    print(f"Most relevant snippet: {top_hit['document']['content'][:200]}...")

# Chain-of-Notes (CoN) Prompting Template
# A str.format-style template with three named fields:
#   {query}          - the question put to the RAG system
#   {similarity:.3f} - the top similarity score, rendered with three decimals
#   {context}        - the retrieved text the LLM is asked to assess
# The prompt asks for four notes (query analysis, context relevance,
# information gap, response recommendation) and a final assessment of whether
# the retrieved context can answer the query.
con_template = """
You are analyzing a RAG (Retrieval-Augmented Generation) system's response to a query.

QUERY: {query}

RETRIEVED CONTEXT (Top similarity score: {similarity:.3f}):
{context}

Please provide a Chain-of-Notes analysis following this format:

**Note 1: Query Analysis**
- What type of information is the user seeking?
- Is this information likely to be in a general Wikipedia corpus about AI/ML topics?

**Note 2: Context Relevance Assessment** 
- How relevant is the retrieved context to answering the query?
- What is the semantic similarity score telling us?

**Note 3: Information Gap Identification**
- What specific information is missing to answer the query?
- Why would this information not be in the current knowledge base?

**Note 4: Response Recommendation**
- Should the system attempt to answer based on the retrieved context?
- How should the system acknowledge the information gap?

**Final Assessment:**
Does the retrieved context contain relevant information to answer the query? Should the LLM acknowledge it cannot answer this question based on the available knowledge base?
"""

# Emit one ready-to-paste Chain-of-Notes prompt per collected result.
print("\n" + "="*60)
print("CHAIN-OF-NOTES (CoN) ANALYSIS")
print("="*60)
print("Copy the following prompts to your LLM (ChatGPT/Llama3) for analysis:")
print("="*60)

for i, result in enumerate(results):
    print(f"\n{'='*20} ANALYSIS PROMPT {i+1} {'='*20}")

    # Keep each prompt short: show at most the first 500 characters of the
    # retrieved context and mark any truncation with an ellipsis.
    full_context = result['context']
    if len(full_context) > 500:
        context_preview = full_context[:500] + "..."
    else:
        context_preview = full_context

    best_score = max(result['similarity_scores'])

    # Fill the CoN template for this question.
    prompt = con_template.format(
        query=result['query'],
        similarity=best_score,
        context=context_preview
    )

    print(prompt)
    print("\n" + "-"*60)

Building RAG knowledge base...
Added: Artificial intelligence
Added: Machine learning
Added: Python (programming language)
Added: Natural language processing
Added: Deep learning
Created embeddings for 5 documents

TESTING RAG WITH QUESTIONS NOT IN CORPUS

QUESTION: What is the weather like in Tokyo today?
----------------------------------------
Top similarity score: 0.037
Retrieved from: Machine learning
Most relevant snippet: Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalise to unseen data, and thus...

QUESTION: What did I have for breakfast this morning?
----------------------------------------
Top similarity score: 0.021
Retrieved from: Deep learning
Most relevant snippet: Deep learning is a subset of machine learning that focuses on utilizing multilayered neural networks to perform tasks such as classification, regression, and representation learning. The