<a href="https://colab.research.google.com/github/andrewinwlg/rag-poc/blob/main/rag_poc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🚀 RAG POC on Free GPU - Google Colab
# Banner: notebook title, a reminder to enable the GPU runtime, then a divider.
for banner_line in (
    "🚀 RAG POC on Free GPU - Google Colab",
    "Make sure Runtime -> Change runtime type -> GPU is selected!",
    "=" * 60,
):
    print(banner_line)

# Install dependencies
import subprocess
import sys

def install_and_import(package, import_name=None):
    """Make sure ``package`` is importable, installing it with pip when it is not.

    Args:
        package: Distribution name exactly as it is passed to ``pip install``
            (for example ``"sentence-transformers"``).
        import_name: Module name used to probe for an existing installation.
            Defaults to ``package`` with hyphens replaced by underscores,
            because a distribution name may contain ``-`` while an importable
            module name cannot. Without this, hyphenated packages always look
            "missing" and pip is invoked on every run.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    import importlib

    if import_name is None:
        import_name = package.replace("-", "_")

    try:
        importlib.import_module(import_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice files that were installed after start-up.
        importlib.invalidate_caches()

# Distributions the notebook needs, named as pip expects them.
packages = ["sentence-transformers", "transformers", "torch", "chromadb", "requests"]

print("📦 Installing packages...")
# Install each requirement in turn; a failing install raises and stops the cell.
for pkg_name in packages:
    install_and_import(pkg_name)
print("✅ All packages installed!")

🚀 RAG POC on Free GPU - Google Colab
Make sure Runtime -> Change runtime type -> GPU is selected!
📦 Installing packages...
✅ All packages installed!


In [None]:
# Clone RAG POC Repository
import os
import subprocess

repo_url = 'https://github.com/andrewinwlg/rag-poc.git'
repo_dir = 'rag-poc'


def _run_git(*args):
    """Run ``git`` with ``args``, echo its output into the cell, return the exit status."""
    completed = subprocess.run(['git', *args], capture_output=True, text=True)
    git_output = (completed.stdout + completed.stderr).strip()
    if git_output:
        print(git_output)
    return completed.returncode


# After the first run the working directory already IS the checkout, so a
# relative 'rag-poc' lookup would miss and a second copy would be cloned inside
# the first. Detect that case so the cell can be re-run safely.
already_inside = os.path.basename(os.getcwd()) == repo_dir and os.path.isdir('.git')

if already_inside or os.path.exists(repo_dir):
    if not already_inside:
        os.chdir(repo_dir)
    # A failed pull is not fatal: the existing checkout is still usable.
    if _run_git('pull') != 0:
        print("⚠️ git pull failed - continuing with the existing checkout")
else:
    if _run_git('clone', repo_url) != 0:
        raise RuntimeError(f"git clone of {repo_url} failed")
    os.chdir(repo_dir)

print("✅ Repository ready!")

# List the data directory from Python so the listing appears in the cell output
# (the recorded output of the shell-based listing shows only its exit status).
if os.path.isdir('data'):
    for data_entry in sorted(os.listdir('data')):
        print(f"  {data_entry}")
else:
    print("⚠️ No data/ directory found in the repository")

✅ Repository ready!


0

In [25]:
# Cell 3 - Qwen 2.5-Coder Setup
import subprocess
import time
import requests

OLLAMA_VERSION_URL = "http://localhost:11434/api/version"
PRIMARY_MODEL = "qwen2.5-coder:7b-instruct"
FALLBACK_MODEL = "gemma:2b"


def _ollama_is_up(timeout=2):
    """Return True when the local Ollama server answers its version endpoint with HTTP 200."""
    try:
        return requests.get(OLLAMA_VERSION_URL, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False


def _pull_model(model_name):
    """Run ``ollama pull`` for ``model_name``; return True when it exits with status 0."""
    return subprocess.run(["ollama", "pull", model_name]).returncode == 0


print("🤖 Setting up Ollama with Qwen 2.5-Coder...")
# NOTE(review): this pipes a remote install script straight into a shell.
# Tolerable on a throwaway Colab VM; pin or verify the script anywhere else.
install_status = subprocess.run(
    'curl -fsSL https://ollama.ai/install.sh | sh', shell=True
).returncode
if install_status != 0:
    print(f"⚠️ Ollama install script exited with status {install_status}")

# Start Ollama server - unless one is already answering, so that re-running
# this cell does not spawn a second `ollama serve`.
if _ollama_is_up():
    ollama_process = globals().get('ollama_process')
else:
    ollama_process = subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    # Poll for readiness instead of sleeping a fixed time: continue as soon as
    # the server answers, and give up after a bounded wait.
    startup_deadline = time.monotonic() + 60
    while not _ollama_is_up() and time.monotonic() < startup_deadline:
        if ollama_process.poll() is not None:
            print(f"⚠️ 'ollama serve' exited early with status {ollama_process.returncode}")
            break
        time.sleep(1)

# Download Qwen 2.5-Coder model
print("📥 Downloading Qwen 2.5-Coder-7B-Instruct (this is perfect for API docs!)...")
print("⏱️ This will take 3-4 minutes but it's worth it for much better responses...")

# A failed pull is reported through the exit status, not an exception, so the
# status is the only thing that needs checking here.
if _pull_model(PRIMARY_MODEL):
    print("✅ Qwen 2.5-Coder ready! This model excels at technical documentation.")
    current_model = PRIMARY_MODEL
else:
    print("⚠️ Qwen download failed, falling back to gemma:2b")
    if not _pull_model(FALLBACK_MODEL):
        print("⚠️ gemma:2b download failed too - no model is available yet")
    current_model = FALLBACK_MODEL

# `current_model` is assigned at cell top level, so it is already a notebook
# global that later cells can read.

# Test connection
try:
    response = requests.get(OLLAMA_VERSION_URL, timeout=5)
    if response.status_code == 200:
        print(f"✅ Ollama ready with {current_model}!")
        print("🚀 This model should give much better API documentation responses!")
    else:
        print("⚠️ Ollama may not be ready - try again")
except requests.RequestException:
    print("⚠️ Connection issue - run this cell again")

🤖 Setting up Ollama with Qwen 2.5-Coder...
📥 Downloading Qwen 2.5-Coder-7B-Instruct (this is perfect for API docs!)...
⏱️ This will take 3-4 minutes but it's worth it for much better responses...
✅ Qwen 2.5-Coder ready! This model excels at technical documentation.
✅ Ollama ready with qwen2.5-coder:7b-instruct!
🚀 This model should give much better API documentation responses!


In [23]:
# Enhanced Cell 4 - Better JSON Processing
import chromadb
from sentence_transformers import SentenceTransformer
from pathlib import Path
import uuid
import json

print("🧠 Loading embedding model...")
# NOTE(review): no visible cell passes `embedding_model` to the collection -
# documents are added and queried as raw text - so Chroma presumably embeds
# them with its own default function. Confirm whether this load is needed.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("🗄️ Setting up vector database...")
client = chromadb.Client()

# Clear existing collection so that re-running this cell starts from an empty index
try:
    client.delete_collection("rag_documents")
    print("🗑️ Deleted existing collection")
except Exception:
    # Expected on the first run, when there is nothing to delete. The exact
    # exception type presumably varies between chromadb releases, so this stays
    # broad - but it no longer swallows KeyboardInterrupt / SystemExit.
    pass

collection = client.create_collection("rag_documents")

def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, which avoids a
    trailing chunk made up only of words already in the previous chunk.

    Args:
        text: The text to split. Words are delimited by any whitespace and are
            re-joined with single spaces.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This chunk already contains the final word; any further window would
        # only repeat words from its tail.
        if start + chunk_size >= len(words):
            break
    return chunks

# Keys of a path item that denote operations; every other key (shared
# parameters, summary, vendor extensions, ...) is not an endpoint.
_OPENAPI_HTTP_METHODS = frozenset(
    {'get', 'put', 'post', 'delete', 'options', 'head', 'patch', 'trace'}
)


def _openapi_parameter_key(param):
    """Identity used to tell whether an operation parameter overrides a path-level one."""
    if '$ref' in param:
        return ('$ref', param['$ref'])
    return (param.get('name'), param.get('in'))


def _format_openapi_parameter(param):
    """Render one parameter object as the text lines used inside an endpoint chunk."""
    if '$ref' in param:
        # A referenced parameter carries no inline name/description; show the target.
        return f"- Reference: {param['$ref']}\n"
    schema = param.get('schema')
    if not isinstance(schema, dict):
        schema = {}
    # OpenAPI 3 nests the type under `schema`; Swagger 2 puts it on the parameter.
    param_type = schema.get('type', param.get('type', 'unknown'))
    return (
        f"- {param.get('name', '')}: {param.get('description', '')}\n"
        f"  Type: {param_type}\n"
        f"  Required: {param.get('required', False)}\n"
    )


def parse_openapi_json(data):
    """Convert OpenAPI/Swagger JSON to meaningful text chunks.

    One chunk is produced for the API overview (``info``), one for the server
    list (``servers``), one per HTTP operation under ``paths`` and one per
    schema under ``components.schemas`` (or Swagger 2 ``definitions``).

    Args:
        data: The parsed specification document. Anything that is not a dict
            yields no chunks.

    Returns:
        A list of ``(section_name, text)`` tuples in the order listed above.
    """
    chunks = []
    if not isinstance(data, dict):
        return chunks

    # 1. API Overview
    info = data.get('info')
    if isinstance(info, dict):
        overview = (
            f"API Name: {info.get('title', 'Unknown')}\n"
            f"Version: {info.get('version', 'Unknown')}\n"
            f"Description: {info.get('description', 'No description')}"
        )
        chunks.append(("API Overview", overview.strip()))

    # 2. Server Information
    if 'servers' in data:
        server_info = "API Servers:\n"
        for server in data['servers'] or []:
            if not isinstance(server, dict):
                continue
            server_info += f"- URL: {server.get('url', '')}\n"
            server_info += f"  Description: {server.get('description', 'No description')}\n"
        chunks.append(("Server Information", server_info.strip()))

    # 3. Individual Endpoints (this is key!)
    paths = data.get('paths')
    if isinstance(paths, dict):
        for path, path_item in paths.items():
            if not isinstance(path_item, dict):
                continue

            # Parameters declared on the path item apply to each of its operations.
            path_params = [
                p for p in (path_item.get('parameters') or []) if isinstance(p, dict)
            ]

            for method, details in path_item.items():
                if method.lower() not in _OPENAPI_HTTP_METHODS or not isinstance(details, dict):
                    continue

                endpoint_text = (
                    f"\nEndpoint: {method.upper()} {path}\n"
                    f"Summary: {details.get('summary', 'No summary')}\n"
                    f"Description: {details.get('description', 'No description')}\n"
                )

                # Add parameters: the operation's own first, then the inherited
                # path-level ones it does not override.
                op_params = [
                    p for p in (details.get('parameters') or []) if isinstance(p, dict)
                ]
                op_keys = [_openapi_parameter_key(p) for p in op_params]
                inherited = [
                    p for p in path_params if _openapi_parameter_key(p) not in op_keys
                ]
                if 'parameters' in details or inherited:
                    endpoint_text += "\nParameters:\n"
                    for param in op_params + inherited:
                        endpoint_text += _format_openapi_parameter(param)

                # Add request body info
                request_body = details.get('requestBody')
                if isinstance(request_body, dict):
                    endpoint_text += f"\nRequest Body Required: {request_body.get('required', False)}\n"
                    content = request_body.get('content')
                    if isinstance(content, dict):
                        endpoint_text += "Content Types: " + ", ".join(content.keys()) + "\n"

                # Add response info
                responses = details.get('responses')
                if isinstance(responses, dict):
                    endpoint_text += "\nResponses:\n"
                    for code, response in responses.items():
                        if not isinstance(response, dict):
                            response = {}
                        # A referenced response has no inline description; show its target.
                        description = response.get(
                            'description', response.get('$ref', 'No description')
                        )
                        endpoint_text += f"- {code}: {description}\n"

                chunks.append((f"Endpoint {method.upper()} {path}", endpoint_text.strip()))

    # 4. Components/Schemas (OpenAPI 3), falling back to Swagger 2 `definitions`
    schemas = None
    components = data.get('components')
    if isinstance(components, dict):
        schemas = components.get('schemas')
    if not isinstance(schemas, dict):
        schemas = data.get('definitions')
    if isinstance(schemas, dict):
        for schema_name, schema_def in schemas.items():
            if not isinstance(schema_def, dict):
                continue
            schema_text = (
                f"\nData Model: {schema_name}\n"
                f"Type: {schema_def.get('type', 'object')}\n"
                f"Description: {schema_def.get('description', 'No description')}\n"
            )
            properties = schema_def.get('properties')
            if isinstance(properties, dict):
                schema_text += "\nProperties:\n"
                for prop_name, prop_def in properties.items():
                    if not isinstance(prop_def, dict):
                        prop_def = {}
                    # A property that only references another schema has no
                    # `type`; show the reference instead of "unknown".
                    prop_type = prop_def.get('type', prop_def.get('$ref', 'unknown'))
                    schema_text += f"- {prop_name}: {prop_type} - {prop_def.get('description', 'No description')}\n"

            chunks.append((f"Schema {schema_name}", schema_text.strip()))

    return chunks

# Process documents
documents = []
metadatas = []
ids = []

text_suffixes = {".md", ".txt", ".py"}
# NOTE(review): some chromadb releases presumably cap how many records one
# add() call accepts; adding in modest batches stays below such a cap - confirm
# against the installed version.
add_batch_size = 500


def _queue_chunks(filepath, chunk_texts, source_type, sections=None):
    """Append chunks, their metadata and unique ids to the pending batch lists.

    Args:
        filepath: Path of the source file; only its name is recorded.
        chunk_texts: The chunk strings to index, in order.
        source_type: Value stored under the ``source_type`` metadata key.
        sections: Optional list of section names parallel to ``chunk_texts``;
            when given, each name is stored as ``section`` metadata and used in
            the id in place of the chunk index.
    """
    for i, chunk in enumerate(chunk_texts):
        metadata = {
            "filename": filepath.name,
            "chunk_index": i,
            "source_type": source_type
        }
        id_label = i
        if sections is not None:
            metadata["section"] = sections[i]
            id_label = sections[i]
        documents.append(chunk)
        metadatas.append(metadata)
        # The random suffix keeps ids unique even when names/labels repeat.
        ids.append(f"{filepath.name}_{id_label}_{str(uuid.uuid4())[:8]}")


data_path = Path("data")
# Sorted so that chunk order is the same on every run.
for filepath in sorted(data_path.glob("**/*")):
    if not filepath.is_file():
        continue
    suffix = filepath.suffix.lower()  # treat README.MD like readme.md

    if suffix in text_suffixes:
        # Handle text files normally
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()

            if text.strip():
                chunks = chunk_text(text)
                _queue_chunks(filepath, chunks, "text")
                print(f"✅ Processed {filepath.name}: {len(chunks)} text chunks")
        except Exception as e:
            print(f"⚠️ Error processing {filepath.name}: {e}")

    elif suffix == ".json":
        # Enhanced JSON processing
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Check if it's an OpenAPI spec. Only a JSON object can be one; the
            # isinstance guard keeps arrays and scalars on the generic path.
            if isinstance(data, dict) and ('openapi' in data or 'swagger' in data):
                structured_chunks = parse_openapi_json(data)
                _queue_chunks(
                    filepath,
                    [content for _, content in structured_chunks],
                    "openapi",
                    sections=[section_name for section_name, _ in structured_chunks],
                )
                print(f"✅ Processed {filepath.name}: {len(structured_chunks)} API chunks")
            else:
                # Fallback for other JSON
                text = json.dumps(data, indent=2)
                chunks = chunk_text(text)
                _queue_chunks(filepath, chunks, "json")
                print(f"✅ Processed {filepath.name}: {len(chunks)} JSON chunks")

        except Exception as e:
            print(f"⚠️ Error processing JSON {filepath.name}: {e}")

# Add to vector database
if documents:
    for batch_start in range(0, len(documents), add_batch_size):
        batch_stop = batch_start + add_batch_size
        collection.add(
            documents=documents[batch_start:batch_stop],
            metadatas=metadatas[batch_start:batch_stop],
            ids=ids[batch_start:batch_stop]
        )
    print(f"✅ Vector database ready with {len(documents)} chunks!")

    # Show what we got
    print("\n📊 Chunk breakdown:")
    source_types = {}
    for meta in metadatas:
        source_type = meta.get('source_type', 'unknown')
        source_types[source_type] = source_types.get(source_type, 0) + 1

    for source_type, count in source_types.items():
        print(f"  {source_type}: {count} chunks")
else:
    print("❌ No documents found!")

🧠 Loading embedding model...
🗄️ Setting up vector database...
🗑️ Deleted existing collection
✅ Processed troubleshooting.md: 2 text chunks
✅ Processed openapi.json: 933 API chunks
✅ Processed sample.md: 1 text chunks
✅ Vector database ready with 936 chunks!

📊 Chunk breakdown:
  text: 3 chunks
  openapi: 933 chunks


In [26]:
# Cell 5 - Enhanced for Qwen 2.5-Coder
def query_rag(question, top_k=5, model=None, verbose=False,
              ollama_url="http://localhost:11434", timeout=180):
    """Query the RAG system with Qwen 2.5-Coder optimizations.

    Retrieves the ``top_k`` closest chunks from the notebook-level
    ``collection``, formats them into a prompt and sends it to Ollama's
    ``/api/generate`` endpoint.

    Args:
        question: The user's question; used for retrieval and in the prompt.
        top_k: Number of chunks to retrieve.
        model: Ollama model name. Defaults to the notebook global
            ``current_model``, or ``'qwen2.5-coder:7b-instruct'`` if unset.
        verbose: When True, print a 200-character preview of each retrieved chunk.
        ollama_url: Base URL of the Ollama server.
        timeout: Seconds to wait for the generation request.

    Returns:
        The model's answer, or a string starting with ``❌`` when nothing was
        retrieved or the request failed. Request failures are reported in the
        returned string rather than raised.
    """
    if model is None:
        model = globals().get('current_model', 'qwen2.5-coder:7b-instruct')

    print(f"🔍 Searching for: '{question}'")

    # Query vector database
    results = collection.query(
        query_texts=[question],
        n_results=top_k
    )

    context_chunks = results['documents'][0]
    if not context_chunks:
        return "❌ No relevant documents found."

    # A chunk stored without metadata comes back as None; treat it as empty.
    chunk_metas = [meta or {} for meta in results['metadatas'][0]]

    # Build context with better formatting for code models
    context_sections = []
    for i, (chunk, meta) in enumerate(zip(context_chunks, chunk_metas)):
        section_name = meta.get('section', f"Section {i+1}")
        filename = meta.get('filename', 'unknown')
        context_sections.append(f"\n## {section_name} (from {filename})\n{chunk}\n")

    context = "\n".join(context_sections)

    if verbose:
        print("\n📄 Retrieved Context:")
        for i, (chunk, meta) in enumerate(zip(context_chunks, chunk_metas)):
            section = meta.get('section', 'Unknown')
            source_type = meta.get('source_type', 'text')
            print(f"📋 Chunk {i+1} [{source_type}] ({section}): {chunk[:200]}...")
            print("-" * 40)

    # Specialized prompt for Qwen 2.5-Coder
    prompt = f"""You are a technical documentation assistant specializing in API documentation and code analysis.

Given the following technical documentation context, provide a precise and helpful answer to the user's question.

<context>
{context}
</context>

<question>
{question}
</question>

Instructions:
- Base your answer strictly on the provided context
- For API questions, include specific endpoint paths, HTTP methods, and parameter details
- For code questions, reference exact function names, parameters, and examples
- If information is missing from the context, clearly state what's not available
- Use proper technical terminology
- Be concise but complete

Answer:"""

    # Query LLM with optimized settings for Qwen
    try:
        print(f"🤖 Asking {model}...")
        response = requests.post(
            ollama_url.rstrip("/") + "/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.2,    # Lower for more factual responses
                    "top_p": 0.8,         # Focused responses
                    "repeat_penalty": 1.1, # Reduce repetition
                    "num_ctx": 4096       # Larger context window
                }
            },
            timeout=timeout  # Default is generous to allow for the 7B model
        )

        if response.status_code == 200:
            return response.json().get("response", "No response")
        else:
            return f"❌ Error: Status {response.status_code}"
    except Exception as e:
        # Deliberately broad: the notebook shows the failure as the answer
        # text instead of aborting the cell.
        return f"❌ Error: {e}"

print("✅ Enhanced RAG function ready for Qwen 2.5-Coder!")
print("🎯 This model should excel at API documentation and technical questions!")

✅ Enhanced RAG function ready for Qwen 2.5-Coder!
🎯 This model should excel at API documentation and technical questions!


In [None]:
# Smoke-test the RAG system with a few general questions
test_questions = [
    "What is this RAG system about?",
    "How do I troubleshoot issues?",
    "What are the key components?",
]

question_rule = "=" * 50
answer_rule = "-" * 30

for question in test_questions:
    # Header for this question, framed by rules.
    print("\n" + question_rule)
    print(f"Question: {question}")
    print(question_rule)

    answer = query_rag(question)

    # The answer, framed by shorter rules.
    print("\nAnswer:", answer_rule, answer, answer_rule, sep="\n")

print("\n🎉 RAG System working on free GPU!")


Question: What is this RAG system about?
🔍 Searching for: 'What is this RAG system about?'
🤖 Asking gemma:2b...

Answer:
------------------------------
This context does not provide information about what the RAG system is about, so I cannot answer this question from the provided context.
------------------------------

Question: How do I troubleshoot issues?
🔍 Searching for: 'How do I troubleshoot issues?'
🤖 Asking gemma:2b...

Answer:
------------------------------
Sure, here are some steps on how to troubleshoot issues with the Local RAG system:

1. Check the logs first.
2. Review the system requirements.
3. Consider posting detailed logs when seeking help.
4. Try with minimal configuration first.
5. Use specific, well-formed questions.
6. Keep documents well-structured.
7. Experiment with different chunk sizes.
8. Try various LLM models for different use cases.
------------------------------

Question: What are the key components?
🔍 Searching for: 'What are the key components?'
🤖 

In [None]:
# Check what documents are indexed
from collections import Counter

print("📚 Currently indexed documents:")
print("=" * 40)

# Fetch every record currently held by the collection
all_docs = collection.get()
indexed_metas = all_docs['metadatas']
indexed_texts = all_docs['documents']

# Tally chunks per source file
filenames = []
for meta in indexed_metas:
    filenames.append(meta['filename'])
file_counts = Counter(filenames)

for filename in file_counts:
    print(f"📄 {filename}: {file_counts[filename]} chunks")

print(f"\n🔢 Total chunks: {len(indexed_texts)}")

# Preview the first stored chunk, if there is one
if indexed_texts:
    first_meta = indexed_metas[0]
    first_text = indexed_texts[0]
    print(f"\n📝 Sample chunk from {first_meta['filename']}:")
    print("-" * 30)
    print(first_text[:200] + "...")

📚 Currently indexed documents:
📄 troubleshooting.md: 2 chunks
📄 sample.md: 1 chunks

🔢 Total chunks: 3

📝 Sample chunk from troubleshooting.md:
------------------------------
# Troubleshooting Guide This guide helps you resolve common issues with the local RAG system. ## Docker Issues ### Services Won't Start **Problem**: Docker containers fail to start or exit immediately...


In [28]:
# Edit this cell to ask your own questions:
# my_question = "How do I get the max offset of a given topic and partition?"
# my_question = "How do I create a connector?"
# my_question = "What does the Confluent Cloud API do?"
my_question = "Show me sample python code for creating a DatagenSource connector using the Confluent Cloud API"

# Echo the question, run it through the pipeline with chunk previews, show the answer.
print("🎯 Your Question:", my_question)
answer = query_rag(my_question, verbose=True)
print("\n🤖 Answer:", answer, sep="\n")

# Closing hints, each preceded by a blank line.
print(
    "\n💡 Edit 'my_question' above and run again!",
    "\n🚀 Your RAG POC is running 5-10x faster on free GPU!",
    sep="\n",
)

🎯 Your Question: Show me sample python code for creating a DatagenSource connector using the Confluent Cloud API
🔍 Searching for: 'Show me sample python code for creating a DatagenSource connector using the Confluent Cloud API'

📄 Retrieved Context:
📋 Chunk 1 [openapi] (Schema connect.v1.CustomConnectorPluginList): Data Model: connect.v1.CustomConnectorPluginList
Type: object
Description: CustomConnectorPlugins objects represent Custom Connector Plugins on Confluent Cloud.
The API allows you to list, create, rea...
----------------------------------------
📋 Chunk 2 [openapi] (Schema pim.v1.IntegrationList): Data Model: pim.v1.IntegrationList
Type: object
Description: `Provider Integration` objects represent access to public cloud service provider (CSP) resources
that may be accessed by Confluent resource...
----------------------------------------
📋 Chunk 3 [openapi] (Server Information): API Servers:
- URL: https://api.confluent.cloud
  Description: Confluent Cloud API...
------------

In [29]:
# Technical questions aimed at the indexed API documentation
technical_questions = [
    "How do I get the max offset of a given topic and partition?",
    "How do I create a connector?",
    "What does the Confluent Cloud API do?",
    "Show me sample python code for creating a DatagenSource connector using the Confluent Cloud API",
]

print("🧪 Testing Qwen 2.5-Coder's technical understanding...")

wide_rule = "=" * 70
answer_rule = "-" * 50

# Only the first three questions are run here.
for question in technical_questions[:3]:
    print("\n" + wide_rule)
    print(f"🔍 Question: {question}")
    print(wide_rule)

    answer = query_rag(question, verbose=True)

    print("\n🤖 Qwen 2.5-Coder Answer:")
    print(answer_rule, answer, answer_rule, sep="\n")

🧪 Testing Qwen 2.5-Coder's technical understanding...

🔍 Question: How do I get the max offset of a given topic and partition?
🔍 Searching for: 'How do I get the max offset of a given topic and partition?'

📄 Retrieved Context:
📋 Chunk 1 [openapi] (Schema connect.v1.Offsets): Data Model: connect.v1.Offsets
Type: array
Description: Array of offsets which are categorised into partitions....
----------------------------------------
📋 Chunk 2 [openapi] (Endpoint GET /kafka/v3/clusters/{cluster_id}/topics/{topic_name}/partitions): Endpoint: GET /kafka/v3/clusters/{cluster_id}/topics/{topic_name}/partitions
Summary: List Partitions
Description: [![Generally Available](https://img.shields.io/badge/Lifecycle%20Stage-Generally%20Av...
----------------------------------------
📋 Chunk 3 [openapi] (Schema PartitionLevelTruncationData): Data Model: PartitionLevelTruncationData
Type: object
Description: No description

Properties:
- partition_id: integer - No description
- offset_truncated_to: integ