# Wire RAG <a href="https://colab.research.google.com/github/appunite/Wire-RAG/blob/main/main_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Install dependencies for Colab

In [None]:
!pip install haystack-ai pinecone-haystack sentence-transformers pinecone transformers
!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/url_scraper.py
!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/github_scraper.py

Enter API keys

In [None]:
import getpass
import os

# (environment variable, prompt shown to the user) -- asked in this order.
_secret_prompts = (
    ("PINECONE_API_KEY", "pinecone api key"),
    ("OPENAI_API_KEY", "open ai api key"),
    ("GITHUB_API_TOKEN", "github api token (PAT)"),
)

# Read each secret without echoing it and store it in the process environment.
for _env_name, _prompt_text in _secret_prompts:
    os.environ[_env_name] = getpass.getpass(_prompt_text)

Or load keys from a `.env` file

In [None]:
# Alternative to the interactive prompts: read the keys from a .env file.
from dotenv import load_dotenv
import os
# Being the last expression of the cell, the return value of load_dotenv() is shown as cell output.
load_dotenv()

## Populate Pinecone Database

### Scrape URLs


Whitelist: Allow any URL that begins with any element from the white_list.\
Blacklist: Block any URL that begins with any element from the black_list.

In [None]:
import nest_asyncio
from utils.url_scraper import start_scraping

# Apply the nest_asyncio patch to allow nested event loops in Jupyter
nest_asyncio.apply()

starting_url = "https://docs.wire.com"
depth_limit = 2

filter_list = {"white_list": ["https://docs.wire.com"], "black_list": []}
scraped_urls = await start_scraping(starting_url, depth_limit, filter_list)

print(f"Total URLs found: {len(scraped_urls)}")

### Extract metadata and content

In [None]:
from utils.url_scraper import extract_content_and_metadata, DATE_FORMATS, DATE_PATTERNS

# Each URL yields a sequence of records; flatten them into a single list.
scraped_urls_dict = [
    record
    for page_url in scraped_urls
    for record in extract_content_and_metadata(page_url, DATE_FORMATS, DATE_PATTERNS)
]
print(len(scraped_urls_dict))

### Scrape GitHub

In [None]:
from utils.github_scraper import scrape_md_files

md_dict = await scrape_md_files(org_name="wireapp", api_key=os.getenv("GITHUB_API_TOKEN"), repo_limit=None)
print(len(md_dict))

### Save / Load .json

In [None]:
import json
from pathlib import Path

# Set to True to overwrite existing snapshots with the data scraped in this session.
OVERWRITE_SNAPSHOTS = False


def sync_snapshot(path, records=None, overwrite=False):
    """Persist ``records`` to a JSON snapshot when needed, then load the snapshot.

    The snapshot is written only when ``records`` is available and either the
    file does not exist yet or ``overwrite`` is set. The data is always read
    back from disk afterwards.

    Args:
        path: Location of the JSON snapshot file.
        records: Records scraped in this session, or ``None`` when the
            scraping cells were skipped.
        overwrite: Rewrite the snapshot from ``records`` even if it exists.

    Returns:
        The snapshot content as parsed by ``json.load``.

    Raises:
        FileNotFoundError: If the snapshot is missing and ``records`` is ``None``.
    """
    snapshot = Path(path)
    if records is not None and (overwrite or not snapshot.exists()):
        with snapshot.open("w", encoding="utf-8") as json_file:
            json.dump(records, json_file, ensure_ascii=False, indent=4)
    with snapshot.open("r", encoding="utf-8") as json_file:
        return json.load(json_file)


# globals().get(...) yields None when the corresponding scraping cell was not run.
md_dict = sync_snapshot("./github_docs.json", globals().get("md_dict"), OVERWRITE_SNAPSHOTS)
print(len(md_dict), md_dict[0]['metadata'], sep='\n')

scraped_urls_dict = sync_snapshot("./docs_wire.json", globals().get("scraped_urls_dict"), OVERWRITE_SNAPSHOTS)
print(len(scraped_urls_dict), scraped_urls_dict[0]['metadata'], sep='\n')

### Populate database

To delete all records, you need to `pip install "pinecone[grpc]"` and run the following code.

In [None]:
# Uncomment to delete all db records
# NOTE: as written, the call below targets only the 'docs-wire' namespace of the
# "wire-rag" index; the 'github-wireapp' namespace needs its own delete call.
# import os
# from pinecone import Pinecone
# Pinecone(api_key=os.getenv("PINECONE_API_KEY")).Index("wire-rag").delete(delete_all=True, namespace='docs-wire')

Initialize Pinecone Document Store

In [None]:
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack import Document
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore

def _wire_rag_store(namespace):
    """Build a PineconeDocumentStore for one namespace of the "wire-rag" index."""
    return PineconeDocumentStore(
        index="wire-rag",
        namespace=namespace,
        dimension=384,
        metric="cosine",
        spec={"serverless": {"region": "us-east-1", "cloud": "aws"}},
    )


def _to_documents(records):
    """Wrap records (dicts with "content" and "metadata" keys) as Haystack Documents."""
    return [Document(content=record["content"], meta=record["metadata"]) for record in records]


# One namespace per source, both inside the same index.
docs_wire_ds = _wire_rag_store("docs-wire")
github_wireapp_ds = _wire_rag_store("github-wireapp")

scraped_urls_documents = _to_documents(scraped_urls_dict)
print(f"Scraped URLs documents: {len(scraped_urls_documents)}")

github_documents = _to_documents(md_dict)
print(f"Github documents: {len(github_documents)}")

Create pipelines to populate the Pinecone Document Store with both GitHub and docs.wire documents

In [None]:
# all-MiniLM-L6-v2 takes 256 word pieces of input text by default, hence the chunk length.
# NOTE(review): chunks are cut by *word*, so a 256-word chunk may exceed 256 word
# pieces -- confirm that the embedder's handling of longer input is acceptable.
splitter_gh = DocumentSplitter(split_by="word", split_length=256, split_overlap=20)
embedder_gh = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
writer_gh = DocumentWriter(github_wireapp_ds)

pipeline_github = Pipeline()
for stage_name, stage in (
    ("splitter_gh", splitter_gh),
    ("embedder_gh", embedder_gh),
    ("writer_gh", writer_gh),
):
    pipeline_github.add_component(instance=stage, name=stage_name)

# Wiring: splitter -> embedder -> writer.
pipeline_github.connect("splitter_gh", "embedder_gh")
pipeline_github.connect("embedder_gh", "writer_gh")

In [None]:
cleaner_scraped = DocumentCleaner()
# all-MiniLM-L6-v2 takes 256 word pieces of input text by default, hence the chunk length.
# NOTE(review): chunks are cut by *word*, so a 256-word chunk may exceed 256 word
# pieces -- confirm that the embedder's handling of longer input is acceptable.
splitter_scraped = DocumentSplitter(split_by="word", split_length=256, split_overlap=20)
embedder_scraped = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
writer_scraped = DocumentWriter(docs_wire_ds)

pipeline_scraped = Pipeline()
for stage_name, stage in (
    ("cleaner_scraped", cleaner_scraped),
    ("splitter_scraped", splitter_scraped),
    ("embedder_scraped", embedder_scraped),
    ("writer_scraped", writer_scraped),
):
    pipeline_scraped.add_component(instance=stage, name=stage_name)

# Wiring: cleaner -> splitter -> embedder -> writer.
pipeline_scraped.connect("cleaner_scraped", "splitter_scraped")
pipeline_scraped.connect("splitter_scraped", "embedder_scraped")
pipeline_scraped.connect("embedder_scraped", "writer_scraped")

Run the pipelines

In [None]:
# Feed the GitHub documents into the first component of the GitHub pipeline.
pipeline_github.run(data = {"splitter_gh": { "documents" : github_documents }})
# Feed the scraped docs.wire.com documents into the first component of the scraped pipeline.
pipeline_scraped.run(data = {"cleaner_scraped": { "documents" : scraped_urls_documents }})
# NOTE(review): `preprocessing_pipeline` is not defined anywhere in this notebook;
# the line below is stale and would need one of the pipelines above instead.
# preprocessing_pipeline.show()

## Test RAG with Pinecone Document Store

Restart the kernel and run the following code to test the RAG pipeline with the populated Pinecone Document Store.\
Create a pipeline to run a query

In [9]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.joiners.document_joiner import DocumentJoiner
from haystack import Pipeline
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from dotenv import load_dotenv


# System prompt handed to OpenAIGenerator: fixes the assistant's role and output format.
system_prompt = """You are an expert assistant skilled in generating structured, comprehensive documentation. Your role is to create thorough, accurate documentation based strictly on the provided context, without using external knowledge or general information. The content must be divided into two main sections: one for non-technical users and one for technical users, with each section tailored to their needs. The output should be clear, detailed, and organized, including code snippets, examples, and conflict resolution where applicable. Your final output should be ready in Markdown format.
"""


# Jinja template rendered by PromptBuilder. It expects a `question` and a list of
# `documents`, and reads `date`, `title` and `headline` from each document's meta.
template = """**Task:**  
Generate comprehensive documentation based solely on the provided documents. **Do not use any external knowledge** or information outside of the provided context. Divide the documentation into two main sections:

- **Non-Technical User:** Focus on explanations that are accessible to non-technical stakeholders (e.g., **business decision-makers or managers**). Highlight the purpose, value, and practical outcomes, avoiding technical jargon. Use examples and high-level explanations to ensure understanding.

- **Technical User:** Provide in-depth technical content for an audience of **developers, engineers, or system architects**. Include detailed explanations of key technical concepts, code snippets with thorough descriptions, and examples where applicable. Explain the functionality and purpose of each code fragment, along with any potential issues or caveats.

**Requirements:**

1. **Length:** The documentation should be detailed and cover all aspects of the provided content.

2. **Structure:**  
   - **Title Page:** Include a title reflecting the main theme of the documents.  
   - **Introduction:** Provide a clear overview of the content, purpose, and scope of the documentation.  
   - **Non-Technical User Section:**  
     - Clear, high-level explanations that avoid technical depth.  
     - Emphasize the **purpose, benefits, and outcomes** for non-technical users.  
     - Include examples and **highlight key points** with bold text where necessary.  
   - **Technical User Section:**  
     - Detailed explanations of key technical concepts.  
     - Include well-explained code snippets, with a focus on their **practical application** and any **potential pitfalls**.  
     - **Conflict Resolution:** If conflicting information is present, resolve it using the document date or highlight unresolved contradictions **inline** within the relevant sections.

3. **Content Guidelines:**  
   - Represent all information accurately from the documents.  
   - Expand bullet points into full sentences and paragraphs.  
   - Use bullet points, tables, or code fragments where necessary, with in-depth explanations.  
   - Ensure smooth transitions between sections.

4. **Style:**  
   - The non-technical section should be accessible and easy to understand.  
   - The technical section should be professional and formal, with precise technical language.

5. **Specifics:**  
   - Highlight any critical findings, data, or statistics from the documents.  
   - Emphasize unique aspects with full reasoning and analysis.  
   - If any sections lack sufficient detail, indicate **gaps or missing information** rather than assuming or fabricating content.


User Question: {{question}}
Documents to Analyze:
{% for doc in documents %}
Date: {{doc.meta['date']}}
Title: {{doc.meta['title']}} - {{doc.meta['headline']}}
Content: 
{{doc.content}}
{% endfor %}"""

# Pick up API keys from a .env file.
load_dotenv()


def _rag_store_for(namespace):
    """Build a PineconeDocumentStore for one namespace of the "wire-rag" index."""
    return PineconeDocumentStore(
        index="wire-rag",
        namespace=namespace,
        dimension=384,
        metric="cosine",
        spec={"serverless": {"region": "us-east-1", "cloud": "aws"}},
    )


github_wireapp_ds = _rag_store_for("github-wireapp")
docs_wire_ds = _rag_store_for("docs-wire")

# Components: one query embedder, one retriever per namespace, then join -> prompt -> LLM -> answer.
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
retriever_gh = PineconeEmbeddingRetriever(document_store=github_wireapp_ds, top_k=25)
retriever_docs_wire = PineconeEmbeddingRetriever(document_store=docs_wire_ds, top_k=25)
joiner = DocumentJoiner(join_mode="concatenate")
prompt_builder = PromptBuilder(template=template)
# Alternative models listed by the author: "gpt-4o-mini" "gpt-4o" "gpt-3.5-turbo"
generator = OpenAIGenerator(model="gpt-4o-mini", system_prompt=system_prompt)
answer_builder = AnswerBuilder()

rag_pipeline = Pipeline()
for component_name, component in (
    ("text_embedder", text_embedder),
    ("retriever_gh", retriever_gh),
    ("retriever_docs_wire", retriever_docs_wire),
    ("joiner", joiner),
    ("prompt_builder", prompt_builder),
    ("generator", generator),
    ("answer_builder", answer_builder),
):
    rag_pipeline.add_component(component_name, component)

# Connections are made in the same order as before (docs.wire retriever joins first).
for sender, receiver in (
    ("text_embedder.embedding", "retriever_gh.query_embedding"),
    ("text_embedder.embedding", "retriever_docs_wire.query_embedding"),
    ("retriever_docs_wire", "joiner"),
    ("retriever_gh", "joiner"),
    ("joiner", "prompt_builder.documents"),
    ("joiner", "answer_builder.documents"),
    ("prompt_builder", "generator"),
    ("generator.replies", "answer_builder.replies"),
):
    rag_pipeline.connect(sender, receiver)

# Serialize the assembled pipeline next to the notebook.
with open("./pipeline.yml", "w") as pipeline_file:
    rag_pipeline.dump(pipeline_file)

Run the pipeline with a query

In [10]:
query = "Federations"
# The same query text feeds the embedder, the prompt template and the answer builder.
result = rag_pipeline.run({
    "text_embedder": {"text": query},
    "prompt_builder": {"question": query},
    "answer_builder": {"query": query}
})

# Look the first answer up once instead of re-indexing the result for every use.
answer = result['answer_builder']['answers'][0]
print(answer.query)
print(answer.data)

# Explicit UTF-8, matching the JSON files written/read earlier in this notebook;
# the platform default encoding may not be able to represent the generated text.
with open("./output.md", "w", encoding="utf-8") as f:
    f.write(answer.data)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Federations
# Wire Federation Documentation

## Introduction

This documentation provides a comprehensive overview of the Wire Federation system, detailing its architecture, components, and the API interactions it facilitates between multiple backends. The documentation is structured in two distinct sections: one geared towards non-technical users, explaining the purpose and benefits of Wire Federation, and the other focused on technical users, providing in-depth information, code snippets, and practical guidance for implementation and troubleshooting.

---

## Non-Technical User Section

### Overview of Wire Federation

**Wire Federation** allows separate Wire backend installations to communicate with each other seamlessly, enabling users on different backends to interact just as they would if they were on the same backend. This is particularly valuable for organizations that have multiple client installations or those that require an isolated on-premises deployment but still want to 

In [11]:
# Metadata of the first generator reply: finish reason and token usage.
generation_meta = result['generator']['meta'][0]
token_usage = generation_meta['usage']
print(f"Finish Reason: {generation_meta['finish_reason']}\n"
      f"Output Tokens: {token_usage['completion_tokens']}\n"
      f"Input Tokens: {token_usage['prompt_tokens']}\n")

# Numbered list (starting at 1) of the documents attached to the first answer.
for rank, doc in enumerate(result['answer_builder']['answers'][0].documents, start=1):
    print(f"{rank}. {doc.score} {doc.meta['title']} {doc.meta['headline']}\n    {doc.to_dict()['url']}")

Finish Reason: stop
Output Tokens: 1106
Input Tokens: 14202

1. 0.614980042 Federation — Wire 0.0.4 documentation Federation
    https://docs.wire.com/understand/configure-federation.html#generate-and-configure-tls-server-and-client-certificates
2. 0.614980042 Federation — Wire 0.0.4 documentation Federation
    https://docs.wire.com/understand/configure-federation.html#configure-helm-charts-federator-and-ingress-and-webapp-subcharts
3. 0.614980042 Federation — Wire 0.0.4 documentation Federation
    https://docs.wire.com/understand/configure-federation.html
4. 0.614980042 Federation — Wire 0.0.4 documentation Federation
    https://docs.wire.com/understand/configure-federation.html#choose-a-backend-domain
5. 0.614980042 Federation — Wire 0.0.4 documentation Federation
    https://docs.wire.com/understand/configure-federation.html#summary-of-necessary-steps-to-configure-federation
6. 0.614980042 Federation — Wire 0.0.4 documentation Federation
    https://docs.wire.com/understand/confi

In [None]:
# Dump the metadata and full content of every document attached to the first answer.
for doc in result['answer_builder']['answers'][0].documents:
    print(doc.meta, doc.content)