# Wire RAG <a href="https://colab.research.google.com/github/appunite/Wire-RAG/blob/main/main_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Install dependencies for colab

In [None]:
!pip install haystack-ai pinecone-haystack sentence-transformers pinecone transformers
!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/url_scraper.py
!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/github_scraper.py

Enter the API keys interactively

In [6]:
import os
import getpass
# Ask for each credential with a hidden prompt and export it as an environment
# variable, in the same order as before: Pinecone, OpenAI, GitHub.
for env_var, prompt_text in (
    ("PINECONE_API_KEY", "pinecone api key"),
    ("OPENAI_API_KEY", "open ai api key"),
    ("GITHUB_API_TOKEN", "github api token (PAT)"),
):
    os.environ[env_var] = getpass.getpass(prompt_text)

Alternatively, load the keys from a `.env` file

In [1]:
from dotenv import load_dotenv
import os
# Load the API keys from a .env file instead of typing them in.
# As the cell's last expression the return value is displayed (True in the saved output).
load_dotenv()

True

## Populate Pinecone Database

### Scrape URLs


Whitelist: Allow any URL that begins with any element from the white_list.\
Blacklist: Block any URL that begins with any element from the black_list.

In [2]:
import nest_asyncio
from utils.url_scraper import start_scraping

# Apply the nest_asyncio patch to allow nested event loops in Jupyter
nest_asyncio.apply()

starting_url = "https://docs.wire.com"
depth_limit = 2

filter_list = {"white_list": ["https://docs.wire.com"], "black_list": []}
scraped_urls = await start_scraping(starting_url, depth_limit, filter_list)

print(f"Total URLs found: {len(scraped_urls)}")

Total URLs found: 429


### Extract metadata and content

In [3]:
from utils.url_scraper import extract_content_and_metadata, DATE_FORMATS, DATE_PATTERNS

# Each URL yields an iterable of records; flatten all of them into one list.
scraped_urls_dict = [
    record
    for page_url in scraped_urls
    for record in extract_content_and_metadata(page_url, DATE_FORMATS, DATE_PATTERNS)
]
print(len(scraped_urls_dict))

6679


### Scrape GitHub

In [None]:
from utils.github_scraper import scrape_md_files

md_dict = await scrape_md_files(org_name="wireapp", api_key=os.getenv("GITHUB_API_TOKEN"), repo_limit=None)
print(len(md_dict))

### Save / Load .json

In [None]:
import json


def load_or_create_json(path, fallback=None):
    """Return the JSON content of ``path``, creating the file from ``fallback`` if needed.

    If ``path`` exists, its content is loaded and returned, so a cached file
    always wins over in-memory data; delete the file to refresh the cache.
    If it does not exist and ``fallback`` is given, ``fallback`` is written to
    ``path`` (UTF-8, non-ASCII preserved, 4-space indent) and returned.

    Raises:
        FileNotFoundError: when ``path`` is missing and no ``fallback`` is available.
    """
    try:
        with open(path, "r", encoding="utf-8") as json_file:
            return json.load(json_file)
    except FileNotFoundError:
        if fallback is None:
            raise
        with open(path, "w", encoding="utf-8") as json_file:
            json.dump(fallback, json_file, ensure_ascii=False, indent=4)
        return fallback


# globals().get(...) keeps the cell usable both after the scraping cells (the data is
# saved on first run) and on a fresh kernel where only the JSON files exist.
md_dict = load_or_create_json("./github_docs.json", globals().get("md_dict"))
print(len(md_dict), md_dict[0]['metadata'], sep='\n')

scraped_urls_dict = load_or_create_json("./docs_wire.json", globals().get("scraped_urls_dict"))
print(len(scraped_urls_dict), scraped_urls_dict[0]['metadata'], sep='\n')

### Populate database

To delete all records, you need to `pip install "pinecone[grpc]"` and then run the following code.

In [None]:
# Uncomment to delete all db records
# import os
# from pinecone import Pinecone
# Pinecone(api_key=os.getenv("PINECONE_API_KEY")).Index("wire-rag").delete(delete_all=True, namespace='docs-wire')

Initialize Pinecone Document Store

In [None]:
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack import Document
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore

def _open_store(namespace):
    """Return a document store for ``namespace`` of the "wire-rag" index (384-dim, cosine)."""
    return PineconeDocumentStore(
        index="wire-rag",
        namespace=namespace,
        dimension=384,
        metric="cosine",
        spec={"serverless": {"region": "us-east-1", "cloud": "aws"}},
    )


def _as_documents(records):
    """Wrap each {"content", "metadata"} record in a Haystack Document."""
    return [
        Document(content=record["content"], meta=record["metadata"])
        for record in records
    ]


docs_wire_ds = _open_store("docs-wire")
github_wireapp_ds = _open_store("github-wireapp")

scraped_urls_documents = _as_documents(scraped_urls_dict)
print(f"Scraped URLs documents: {len(scraped_urls_documents)}")

github_documents = _as_documents(md_dict)
print(f"Github documents: {len(github_documents)}")

Create the pipelines that populate the Pinecone Document Store with both the GitHub and the docs.wire.com documents

In [None]:
# Indexing pipeline for the GitHub markdown documents: split -> embed -> write.
#
# all-MiniLM-L6-v2 truncates its input after 256 word pieces (sub-word tokens), and a
# word normally maps to more than one word piece. Splitting on 256 *words* therefore
# produced chunks whose tail never reached the embedding. 150 words is a heuristic that
# keeps typical chunks below the limit; tune it for token-dense content such as code.
splitter_gh = DocumentSplitter(split_by="word", split_length=150, split_overlap=20)
embedder_gh = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
writer_gh = DocumentWriter(github_wireapp_ds)

pipeline_github = Pipeline()
pipeline_github.add_component(instance=splitter_gh, name="splitter_gh")
pipeline_github.add_component(instance=embedder_gh, name="embedder_gh")
pipeline_github.add_component(instance=writer_gh, name="writer_gh")

pipeline_github.connect("splitter_gh", "embedder_gh")
pipeline_github.connect("embedder_gh", "writer_gh")

In [None]:
# Indexing pipeline for the scraped docs.wire.com pages: clean -> split -> embed -> write.
cleaner_scraped = DocumentCleaner()
# all-MiniLM-L6-v2 truncates its input after 256 word pieces (sub-word tokens), and a
# word normally maps to more than one word piece. Splitting on 256 *words* therefore
# produced chunks whose tail never reached the embedding. 150 words is a heuristic that
# keeps typical chunks below the limit; tune it for token-dense content.
splitter_scraped = DocumentSplitter(split_by="word", split_length=150, split_overlap=20)
embedder_scraped = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
writer_scraped = DocumentWriter(docs_wire_ds)

pipeline_scraped = Pipeline()
pipeline_scraped.add_component(instance=cleaner_scraped, name="cleaner_scraped")
pipeline_scraped.add_component(instance=splitter_scraped, name="splitter_scraped")
pipeline_scraped.add_component(instance=embedder_scraped, name="embedder_scraped")
pipeline_scraped.add_component(instance=writer_scraped, name="writer_scraped")

pipeline_scraped.connect("cleaner_scraped", "splitter_scraped")
pipeline_scraped.connect("splitter_scraped", "embedder_scraped")
pipeline_scraped.connect("embedder_scraped", "writer_scraped")

Run the pipeline

In [None]:
# Feed each pipeline through its first component: GitHub first, then docs.wire.com.
github_inputs = {"splitter_gh": {"documents": github_documents}}
scraped_inputs = {"cleaner_scraped": {"documents": scraped_urls_documents}}

pipeline_github.run(data=github_inputs)
# Kept as the last expression so the notebook displays this run's result.
pipeline_scraped.run(data=scraped_inputs)

## Test RAG with Pinecone Document Store

Restart the kernel and run the following code to test the RAG pipeline with the populated Pinecone Document Store.\
Create pipeline to run a query

In [None]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.joiners.document_joiner import DocumentJoiner
from haystack import Pipeline
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from dotenv import load_dotenv


# System prompt passed to the OpenAIGenerator below. It instructs the model to write
# long-form Markdown documentation using only the retrieved documents, and to resolve
# contradictions between documents by their dates.
system_prompt = """Task: Generate comprehensive documentation based on the provided documents. The documentation should capture all essential details and information without any information loss, including relevant code fragments from the files. Ensure that the output does not cut corners on tokens; generate as much content as possible within the limits of the provided documents. Do not reference or include information from any sources other than the given documents, do not rely on your knowledge.

Requirements:

1. Length: The documentation should be detailed and extensive, covering all aspects of the content provided in the documents.

2. Structure:
   - Title Page: Include a title that reflects the main theme of the documents.
   - Introduction: Provide a complete and thorough overview of the content, purpose, and scope of the documentation. Explain the objectives of the documents and how the information will be structured.
   - Main Sections: Divide the documentation into logical sections and subsections based on the themes and topics found in the documents. Each section should include:
     - A clear, descriptive heading that reflects the content within the section.
     - Detailed explanations of key concepts and ideas. Expand on each idea or concept with thorough descriptions, definitions, and context. Ensure the reader fully understands the material.
     - Code fragments: Where applicable, include relevant code snippets from the documents. These should be properly formatted and explained in detail. For each code snippet, provide a clear explanation of its purpose, functionality, and how it fits into the overall system or process described in the document.
     - Examples: Provide relevant examples where applicable, explaining them fully to ensure understanding.
     - Conflict Resolution: If there are contradictory pieces of information across different documents, resolve them using the date provided at the beginning of each document, prioritizing the most recent information. If a document’s date is unknown and conflicts cannot be resolved, explicitly indicate which parts contradict one another, then clearly explain the contradiction. Mention all unresolved conflicts at the end of each section.

3. Content Guidelines:
   - Ensure that all information from the documents is included and accurately represented.
   - The content should not just list bullet points but should be expanded into full sentences and paragraphs where necessary, with elaboration and explanations for all points.
   - Use clear and logical transitions between sections to maintain the flow and coherence of the document.
   - Use bullet points, tables, code fragments, or diagrams where appropriate to enhance understanding and readability, but always accompany these elements with in-depth explanations.

4. Style: The writing style should be professional, formal, and suitable for an audience that may include technical experts, stakeholders, or general readers interested in the topic. Avoid jargon unless it is explained in detail.

5. Specifics: Highlight any critical findings, data, or statistics that are present in the documents. For each significant point, explain its relevance and implications. Emphasize any unique or innovative aspects that stand out, and provide complete reasoning or analysis where needed.

Output Format: Ensure that the entire documentation is generated in Markdown format without any additional text or formatting. The output should be ready to be saved directly into a markdown file.
"""


# Jinja-style user prompt rendered by the PromptBuilder below. It takes two variables:
# `question` (the user query) and `documents` (the joined retrieval results). For every
# document it prints the `date`, `title` and `headline` metadata, then the content.
template = """
User Question: {{question}}
Documents to Analyze:
{% for doc in documents %}
Date: {{doc.meta['date']}}
Title: {{doc.meta['title']}} - {{doc.meta['headline']}}
Content: 
{{doc.content}}
{% endfor %}"""

# Load the API keys from .env before the stores are created.
load_dotenv()


def _open_store(namespace):
    """Return a document store for ``namespace`` of the "wire-rag" index (384-dim, cosine)."""
    return PineconeDocumentStore(
        index="wire-rag",
        namespace=namespace,
        dimension=384,
        metric="cosine",
        spec={"serverless": {"region": "us-east-1", "cloud": "aws"}},
    )


github_wireapp_ds = _open_store("github-wireapp")
docs_wire_ds = _open_store("docs-wire")

# --- Components ---------------------------------------------------------------
# The query is embedded with the same model that was used for indexing.
text_embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
# One retriever per Pinecone namespace, 25 hits each.
retriever_gh = PineconeEmbeddingRetriever(document_store=github_wireapp_ds, top_k=25)
retriever_docs_wire = PineconeEmbeddingRetriever(document_store=docs_wire_ds, top_k=25)
joiner = DocumentJoiner(join_mode="concatenate")
prompt_builder = PromptBuilder(template=template)
# Other models tried: "gpt-4o", "gpt-3.5-turbo".
generator = OpenAIGenerator(model="gpt-4o-mini", system_prompt=system_prompt)
answer_builder = AnswerBuilder()

# --- Assembly -----------------------------------------------------------------
rag_pipeline = Pipeline()

# Registration order is kept exactly as before.
for component_name, component in (
    ("text_embedder", text_embedder),
    ("retriever_gh", retriever_gh),
    ("retriever_docs_wire", retriever_docs_wire),
    ("joiner", joiner),
    ("prompt_builder", prompt_builder),
    ("generator", generator),
    ("answer_builder", answer_builder),
):
    rag_pipeline.add_component(component_name, component)

# Connection order is kept as well: docs.wire.com reaches the joiner before GitHub.
for sender, receiver in (
    ("text_embedder.embedding", "retriever_gh.query_embedding"),
    ("text_embedder.embedding", "retriever_docs_wire.query_embedding"),
    ("retriever_docs_wire", "joiner"),
    ("retriever_gh", "joiner"),
    ("joiner", "prompt_builder.documents"),
    ("joiner", "answer_builder.documents"),
    ("prompt_builder", "generator"),
    ("generator.replies", "answer_builder.replies"),
):
    rag_pipeline.connect(sender, receiver)

# Save the pipeline definition next to the notebook. The encoding is set explicitly
# (as for the JSON files earlier in the notebook) so the file does not depend on the
# platform's default encoding; the system prompt contains non-ASCII characters.
with open("./pipeline.yml", "w", encoding="utf-8") as file:
    rag_pipeline.dump(file)

Run the pipeline with a query

In [None]:
# The same query string goes to the embedder (retrieval), the prompt builder (LLM
# input) and the answer builder (stored on the resulting answer).
query = "Generate full documentation about legal hold"
result = rag_pipeline.run({
    "text_embedder": {"text": query},
    "prompt_builder": {"question": query},
    "answer_builder": {"query": query}
})

# The first answer produced by the answer builder is the one printed and saved.
answer = result['answer_builder']['answers'][0]
print(answer.query)
print(answer.data)

# Write UTF-8 explicitly: the generated Markdown can contain non-ASCII characters,
# which the platform default encoding may fail to encode.
with open("./output.md", "w", encoding="utf-8") as f:
    f.write(answer.data)

In [None]:
# Finish reason and token usage reported by the generator for its first reply.
generation_meta = result['generator']['meta'][0]
print(f"Finish Reason: {generation_meta['finish_reason']}\n"
      f"Output Tokens: {generation_meta['usage']['completion_tokens']}\n"
      f"Input Tokens: {generation_meta['usage']['prompt_tokens']}\n")

# List the retrieved documents with their score. Metadata is read with .get() so a
# document missing one of the keys prints None instead of aborting the listing with
# KeyError. Reading `url` from doc.meta also avoids serializing the whole document
# with to_dict() just to look up one field.
for i, doc in enumerate(result['answer_builder']['answers'][0].documents, start=1):
    meta = doc.meta
    print(f"{i}. {doc.score} {meta.get('title')} {meta.get('headline')}\n    {meta.get('url')}")

In [None]:
# Print the metadata and content of every document attached to the first answer.
retrieved_documents = result['answer_builder']['answers'][0].documents
for retrieved in retrieved_documents:
    print(retrieved.meta, retrieved.content)