### Simple RAG using [VARAG](https://github.com/adithya-s-k/VARAG)


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/adithya-s-k/VARAG/blob/main/docs/simpleRAG.ipynb)

Requirements to run this notebook: a CPU runtime is sufficient; a T4 GPU is recommended if you use OCR and want it to run fast.

In [None]:
!git clone https://github.com/adithya-s-k/VARAG
%cd omniparse
%pwd

In [None]:
!apt-get update && apt-get install -y && apt-get install -y poppler-utils

In [None]:
# Editable install of the project located in the current working directory.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
%pip install -e .

In [None]:
from sentence_transformers import SentenceTransformer
from varag.rag import SimpleRAG
from varag.llms import OpenAI
from varag.chunking import FixedTokenChunker
import lancedb
import os
from dotenv import load_dotenv

# Load variables from a local .env file first (if one exists). By default
# load_dotenv() does not override variables that are already set, so it must
# run before anything writes OPENAI_API_KEY into the environment.
load_dotenv()

# Do not hardcode the key in the notebook. If neither the environment nor the
# .env file provided one, ask for it interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
# Embedding checkpoint handed to SentenceTransformer. Other checkpoints that
# can be tried by swapping the name below:
#   "BAAI/bge-base-en"
#   "BAAI/bge-large-en-v1.5"
#   "BAAI/bge-small-en-v1.5"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, trust_remote_code=True)

# Database connection that the RAG pipeline below is given as its store.
DB_URI = "~/shared_rag_db"
shared_db = lancedb.connect(DB_URI)

# SimpleRAG pipeline wired to the embedding model, the shared database and
# the "textDemo" table name.
text_rag = SimpleRAG(text_embedding_model=embedding_model, db=shared_db, table_name="textDemo")

# OpenAI-backed LLM used later to answer the query from the retrieved context.
llm = OpenAI()

In [None]:
# Chunker configured with chunk_size=1000, passed to the indexer below.
chunker = FixedTokenChunker(chunk_size=1000)

# Index the contents of ./examples/data into the RAG table. The remaining
# flags are passed through as-is; see SimpleRAG.index for their exact
# semantics.
text_rag.index(
    "./examples/data",
    recursive=False,
    chunking_strategy=chunker,
    metadata={"source": "gradio_upload"},
    overwrite=True,
    verbose=True,
    ocr=True,
)

In [None]:
# Question to ask, and how many results to request from the search (k).
query = "Use Query"
num_results = 5

search_results = text_rag.search(query, k=num_results)

# Visual separator printed before and after every retrieved chunk.
divider = "===" * 50

print("This was the retrieved Context")
for position, hit in enumerate(search_results, start=1):
    print(divider)
    print(f"\n\nChunk {position}:")
    print(f"Text: {hit['text']}")
    print(f"Chunk Index: {hit['chunk_index']}")
    print(f"Document Name: {hit['document_name']}")
    print(f"\n\n{divider}")

In [None]:
# Concatenate the text of every retrieved chunk into a single context string,
# one chunk per line.
context = "\n".join(r["text"] for r in search_results)

# Ask the LLM to answer `query` using only the retrieved context.
response = llm.query(
    context=context,
    system_prompt="Given the below information answer the questions",
    query=query,
)

# Show the generated answer; without this the cell produces no visible output.
print(response)

### Run Gradio Demo

In [None]:
# Switch into the examples folder (relative to the current working directory)
# so that textDemo.py is launched from its own directory.
%cd examples 
# Start the demo script. NOTE(review): --share presumably asks Gradio for a
# public share link — confirm against textDemo.py.
!python textDemo.py --share