In [1]:
!pip install pypdf==4.0.0 PyPDF2==3.0.1 pandas==2.2.0 chromadb==0.4.22 google-generativeai==0.3.2



In [2]:
import os
# Keep a key that is already exported in the environment; otherwise ask for it
# interactively so the secret is never stored in the notebook source and an
# existing key is never overwritten with an empty string.
if not os.environ.get("GEMINI_API_KEY"):
    from getpass import getpass
    os.environ["GEMINI_API_KEY"] = getpass("Enter your Gemini API key: ")

In [3]:
#loading data
from pypdf import PdfReader

def load_pdf(file_path):
  """Read a PDF and return the extracted text of all of its pages.

  Args:
    file_path: Location of the PDF; passed unchanged to ``PdfReader``.

  Returns:
    A single string with the text of every page concatenated in page order.
    An empty string is returned when the document has no pages.
  """
  reader = PdfReader(file_path)

  # Gather every page before returning: returning from inside the page loop
  # would hand back the first page only. `or ""` tolerates a page whose
  # extraction yields nothing.
  return "".join(page.extract_text() or "" for page in reader.pages)
# Extract the text of the sample PDF once and keep it for the following cells.
pdf_text = load_pdf("/content/pdf-sample.pdf")

In [4]:
#split text
import re
def split_text(text:str):
  """Break `text` into chunks at every newline-space-newline separator.

  Args:
    text: The raw text to chunk.

  Returns:
    The non-empty chunks, in their original order.
  """
  chunks = []
  for piece in re.split('\n \n', text):
    # Consecutive separators produce empty pieces; leave those out.
    if piece:
      chunks.append(piece)
  return chunks

# Chunk the extracted PDF text.
chunked_text = split_text(pdf_text)

In [5]:
#Embedding function
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``genai.embed_content``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed `input` with the ``models/embedding-001`` model.

        The API key is read from the ``GEMINI_API_KEY`` environment variable
        and passed to ``genai.configure`` on every call.

        Args:
            input: The documents handed over by Chroma.

        Returns:
            The ``"embedding"`` entry of the ``genai.embed_content`` response.
        """
        genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
        response = genai.embed_content(
            model="models/embedding-001",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [6]:
#store data
import chromadb
from typing import List
def create_chroma_db(documents:List, path:str, name:str, batch_size:int=100):
    """Create a persistent Chroma collection and store `documents` in it.

    Documents are added in batches of `batch_size` instead of issuing one
    ``add`` call per document.

    Args:
        documents: Text chunks to store; each one gets its position in the
            list (as a string) as its id.
        path: Directory handed to ``chromadb.PersistentClient``.
        name: Name of the collection to create.
        batch_size: Maximum number of documents sent in a single ``add`` call.

    Returns:
        A ``(collection, name)`` tuple.

    Raises:
        ValueError: If `batch_size` is smaller than 1.

    NOTE(review): ``create_collection`` presumably fails when a collection
    called `name` already exists under `path` - confirm; such a collection can
    be reopened with ``load_chroma_collection``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once so slicing works for any iterable of documents.
    documents = list(documents)

    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        # Ids stay identical to the per-document version: the global index as a string.
        db.add(documents=batch, ids=[str(start + offset) for offset in range(len(batch))])

    return db, name

# Index the PDF chunks in a persistent collection.
db, name = create_chroma_db(
    documents=chunked_text,
    path='/content/db_chroma/', #replace with your path
    name="rag_experiment",
)

In [8]:
#load the ChromaDB collection
def load_chroma_collection(path,name):
  """Open an existing collection from a persistent Chroma store.

  Args:
    path: Directory handed to ``chromadb.PersistentClient``.
    name: Name of the collection to fetch.

  Returns:
    The collection, attached to a fresh ``GeminiEmbeddingFunction``.
  """
  return chromadb.PersistentClient(path=path).get_collection(
      name=name,
      embedding_function=GeminiEmbeddingFunction(),
  )

# Reopen the stored collection and rebind `db` to it.
db = load_chroma_collection("/content/db_chroma/", "rag_experiment")

In [15]:
#Retrieval step
def get_relevant_passage(query, db, n_results):
  """Query the collection and return the passages found for `query`.

  Args:
    query: The query text.
    db: Object exposing ``query(query_texts=..., n_results=...)``.
    n_results: Number of results requested from `db`.

  Returns:
    The first entry of the ``'documents'`` field of the query result
    (a single query text is submitted).
  """
  results = db.query(query_texts=[query], n_results=n_results)
  documents_per_query = results['documents']
  return documents_per_query[0]

# Example usage: request three passages for a sample query.
relevant_text = get_relevant_passage("The implementation of RAG", db, 3)



In [16]:
#prompt
def make_rag_prompt(query, relevant_passage):
  """Build the prompt that asks the model to answer `query` from a passage.

  Single and double quotes are stripped from the passage and each newline is
  turned into a space before the passage is inserted into the template.

  Args:
    query: The user's question.
    relevant_passage: Reference text to answer from.

  Returns:
    The formatted prompt string.
  """
  cleanup = str.maketrans({"'": None, '"': None, "\n": " "})
  flattened = relevant_passage.translate(cleanup)

  template = """You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
  strike a friendly and converstional tone. \
  If the passage is irrelevant to the answer, you may ignore it.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
  """
  return template.format(query=query, relevant_passage=flattened)

In [28]:
import google.generativeai as genai
def generate_content(prompt, model_name='gemini-pro'):
    """Send `prompt` to a Gemini generative model and return the reply text.

    Args:
        prompt: The full prompt string.
        model_name: Name passed to ``genai.GenerativeModel``. Defaults to
            ``'gemini-pro'``, the model this function always used before the
            parameter existed.

    Returns:
        The ``text`` attribute of the generation response.

    Raises:
        ValueError: If the ``GEMINI_API_KEY`` environment variable is unset
            or empty.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(model_name)
    answer = model.generate_content(prompt)
    return answer.text

In [31]:
def generate_answer(db, query, n_results=3):
    """Answer `query` from the passages retrieved out of `db`.

    Args:
        db: Collection handed to ``get_relevant_passage``.
        query: The user's question.
        n_results: Number of passages to retrieve. Defaults to 3, the value
            that used to be fixed.

    Returns:
        Whatever ``generate_content`` returns for the assembled prompt.
    """
    relevant_text = get_relevant_passage(query, db, n_results=n_results)
    # Join with a space: an empty separator fuses the last word of one passage
    # with the first word of the next.
    prompt = make_rag_prompt(query, relevant_passage=" ".join(relevant_text))

    return generate_content(prompt)

In [32]:
# End-to-end run: reopen the collection, answer a sample query, print the reply.
db = load_chroma_collection("/content/db_chroma/", "rag_experiment")
answer = generate_answer(db, "RAG")
print(answer)




Retrieval-Augmented Generation, or RAG, is a technique in AI that combines the advantages of retrieval-based models (like search engines that scan the web) with generative models (like GPT that can create new text). RAG allows AI to not only generate new text but also access external data sources like databases or documents to ensure its responses are accurate and up-to-date. This makes RAG particularly useful in situations where AI needs to have the most recent information or when dealing with specialized topics where knowledge might be limited.
