In [2]:
import os
# Gemini API key taken from the environment; None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

In [3]:
from pypdf import PdfReader

def load_pdf(file_path):
    """Read a PDF and return the text of all its pages as one string.

    The file is opened with ``PdfReader`` and ``extract_text()`` is called on
    each page in order. The per-page strings are concatenated directly, with
    no separator inserted between pages.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The concatenated page text (an empty string if there are no pages).
    """
    pdf = PdfReader(file_path)
    # Gather every page's text first, then join once at the end.
    page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)

# Full text of the BRTS chatbot dataset PDF, used as the source for chunking.
pdf_text = load_pdf("../assets/data/BRTS_Chatbot_Dataset.pdf")

In [5]:
import re

def split_text(text: str):
    """Break ``text`` into chunks at every newline-space-newline sequence.

    Args:
        text: The string to split.

    Returns:
        A list of the pieces between separators, in order, with empty
        strings removed. Pieces are not stripped, so whitespace-only
        pieces are kept.
    """
    pieces = re.split('\n \n', text)
    # Adjacent or leading/trailing separators yield empty pieces; drop them.
    return [piece for piece in pieces if piece]

# Split the extracted PDF text into the chunks that will be embedded.
chunked_text = split_text(pdf_text)

In [6]:
import os
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to the Gemini embedding API."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` with the ``models/embedding-001`` Gemini model.

        The API key is read from the ``GEMINI_API_KEY`` environment variable
        on every call, and ``genai`` is configured with it before the
        embedding request is made.

        Args:
            input: The documents to embed, forwarded as ``content``.

        Returns:
            The ``"embedding"`` entry of the ``genai.embed_content`` response.

        Raises:
            ValueError: If ``GEMINI_API_KEY`` is unset or empty.
        """
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

        genai.configure(api_key=api_key)
        response = genai.embed_content(
            model="models/embedding-001",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import chromadb
from typing import List

def create_chroma_db(documents: List, path: str, name: str):
    """Create (or reopen) a persistent Chroma collection and store documents in it.

    Each document is written under the id ``str(index)``, where ``index`` is
    its position in ``documents``. The collection embeds documents with
    ``GeminiEmbeddingFunction``.

    The function is safe to call repeatedly with the same ``path`` and
    ``name``: an existing collection is reused rather than raising, and each
    id is overwritten with the document currently at that position. Ids beyond
    ``len(documents)`` left over from an earlier, longer run are not removed.

    Args:
        documents: Text chunks to store, one record per element.
        path: Directory of the persistent Chroma store.
        name: Name of the collection to create or reopen.

    Returns:
        A ``(collection, name)`` tuple.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    # create_collection raises if the collection already exists on disk, which
    # breaks re-running the notebook; get_or_create_collection does not.
    db = chroma_client.get_or_create_collection(
        name=name, embedding_function=GeminiEmbeddingFunction()
    )

    for i, d in enumerate(documents):
        # upsert (not add) so a re-run refreshes the record stored under this
        # id instead of leaving a stale document in place.
        db.upsert(documents=d, ids=str(i))

    return db, name

# Store every text chunk in the persistent "chatbot_rag_collection" collection.
db, name = create_chroma_db(
    chunked_text,
    path=r"../assets/data/chroma",
    name="chatbot_rag_collection",
)