In [2]:
! pip install -q -r requirements.txt

In [1]:
import sys
import os
import glob
import importlib
from typing import List, Optional
import chromadb

import gemini_auth
importlib.reload(gemini_auth)


from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, DocArrayInMemorySearch
from langchain.chains import RetrievalQA


# Folder containing the PDF documents to index.
DOC_FOLDER = './doc'
# Passed as chunk_size to RecursiveCharacterTextSplitter in split_docs.
CHUNK_SIZE = 1000
# Passed as chunk_overlap to RecursiveCharacterTextSplitter in split_docs.
CHUNK_OVERLAP = 150
# Passed as persist_directory to Chroma (location of the on-disk vector store).
VECTOR_DIR = './out/vector_dir'


def get_files(doc_folder):
    '''
    List the PDF files located directly inside a folder.
    inputs:
        1. doc_folder: folder to scan (not searched recursively)
    output: list of matching paths, in the order glob yields them
    '''
    pdf_pattern = '{}/*.pdf'.format(doc_folder)
    matches = glob.glob(pdf_pattern)
    return matches

def split_docs(files):
    '''
    Load every PDF in `files` and split its pages into overlapping chunks.
    inputs:
        1. files: iterable of PDF file paths
    output: docs: flat list with the chunks of all files, in input order
    '''
    # The splitter configuration is the same for every file, so build it once
    # instead of re-creating it on each loop iteration.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    docs = []
    for file in files:
        # load pdf documents
        documents = PyPDFLoader(file).load()

        # split the documents into chunks
        docs.extend(text_splitter.split_documents(documents))

    return docs

# Wrapper class that converts the output of the embedding class to plain lists.
class GeminiEmbeddings(GoogleGenerativeAIEmbeddings):
    '''
    GoogleGenerativeAIEmbeddings variant whose embed_documents returns
    plain Python lists (a list of lists of values).
    '''

    def embed_documents(self, texts: List[str],
                        task_type: Optional[str] = None,
                        titles: Optional[List[str]] = None,
                        output_dimensionality: Optional[int] = None) -> List[List[float]]:
        '''
        Embed `texts` with the parent class and return each vector as a list.
        inputs:
            1. texts: texts to embed
            2. task_type, titles, output_dimensionality: forwarded unchanged
               to the parent implementation
        output: one list per input text
        '''
        # Forward the optional arguments by keyword so the call does not
        # depend on their position in the parent signature.
        embeddings = super().embed_documents(
            texts,
            task_type=task_type,
            titles=titles,
            output_dimensionality=output_dimensionality,
        )
        # Convert Repeated type to list type (per the original author's note,
        # the parent may return non-list sequence containers).
        return [list(vector) for vector in embeddings]

def add_data_into_vector_db(docs, embedding_function):
    '''
    Embed the chunks and store them in the Chroma db persisted at VECTOR_DIR.
    inputs:
        1. docs: document chunks to index
        2. embedding_function: embedding object used to embed the chunks
    output: vectordb: the Chroma vector store that was created
    '''
    vectordb = Chroma.from_documents(
        docs, embedding_function,
        persist_directory=VECTOR_DIR
    )
    # Hand the store back instead of discarding it; callers that ignore the
    # return value are unaffected.
    return vectordb


def retirve_chunks(question, embembedding_function):
    '''
    collect similar chunks from the existing vector db stored at VECTOR_DIR
    inputs:
        1. question: text to search for
        2. embembedding_function: embedding object used to open the db and
           embed the question (the misspelled name is kept so existing
           keyword callers keep working)
    output: docs: the 3 most similar docs
    '''
    # Open the persisted store with the constructor: Chroma.from_documents
    # builds a new store and requires the documents to index.
    vectordb = Chroma(
        persist_directory=VECTOR_DIR,
        # Use the argument that was passed in, not a module-level global.
        embedding_function=embembedding_function
    )

    docs = vectordb.similarity_search(question, k=3)
    return docs

def retrieve_info(llm, question):
    '''
    Answer a question with a RetrievalQA chain over the existing vector db.
    inputs:
        1. llm: chat model used by the QA chain
        2. question: question to answer
    output: the 'result' entry of the chain output

    NOTE: relies on a module-level `embedding_function` being defined
    before the call (the __main__ block creates it).
    '''
    # Open the persisted store with the constructor: Chroma.from_documents
    # builds a new store and requires the documents to index.
    vectordb = Chroma(
        persist_directory=VECTOR_DIR,
        embedding_function=embedding_function
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm, retriever=vectordb.as_retriever()
    )

    result = qa_chain({'query': question})
    return result['result']


if __name__ == '__main__':
    # Script entry point: make sure the vector store exists, then answer one
    # sample question against it.

    question = 'what is the scope of this document'

    # Module-level name: retrieve_info reads `embedding_function` as a global.
    embedding_function = GeminiEmbeddings(model="models/embedding-001")
    llm = ChatGoogleGenerativeAI(model='gemini-pro')

    # Index the PDFs only when no persisted store exists yet: this avoids
    # re-loading every PDF on each run and avoids adding the same chunks to
    # an existing store again.
    if not os.path.isdir(VECTOR_DIR):
        files = get_files(DOC_FOLDER)
        docs = split_docs(files)
        if docs:
            add_data_into_vector_db(docs, embedding_function)

    answer = retrieve_info(llm, question)
    print(answer)
  


  from .autonotebook import tqdm as notebook_tqdm
