# Setting up RAG dependencies

## Installing LangChain

In [1]:
!pip install langchain
!pip install langchain_community



## Setting up and importing libraries

In [3]:
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.vectorstores.utils import filter_complex_metadata

## Creating a class to process PDFs

In [5]:
class ChatPDF:
    """Answer questions about a PDF with a retrieve-then-generate chain.

    Typical use: call ``ingest`` with a PDF path, then ``ask`` with questions.
    ``clear`` resets the instance so that ``ask`` refuses to answer until a
    document is ingested again.
    """

    # Vector store holding the embedded PDF chunks (set by ``ingest``).
    vector_store = None

    # Retriever built on top of the vector store (set by ``ingest``).
    retriever = None

    # Runnable chain: retriever -> prompt -> model -> string parser.
    chain = None

    def __init__(self):
        """Create the chat model, the text splitter and the prompt template."""
        self.model = ChatOllama(model="mistral")
        # Chunks of up to 1024 characters, with 100 characters of overlap
        # between neighbouring chunks.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
        self.prompt = PromptTemplate.from_template(
            """
            <s> [INST] You are an assistant for answering questions. Use the following context elements to answer the question. 
            If you don’t know the answer, simply say that you don’t know. Use a maximum of three sentences and be concise in your response. [/INST] </s> 
            [INST] Question: {question} 
            Context: {context} 
            Answer: [/INST]
            """
        )

    def ingest(self, pdf_file_path: str) -> None:
        """Load a PDF, index its chunks and build the question-answering chain.

        Args:
            pdf_file_path: Path of the PDF file to load.
        """
        # Load the PDF from the specified file path.
        docs = PyPDFLoader(file_path=pdf_file_path).load()

        # Split the documents into smaller chunks.
        chunks = self.text_splitter.split_documents(docs)
        # Presumably strips metadata values the vector store cannot persist;
        # see the langchain documentation of filter_complex_metadata.
        chunks = filter_complex_metadata(chunks)

        # Keep the store on the instance (it used to be a throw-away local) so
        # that ``clear`` has something to release.
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=FastEmbedEmbeddings(),
        )
        # Return at most 3 chunks, and only those scoring at least 0.5.
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 3,
                "score_threshold": 0.5,
            },
        )

        # The retriever fills {context}; the query itself is passed through
        # unchanged as {question}.
        self.chain = (
            {"context": self.retriever, "question": RunnablePassthrough()}
            | self.prompt
            | self.model
            | StrOutputParser()
        )

    def ask(self, query: str) -> str:
        """Answer ``query`` using the ingested document.

        Args:
            query: The question to answer.

        Returns:
            The chain's output, or a fixed hint message when no chain has been
            built yet (nothing ingested, or ``clear`` was called).
        """
        if self.chain is None:
            return "Please, add a PDF document first"

        return self.chain.invoke(query)

    def clear(self) -> None:
        """Drop the indexed document and reset the instance.

        NOTE(review): Chroma's default in-memory client appears to be shared
        process-wide, so merely dropping the reference would presumably leave
        the old chunks in the default collection where a later ``ingest`` could
        retrieve them. Confirm against the installed chromadb version.
        """
        if self.vector_store is not None:
            self.vector_store.delete_collection()
        self.vector_store = None
        self.retriever = None
        self.chain = None