Creating a Simple Retrieval-Augmented Generation (RAG) Application Using Open-Source Models

In [17]:
# Custom utility: split a PDF into text chunks


from langchain_core.documents import Document
import PyPDF2
from typing import List
import os

class PDFReader:
    """Read a PDF and split each page's text into overlapping character chunks.

    Attributes:
        file_path: Path of the PDF file to read.
        file_name: Base name of ``file_path``; stored as the ``source`` metadata.
        num_chunks: Number of equal-sized pieces each page is divided into
            before overlap is applied (overlap can yield additional chunks).
        chunk_overlap: Number of characters consecutive chunks share.
    """

    def __init__(self, file_path: str, num_chunks: int, chunk_overlap: int):
        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.num_chunks = num_chunks
        self.chunk_overlap = chunk_overlap

    def split_into_chunks(self, text: str, num_chunks: int, chunk_overlap: int) -> List[str]:
        """Split ``text`` into chunks of ``len(text) // num_chunks`` characters.

        Consecutive chunks start ``chunk_size - chunk_overlap`` characters
        apart. When the overlap is as large as (or larger than) the chunk
        size, the stride is clamped to one character so the window always
        advances and no text is dropped.

        Args:
            text: Text to split; ``None`` or an empty string yields ``[]``.
            num_chunks: Number of equal parts used to derive the chunk size.
            chunk_overlap: Characters shared between consecutive chunks.

        Returns:
            The chunks in document order.

        Raises:
            ValueError: If ``num_chunks`` is less than 1 or ``chunk_overlap``
                is negative.
        """
        if num_chunks < 1:
            raise ValueError("num_chunks must be at least 1")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must not be negative")
        if not text:
            return []

        chunk_size = max(1, len(text) // num_chunks)
        # A stride of 0 makes range() raise and a negative stride yields an
        # empty range, so short pages used to crash or vanish silently.
        step = max(1, chunk_size - chunk_overlap)

        chunks = []
        for start in range(0, len(text), step):
            chunk = text[start:start + chunk_size]
            chunks.append(chunk)
            # A short chunk means the end of the text has been reached.
            if len(chunk) < chunk_size:
                break
        return chunks

    def read_and_split(self) -> List["Document"]:
        """Extract the text of every page and wrap each chunk in a Document.

        Returns:
            One ``Document`` per chunk, with metadata ``page_number``
            (1-based) and ``source`` (the PDF's file name).
        """
        documents = []
        with open(self.file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                # Treat a page without extractable text as empty.
                text = reader.pages[page_num].extract_text() or ""
                for chunk in self.split_into_chunks(text, self.num_chunks, self.chunk_overlap):
                    documents.append(
                        Document(
                            page_content=chunk,
                            metadata={"page_number": page_num + 1, "source": self.file_name},
                        )
                    )
        return documents



# Example run: split every page of sample.pdf with num_chunks=2 and a
# 10-character overlap between consecutive chunks.
pdf_reader = PDFReader(
    file_path="sample.pdf",
    num_chunks=2,
    chunk_overlap=10,
)
documents = pdf_reader.read_and_split()

# Total number of chunk Documents produced across all pages.
len(documents)





33

Init model

In [32]:
import os
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail here with an actionable message instead of passing None on to the
    # OpenAI clients.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file."
    )

MODEL = "gpt-3.5-turbo"
# Chat model used to answer questions.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
# Embedding model used for the vector store; given the same key explicitly so
# both clients are configured the same way.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)



AIMessage(content="The softmax function is a mathematical function that converts a vector of numbers into a probability distribution. It is commonly used in machine learning and neural networks to output probabilities for multiple classes.\n\nThe formula for the softmax function is:\n\n\\[ \\sigma(z)_i = \\frac{e^{z_i}}{\\sum_{j=1}^{K}e^{z_j}} \\]\n\nWhere:\n- \\( \\sigma(z)_i \\) is the output probability for class \\( i \\)\n- \\( e \\) is the base of the natural logarithm (Euler's number)\n- \\( z_i \\) is the input value for class \\( i \\)\n- \\( K \\) is the total number of classes\n\nThe softmax function exponentiates each input value and then normalizes the values by dividing by the sum of all exponentiated values. This results in a probability distribution where the sum of all probabilities is equal to 1.", response_metadata={'token_usage': {'completion_tokens': 186, 'prompt_tokens': 11, 'total_tokens': 197}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_r

Prompting

In [7]:
from langchain.prompts import PromptTemplate

# Prompt text that tells the model to answer from the supplied context only
# and to reply "I don't know" otherwise. {context} and {question} are the
# placeholders filled in when the prompt is formatted.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Render once with placeholder values to preview the resulting prompt text.
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

Parser

In [2]:
from langchain_core.output_parsers import StrOutputParser

# Output parser used as the last step of the chains below, whose sample
# outputs in this notebook are plain strings.
parser = StrOutputParser()

Sample chaining with LangChain

In [9]:
# Compose prompt -> chat model -> output parser into a single runnable.
chain = prompt | model | parser

# Smoke test with hand-written context instead of retrieved documents.
chain.invoke(
    {
        "context": "I like to watch Naruto",
        "question": "What's my favourite show'?",
    }
)

'Your favourite show is Naruto.'

![Sample Image](images/image_1.jpg)

Initialize the vector database

In [15]:
from langchain_chroma import Chroma

# Chroma vector store persisted under ./rag_db, using `embeddings` as its
# embedding function.
database = Chroma(persist_directory="./rag_db", embedding_function=embeddings)

Process PDF file for embeddings

In [11]:
from langchain_community.document_loaders import PyPDFLoader

# Load sample.pdf and split it into Document chunks with the loader's
# load_and_split() defaults.
loader = PyPDFLoader("sample.pdf")
docs = loader.load_and_split()
# Show how many chunks were produced.
print(len(docs))

12


Embed file content to vector DB

In [16]:

# Give every chunk a stable id (source, page, position) so that re-running this
# cell overwrites the stored chunks instead of appending duplicates to ./rag_db.
doc_ids = [
    f"{doc.metadata.get('source', 'doc')}:{doc.metadata.get('page', 0)}:{index}"
    for index, doc in enumerate(docs)
]

# from_documents is a classmethod, so call it on the class rather than on the
# `database` instance. It writes to the same persist directory that `database`
# reads from.
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=doc_ids,
    persist_directory="./rag_db",
)


<langchain_chroma.vectorstores.Chroma object at 0x1342e33a0>


Query the vector database

In [35]:

from operator import itemgetter


# Retrieve the 3 most similar chunks for each question.
retriever = database.as_retriever(search_kwargs={"k": 3})


def format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The dict runs both branches on the same input: "context" looks up the
# question in the vector store and flattens the hits to plain text (instead of
# interpolating the Document reprs into the prompt), while "question" is
# passed through unchanged.
chain = (
    {
        "context": itemgetter("question") | retriever | format_docs,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

chain.invoke({'question':'What is softmax formula'})


'The softmax formula is given by softmax(QK^T / √dk)V.'