<a href="https://colab.research.google.com/github/almutareb/advanced-rag-system-anatomy/blob/main/Core_Advanced_RAG_components.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install requirements

In [None]:
# Standard-library imports. `os` is used further down to read the Hugging Face
# token from the environment; `sys` is not referenced in the cells shown here.
import sys
import os
# Install the notebook's dependencies. Every pip call runs quietly (-q), upgrades
# to the newest release (-U) unless a version is pinned, and discards stdout.
# Core LangChain packages
!pip install -qU langchain langchain-community --no-warn-script-location > /dev/null
# HTML parsing for the scraped documentation pages
!pip install -qU beautifulsoup4 --no-warn-script-location > /dev/null
# CPU build of FAISS, used as the vector store
!pip install -qU faiss-cpu --no-warn-script-location > /dev/null
# Alternative: use the GPU-optimized build of FAISS for better performance
#!pip install -qU faiss-gpu --no-warn-script-location > /dev/null
# NOTE(review): chromadb and validators are not imported in the cells shown here - confirm they are still needed
!pip install -qU chromadb --no-warn-script-location > /dev/null
!pip install -qU validators --no-warn-script-location > /dev/null
# Embedding model runtime; typing-extensions is pinned to 4.8.0
!pip install -qU sentence_transformers typing-extensions==4.8.0 unstructured --no-warn-script-location > /dev/null
# Chat UI; gradio is pinned to 3.48.0
!pip install -qU gradio==3.48.0 --no-warn-script-location > /dev/null

Download Documents

In [None]:
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup

# List of URLs to scrape.
# Each entry needs its own trailing comma: two adjacent string literals without
# one are silently concatenated by Python into a single (invalid) URL.
urls = [
    "https://langchain-doc.readthedocs.io/en/latest",
    "https://python.langchain.com/docs/get_started",
]

# How many link levels to follow from each start URL.
# 4 keeps the demo reasonably fast; use at least 5 for real-scenario results.
MAX_DEPTH = 4

# Collect the documents downloaded from every start URL
docs = []
# Looping through each URL in the list - this could take some time!
for url in urls:
  # Crawl recursively and reduce every fetched HTML page to its plain text
  loader = RecursiveUrlLoader(url=url, max_depth=MAX_DEPTH, extractor=lambda x: Soup(x, "html.parser").text)
  docs.extend(loader.load())
print(f'Downloaded a total of {len(docs)} documents')

Chunking documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time

# Splitter configuration: 500-character chunks, with 50 characters shared
# between neighbouring chunks so context is not cut off at the boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Stage one: read all the docs, split them into chunks.
st = time.time()  # reference point for the timing report below
print('Loading documents ...')

# Page texts and their metadata are passed as two parallel lists so that every
# chunk carries the metadata of the document it was cut from.
page_texts = [doc.page_content for doc in docs]
page_metadatas = [doc.metadata for doc in docs]
chunks = text_splitter.create_documents(page_texts, metadatas=page_metadatas)

et = time.time() - st  # elapsed seconds spent splitting
print(f'created {len(chunks)} chunks in {et} seconds.')

Build VectorStore: Vectorization

In [None]:
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import filter_complex_metadata
from langchain.embeddings import HuggingFaceEmbeddings

# Folder the FAISS index is written to (and later loaded from)
FAISS_INDEX_PATH = "./vectorstore/lc-faiss-multi-mpnet-500"

# Stage two: embed the docs.
# The multi-qa-mpnet-base-dot-v1 sentence transformer converts each piece of
# text into a vector that can be stored in the vector store.
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

# Optional GPU setup (e.g. a T4 - requires extended RAM): uncomment the line
# below and pass `model_kwargs=model_kwargs` to HuggingFaceEmbeddings.
#model_kwargs = {"device": "cuda"}

# Embedding function backed by the model chosen above
embeddings = HuggingFaceEmbeddings(model_name=model_name)

print('Loading chunks into vector store ...')
st = time.time()  # reference point for the timing report below

# Embed the metadata-filtered chunks into a FAISS index, then persist it
db = FAISS.from_documents(
    documents=filter_complex_metadata(chunks),
    embedding=embeddings,
)
db.save_local(FAISS_INDEX_PATH)

et = time.time() - st
print(f'Time taken: {et} seconds.')

Load LLM

In [None]:
from dotenv import load_dotenv
# HF libraries
from langchain.llms import HuggingFaceHub

# Load environment variables from a .env file in the working directory
CONFIG = load_dotenv(".env")

# Retrieve the Hugging Face API token from the environment (read once).
# NOTE(review): the token is not passed to HuggingFaceHub explicitly; the class
# is expected to pick it up from HUGGINGFACEHUB_API_TOKEN itself - confirm.
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# Load the model from the Hugging Face Hub.
# Despite its name, `model_id` holds the LLM object that the retrieval chain uses.
model_id = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", model_kwargs={
    "temperature":0.1,
    "max_new_tokens":1024,
    "repetition_penalty":1.2,
    "return_full_text":False
    })


Retriever

In [None]:
from langchain.embeddings import HuggingFaceHubEmbeddings
# vectorstore
from langchain.vectorstores import FAISS

# Re-open the FAISS index persisted earlier, using the same embedding function
# that built it.
# NOTE(review): recent langchain releases may require
# allow_dangerous_deserialization=True here - confirm against the installed version.
db = FAISS.load_local(folder_path=FAISS_INDEX_PATH, embeddings=embeddings)

# Expose the vector store as a retriever for the RAG pipeline
retriever = db.as_retriever()

Template and Chat logic

In [None]:
# retrieval chain
from langchain.chains import RetrievalQA
# prompt template
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory


# Prompt with three placeholders: the retrieved context, the running chat
# history and the user's question.
template = """
You are the friendly documentation buddy Arti, who helps novice programmers in using LangChain with simple explanations and examples.\
    Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the question :
------
<ctx>
{context}
</ctx>
------
<hs>
{history}
</hs>
------
{question}
Answer:
"""
# Create a PromptTemplate object; the input variables are derived from the template
prompt = PromptTemplate.from_template(
    template=template,
)
# Smoke test only: the result is discarded, but formatting fails loudly if the
# template expects a variable other than these three.
prompt.format(context="context", history="history", question="question")

# Memory buffer that fills the {history} placeholder; "question" is the input
# key it records for each turn.
memory = ConversationBufferMemory(memory_key="history", input_key="question")

# Initialize the RetrievalQA object with the specified model, retriever and
# additional configurations. The prompt and memory are handed to the inner
# "stuff" chain via chain_type_kwargs.
qa = RetrievalQA.from_chain_type(
    llm=model_id,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    return_source_documents=True,
    chain_type_kwargs={
        "verbose": True,
        "memory": memory,
        "prompt": prompt,
    },
)

UI - Gradio

In [None]:
# One-off test of the chain with an empty chat history
history = []
query = "draft a function to calculate a mxn matrix"
question = query
response = qa(
    {
        "query": query,
        "history": history,
        "question": question,
    }
)
# Unpacking the response dict prints its keys, i.e. the fields that came back
print(*response)

In [None]:
# Show only the generated answer from the previous cell's response
print(response["result"])

In [None]:
import gradio as gr

# Function to add a new input to the chat history
def add_text(history, text):
    """Return a copy of ``history`` extended by the user's new message.

    The new row is ``(text, None)``; ``None`` is the placeholder for the
    response that has not been produced yet. The caller's list is left
    untouched. The second return value is an empty string, which the UI
    uses to clear the question textbox.
    """
    pending_row = (text, None)
    extended_history = history + [pending_row]
    return extended_history, ""

# Function representing the bot's response mechanism
def bot(history):
    """Answer the most recent user message and record the reply in ``history``.

    The last row of ``history`` holds the pending question in its first slot.
    That question is sent to ``infer`` and the row is replaced by
    ``[question, answer]``. ``history`` is modified in place and returned.
    """
    user_message = history[-1][0]
    response = infer(user_message, history)
    # Replace the whole row instead of assigning to history[-1][1]: add_text
    # appends (text, None) tuples, and tuples do not support item assignment.
    history[-1] = [user_message, response['result']]
    return history

# Function to infer the response using the RAG model
def infer(question, history):
    """Run the retrieval chain for ``question`` and return its raw result.

    The question is supplied under both the "query" and "question" keys,
    together with the chat ``history``.
    """
    return qa({"query": question, "history": history, "question": question})

# Assemble the Gradio chat interface
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        chatbot = gr.Chatbot([], elem_id="chatbot")
        clear = gr.Button("Clear")

        # Row holding the question input
        with gr.Row():
            question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter ")

    # On submit: first add the question to the chat and empty the textbox
    # (unqueued), then let the bot fill in the answer.
    submit_event = question.submit(
        fn=add_text,
        inputs=[chatbot, question],
        outputs=[chatbot, question],
        queue=False,
    )
    submit_event.then(fn=bot, inputs=chatbot, outputs=chatbot)

    # The clear button resets the chat by sending None to the chatbot
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

# Start the demo locally, without a public share link
demo.launch(share=False)