# Basic RAG Pipeline Modularised

This notebook contains a modularised version of the codecamp tutorial code, contained under one callable function that starts the model.

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
import os
import numpy as np
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [None]:
from chathistory_db import ChatHistoryDB

history_db = ChatHistoryDB()

In [2]:
MODEL_NAME = "llama3.2"

I have created a function to start a model, this will be updated to include our vector store of embedded data when the model is started.

In [4]:
def load_docs():
    
    document_loader = []

    for root, dirs, files in os.walk("."):
        # Skip chroma_db folder
        if "faiss" in root or "git" in root:
            continue
        for file in files:
            if file.endswith(".pdf"):
                document_loader.append(file)

    return document_loader

In [5]:
document_loader = load_docs()
document_loader

['ENSC3016_Course_Notes_Part_1_Electromagnetism_Transformers.pdf',
 'Three Phase Power System Fundamentals.pdf',
 'ENSC3016_Course_Notes_Part_2_Electric_Machines.pdf',
 'Electric Machinery Fundamentals Textbook -- Chapman.pdf',
 'ENSC3016 Study Guide 1-Review of Circuit Fundamentals.pdf']

In [3]:
embedding_model ="sentence-transformers/all-MiniLM-L6-v2" #embedding matrix model

def embed_splitting(document_loader, embedding_model):
    embeddings = HuggingFaceEmbeddings(model = embedding_model, encode_kwargs={'normalize_embeddings': True})

    doc_store = []
    for file in document_loader:
        loader = PyPDFLoader(file)
        doc = loader.load()
        doc_store += doc

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size = 400,
        chunk_overlap = 64
        )
    
    #Make splits
    splits = text_splitter.split_documents(doc_store)

    return embeddings, splits


In [6]:
embeddings, splits = embed_splitting(document_loader, embedding_model)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
embeddings

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={'normalize_embeddings': True}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [8]:
len(splits)

402

In [11]:
def cosine_similarity(input):

    input_vec = embeddings.embed_query(input)
    texts = [doc.page_content for doc in splits]
    vectors = embeddings.embed_documents(texts)

    mag_ivec = np.linalg.norm(input_vec)

    dot_product = []
    mag_ovec = []

    for context_vec in vectors:
        dot_product.append(np.dot(input_vec, context_vec))
        mag_ovec.append(np.linalg.norm(context_vec))

    cos_sim = []
    for i in range(len(mag_ovec)):
        mag_ovec[i] *= mag_ivec
        cos_sim.append(dot_product[i]/mag_ovec[i])

    cossim_sort = list(enumerate(cos_sim))
    cossim_sort.sort(key=lambda x: x[1], reverse=True)

    return cossim_sort


In [9]:
import numpy as np

def calculate_cosine_similarity(input1, input2):

    vec1 = embeddings.embed_query(input1)
    vec2 = embeddings.embed_query(input2)
    dot_product = np.dot(vec1, vec2)
    magnitude1 = np.linalg.norm(vec1)
    magnitude2 = np.linalg.norm(vec2)

    # Handle the case where one or both magnitudes are zero to avoid division by zero
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0  # Or handle as appropriate for your application

    return dot_product / (magnitude1 * magnitude2)


In [11]:
calculate_cosine_similarity("Please explain transformers", "How do AC machines work?")

0.3938152084688825

In [41]:
cossim_sort = cosine_similarity("Explain transformers")
print(cossim_sort[:5])

[(106, 0.5388659022778391), (104, 0.500932552907889), (108, 0.4869626486402548), (115, 0.4810365954142485), (144, 0.4785985428899976)]


In [None]:
i = 0
while i < 3:
    print(f"The number {i+1} document is the {cossim_sort[i][0]} chunk, and reads the following: \n\n{splits[cossim_sort[i][0]].page_content}\n")
    i += 1

In [14]:
dim = len(embeddings.embed_query("test sentence"))
index = faiss.IndexFlatL2(dim)

if os.path.exists("faiss_index"):
    print("Loading FAISS index from disk...")
    vector_store = FAISS.load_local("faiss_index", embeddings=embeddings, allow_dangerous_deserialization=True)
else:
    print("Building FAISS index from scratch...")
    dim = len(embeddings.embed_query("test sentence"))
    index = faiss.IndexFlatL2(dim)
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    vector_store.add_documents(splits)
    vector_store.save_local("faiss_index")

Loading FAISS index from disk...


In [29]:
# create the retriever object once
semantic_retriever = vector_store.as_retriever(search_kwargs={'k': 4})

# define your function to query it
def semantic_search(retriever_obj, input_context: str):
    return retriever_obj.invoke(input_context)

# call the function with retriever and query string
results = semantic_search(semantic_retriever, "Explain transformers")

In [30]:
semantic_retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x312e6b310>, search_kwargs={'k': 4})

In [31]:
results

[Document(id='065ed451-14c6-4500-8de0-d852bce2b40a', metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2019-07-27T15:04:48+08:00', 'author': 'Ali Kharrazi', 'moddate': '2019-07-27T15:04:48+08:00', 'source': 'ENSC3016_Course_Notes_Part_1_Electromagnetism_Transformers.pdf', 'total_pages': 76, 'page': 51, 'page_label': '52'}, page_content='Transformer 52 \n \n \n \n   Figure 6-3 Shell-type transformers. \n \n \n \nFigure 6-4 Flux plot: shell-type transformer \n \n \nToroidal transformers exploit the remarkable properties of toroidal coils described in section 3.6. \nAlthough they are more expensive than shell-type transformers, the performance is better. They are used \nin high -quality electronic equipment and for instrument transformers (see section 6.3) where \nmeasurement accuracy is important. Typical toroidal transformers are shown in figure 6-5. \n \nFigure 6-5 Toroidal transformers.\uf020\n \n \n \n6.2 Transformer Principle: \nThe ac

In [None]:
for i, doc in enumerate(results):
    print(i+1, "\n")
    print(doc.page_content, "\n")

In [19]:
bm25_retriever = BM25Retriever.from_documents(splits)
bm25_retriever.k = 4

def bm25_keyword_search_lc(query):
    return bm25_retriever.invoke(query)

In [None]:
keyword_results = bm25_keyword_search_lc("Explan transformers")
for i, doc in enumerate(keyword_results):
    print(f"Document {i+1}:\n{doc.page_content}\n")

In [22]:
ensemble_retriever = EnsembleRetriever(retrievers= [semantic_retriever, bm25_retriever], weights = [0.67, 0.33], search_kwargs={"k": 3})

def hybrid_search(retriever_obj, input_context: str):
    return retriever_obj.invoke(input_context)

hybrid_results = hybrid_search(ensemble_retriever, "Explain transformers")

In [23]:
len(hybrid_results)

7

In [None]:
for i, doc in enumerate(hybrid_results):
    print(f"Document {i+1}:\n{doc.page_content}\n")

In [25]:
#We need to create functions that create embeddings, load documents and split text

In [26]:
def pipeline_combined(model_name = MODEL_NAME):

    llm = OllamaLLM(model = MODEL_NAME)

    template = """You are an expert assistant answering based only on the provided context.

    Here are 3 relevant document chunks retrieved:

    Chunk 1:
    {chunk1}

    Chunk 2:
    {chunk2}

    Chunk 3:
    {chunk3}
    
    Chunk 4:
    {chunk4}
    
    Use all relevant information above to answer the question below. If the answer isn't found in the chunks, say:
    "I cannot answer this question because the necessary information was not found in the provided documents."

    When answering, cite the **source file name** and **slide/page number** if available.

    Question: {question}
    """

    prompt = PromptTemplate.from_template(template)
    chain = prompt | llm
    print(f"\n Model {model_name} has been initiated. Please feel free to ask any questions or type 'exit' to end this session")
    
    while True:
        user_input = input("You:")
        if user_input.lower() in ['exit', 'quit']:
            print("Have a good day.")
            break

        context_docs = hybrid_search(ensemble_retriever, user_input)[:4]

        # Pass context and question into the chain
        chunks = [
            f"Source: {doc.metadata.get('source', 'unknown')}, Page: {doc.metadata.get('page', 'unknown')}\n{doc.page_content}"
            for doc in context_docs
        ]

        response = chain.invoke({
            "chunk1": chunks[0],
            "chunk2": chunks[1],
            "chunk3": chunks[2],
            "chunk4": chunks[3],
            "question": user_input
        })

        print(f"LLM: {response}\n")

In [None]:
pipeline_combined()