# Basic RAG Pipeline Modularised

This notebook contains a modularised version of the codecamp tutorial code, contained under one callable function that starts the model.

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import ollama
from langchain_ollama import OllamaLLM
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
import os

In [None]:
MODEL_NAME = "llama3.2"

I have created a function to start a model, this will be updated to include our vector store of embedded data when the model is started.

In [3]:
def load_docs():
    
    document_loader = []

    for root, dirs, files in os.walk("."):
        # Skip chroma_db folder
        if "chroma_db" in root or "git" in root:
            continue
        for file in files:
            if file.endswith(".pdf"):
                document_loader.append(file)

    return document_loader

In [4]:
document_loader = load_docs()
document_loader

['Lecture 4 Faraday’s law and induction.pdf',
 'Lecture 1 Magnetic fields and magnetic force.pdf']

In [5]:
embedding_model ="sentence-transformers/all-MiniLM-L6-v2" #embedding matrix model

def embed_splitting(document_loader, embedding_model):
    embeddings = HuggingFaceEmbeddings(model = embedding_model, encode_kwargs={'normalize_embeddings': True})

    doc_store = []
    for file in document_loader:
        loader = PyPDFLoader(file)
        doc = loader.load()
        doc_store += doc

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size = 400,
        chunk_overlap = 64
        )
    
    #Make splits
    splits = text_splitter.split_documents(doc_store)

    return embeddings, splits

In [6]:
embeddings, splits = embed_splitting(document_loader, embedding_model)

vectorstore = Chroma.from_documents(
        documents=splits,  # these are already LangChain `Document` objects
        embedding=embeddings,
        collection_name="circuit_docs",
        persist_directory="./chroma_db"
    )


In [7]:
# create the retriever object once
retriever = vectorstore.as_retriever(search_kwargs={'k': 3})

# define your function to query it
def context_retriever(retriever_obj, input_context: str):
    return retriever_obj.invoke(input_context)

# call the function with retriever and query string
results = context_retriever(retriever, "Explain Faraday's and Lenz's law")

In [8]:
print(results)

[Document(id='7f428148-0968-4f8d-9913-8cf3bac1cca1', metadata={'total_pages': 10, 'page': 0, 'creationdate': "D:20230224061645Z00'00'", 'moddate': "D:20230311062840Z00'00'", 'author': 'David Gozzard', 'creator': 'Microsoft® Word 2016', 'source': 'Lecture 4 Faraday’s law and induction.pdf', 'page_label': '1', 'producer': 'macOS Version 12.5 (Build 21G72) Quartz PDFContext, AppendMode 1.1'}, page_content='PHYS2003: Physics for Electrical Engineers, semester 1 2023 \nLecture 4 \n1 \n \nLecture 4: Faraday’s law and induction \nLearning outcomes \nAssessable \n• Use Faraday’s law and Lenz’s law to calculate induced current and emf. \n• Calculate energy transfer due to induction. \nUnderstanding \n• Visualize and conceptually explain induced currents and electric fields. \n1. Summary \nThis lecture will cover Faraday’s law, Lenz’s law, and induction and energy transfer. \nConsider a loop of conducting wire connected to an ammeter. If we move a bar magnet \ntowards the loop, we will observe: 

In [None]:
#We need to create functions that create embeddings, load documents and split text

In [None]:
def pipeline_combined(model_name = MODEL_NAME):

    llm = OllamaLLM(model = MODEL_NAME)

    template = """Answer the following question only using the following context:
    {context}

    If the answer is not contained in the context, respond with:
    "I cannot answer this question because the necessary information was not found in the provided documents."

    When answering, include the **source file name** and **slide/page number** if available.

    Question: {question}
    """

    prompt = PromptTemplate.from_template(template)
    chain = prompt | llm
    print(f"\n Model {model_name} has been initiated. Please feel free to ask any questions or type 'exit' to end this session")
    
    while True:
        user_input = input("You:")
        if user_input.lower() in ['exit', 'quit']:
            print("Have a good day.")
            break

        context_docs = context_retriever(retriever, user_input)

        context = "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}, Page: {doc.metadata.get('page', 'unknown')}\n{doc.page_content}"
        for doc in context_docs
        )

        # Pass context and question into the chain
        response = chain.invoke({
            "context": context,
            "question": user_input
        })

        print(f"LLM: {response}\n")

In [None]:
pipeline_combined()