# RAG Pipeline With History

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import os
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [None]:
from session_history import SessionLocal, SessionMemoryTableOps, InSessionMemoryOps 
from datetime import datetime
import uuid
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

In [5]:
# Ollama model name; also printed in the banner shown by pipeline_combined().
MODEL_NAME = "llama3.2"
# LLM handle passed to create_history_aware_retriever further down.
llm = OllamaLLM(model= MODEL_NAME)

In [6]:
def load_docs():
    """Collect the paths of all PDF files below the current working directory.

    Any directory whose walked path contains the substring ``faiss`` or
    ``git`` is skipped (this covers the saved ``faiss_index`` folder and
    ``.git`` metadata).

    Returns:
        list[str]: One path per PDF, relative to the current working
        directory. Top-level files are returned as a bare file name;
        files in sub-directories keep their directory part so they can be
        opened directly.
    """
    pdf_paths = []

    for root, _dirs, files in os.walk("."):
        # Skip the FAISS index folder and git metadata (substring match).
        if "faiss" in root or "git" in root:
            continue
        for file in files:
            if file.endswith(".pdf"):
                # Keep the directory component: a bare file name only resolves
                # for PDFs sitting directly in the working directory.
                # normpath strips the leading "./" so top-level entries stay
                # plain file names.
                pdf_paths.append(os.path.normpath(os.path.join(root, file)))

    return pdf_paths

In [7]:
document_loader = load_docs()
document_loader

['ENSC3016_Course_Notes_Part_1_Electromagnetism_Transformers.pdf',
 'ENSC3016_Course_Notes_Part_2_Electric_Machines.pdf',
 'Electric Machinery Fundamentals Textbook -- Chapman.pdf',
 'ENSC3016 Study Guide 1-Review of Circuit Fundamentals.pdf',
 'Three Phase Power System Fundamentals.pdf']

In [None]:
from sentence_transformers import SentenceTransformer

# Load the MiniLM sentence-embedding model by its hub id (presumably downloaded
# on first use) and save a copy under ./local_models so it can be loaded from
# that local path afterwards.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model.save("./local_models/all-MiniLM-L6-v2")

In [9]:
embedding_model = "./local_models/all-MiniLM-L6-v2"  # local path of the embedding model


def embed_splitting(document_loader, embedding_model):
    """Build the embedding wrapper and split every PDF into token-sized chunks.

    Args:
        document_loader: Iterable of PDF paths; each one is read with
            PyPDFLoader.
        embedding_model: Model name or local path handed to
            HuggingFaceEmbeddings.

    Returns:
        tuple: ``(embeddings, splits)`` where ``embeddings`` is the
        HuggingFaceEmbeddings instance (configured to normalize its vectors)
        and ``splits`` is the list of chunked documents.
    """
    embeddings = HuggingFaceEmbeddings(
        model=embedding_model,
        encode_kwargs={"normalize_embeddings": True},
    )

    # Gather the loaded documents of all PDFs into one flat list.
    pages = []
    for pdf_path in document_loader:
        pages.extend(PyPDFLoader(pdf_path).load())

    # Chunk sizes are measured in tiktoken tokens, not characters.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=400,
        chunk_overlap=64,
    )

    return embeddings, splitter.split_documents(pages)


In [10]:
embeddings, splits = embed_splitting(document_loader, embedding_model)

In [11]:
splits[0]

Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2019-07-27T15:04:48+08:00', 'author': 'Ali Kharrazi', 'moddate': '2019-07-27T15:04:48+08:00', 'source': 'ENSC3016_Course_Notes_Part_1_Electromagnetism_Transformers.pdf', 'total_pages': 76, 'page': 0, 'page_label': '1'}, page_content='Electrical \nMachines and \nSystems \n \n \n \nPart 1:  Electromagnetism \nand Transformers')

In [12]:
embeddings

HuggingFaceEmbeddings(model_name='./local_models/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={'normalize_embeddings': True}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [13]:
len(splits)

402

In [14]:
# Embed a probe sentence to learn the vector width, then size an empty
# flat L2 index to it (only used when the store is built from scratch).
dim = len(embeddings.embed_query("test sentence"))
index = faiss.IndexFlatL2(dim)

if not os.path.exists("faiss_index"):
    print("Building FAISS index from scratch...")
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    vector_store.add_documents(splits)
    # Persist the freshly built store so later runs can load it instead.
    vector_store.save_local("faiss_index")
else:
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk; only point this at an index directory you
    # created yourself. An existing index is reused as-is, even if the PDFs
    # have changed since it was saved.
    print("Loading FAISS index from disk...")
    vector_store = FAISS.load_local(
        "faiss_index",
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )

Loading FAISS index from disk...


In [15]:
# Create the FAISS-backed retriever object once; search_kwargs asks for k=4
# results per query.
semantic_retriever = vector_store.as_retriever(search_kwargs={'k': 4})

In [16]:
semantic_retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x13c35ce60>, search_kwargs={'k': 4})

In [17]:
# BM25 retriever built in memory from `splits`; k is set to 4, the same value
# used for the FAISS retriever.
bm25_retriever = BM25Retriever.from_documents(splits)
bm25_retriever.k = 4

In [18]:
# Hybrid retrieval: combine the FAISS and BM25 retrievers, weighted 0.6 / 0.4.
# The former `search_kwargs={"k": 3}` argument was dropped: it is not a
# declared EnsembleRetriever field, so it never limited the number of results.
# How many documents each source contributes is set by that retriever's own k.
ensemble_retriever = EnsembleRetriever(
    retrievers=[semantic_retriever, bm25_retriever],
    weights=[0.6, 0.4],
)

In [None]:
# System instruction for the query-rewriting step: turn a follow-up question
# into a standalone one without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message layout: system instruction, prior turns, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Retriever that takes "input" and "chat_history" and queries the ensemble
# retriever using the contextualize prompt and the LLM.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=ensemble_retriever,
    prompt=contextualize_q_prompt,
)

In [23]:
type(history_aware_retriever)

langchain_core.runnables.base.RunnableBinding

In [24]:
# Smoke-test the history-aware retriever with a single question and an empty
# chat history; the returned documents are printed in a later cell.
user_input = "Explain transformers"
history_results = history_aware_retriever.invoke({"input": user_input, "chat_history": []})

{'producer': 'Microsoft® Word 2013',
 'creator': 'Microsoft® Word 2013',
 'creationdate': '2019-07-27T15:04:48+08:00',
 'author': 'Ali Kharrazi',
 'moddate': '2019-07-27T15:04:48+08:00',
 'source': 'ENSC3016_Course_Notes_Part_1_Electromagnetism_Transformers.pdf',
 'total_pages': 76,
 'page': 51,
 'page_label': '52'}

In [None]:
# Show every retrieved chunk with its 1-based position and source file.
for rank, doc in enumerate(history_results, start=1):
    source = doc.metadata.get("source", "unknown")
    print(f"\nDocument {rank}")
    print(f"Source: {source}")
    print(f"Content:\n{doc.page_content}")

In [None]:
# System prompt for the answering step. {context} is the only template
# variable; this text becomes the system message of prompt_template.
user_input_prompt = """You are an expert assistant answering based only on the provided context.

    Context:
    {context}
    
    Use all relevant information above to answer the question below. If the answer isn't found in the chunks, say:
    "I cannot answer this question because the necessary information was not found in the provided documents."

    When answering, cite the **source file name** and **slide/page number** if available.
    """

In [None]:
# Answering prompt: system instructions (with {context}), the prior chat
# turns, then the user's latest input.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", user_input_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [None]:
# Session store built from the project's SessionLocal (presumably
# database-backed; see the session_history module). Used by
# get_session_history and pipeline_combined.
history = SessionMemoryTableOps(SessionLocal)

In [None]:
### Statefully manage chat history ###

# Imported in this cell so the chain construction below is self-contained.
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# `rag_chain` was referenced below but never defined in this notebook.
# Build it here: the answering step fills {context} in prompt_template with
# the retrieved documents, and the retrieval chain feeds it from the
# history-aware retriever. Its output dict carries the reply under "answer".
question_answer_chain = create_stuff_documents_chain(llm, prompt_template)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# One history object per session id, created lazily and reused afterwards.
chat_history_cache = {}


def get_session_history(session_id: str):
    """Return the chat-history object for ``session_id``, creating it on first use.

    Args:
        session_id: Identifier of the conversation.

    Returns:
        The cached InSessionMemoryOps instance for this session, constructed
        with the module-level ``history`` store as its ``db``.
    """
    if session_id not in chat_history_cache:
        chat_history_cache[session_id] = InSessionMemoryOps(session_id, db=history)
    return chat_history_cache[session_id]


# Wrap the RAG chain so each call reads and records chat history for the
# session id supplied in the call's config.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [None]:
import time

In [None]:
def _select_session():
    """Prompt until a new session is started or an existing one is resumed; return its id."""
    while True:
        session_id = input("Enter session ID to resume, or press Enter to start new: ").strip()
        if not session_id:
            # Blank entry: start a fresh session with a short random id and
            # register it in the session store.
            session_id = str(uuid.uuid4())[:8]
            print(f"Starting new session: {session_id}")
            history.add_session(session_id=session_id, turns_used=0)
            return session_id
        if history.session_exists(session_id):
            print(f"Resuming session: {session_id}")
            return session_id
        print(f"Session ID '{session_id}' not found. Please try again.")


def pipeline_combined():
    """Run an interactive chat loop against the conversational RAG chain.

    First asks for a session id (blank starts a new session). Then reads
    questions from standard input until the user types ``exit`` or ``quit``,
    printing each answer together with the time the chain took to respond.

    Blank input is ignored rather than sent to the model, and surrounding
    whitespace is stripped before the exit keywords are checked.

    Returns:
        None
    """
    session_id = _select_session()

    print(f"\nModel {MODEL_NAME} has been initiated with memory. Please feel free to ask questions or type 'exit' to quit.")
    while True:
        user_input = input("You: ").strip()

        if not user_input:
            # Nothing to ask; prompt again instead of invoking the chain.
            continue

        if user_input.lower() in ("exit", "quit"):
            print("Session ended. Have a good day.")
            break

        start_time = time.time()
        print(f"Time is {start_time}")

        response = conversational_rag_chain.invoke(
            {"input": user_input},
            config={"configurable": {"session_id": session_id}},
        )
        # Stop the clock right after the chain returns so printing the answer
        # is not counted in the response time.
        response_time = time.time() - start_time

        print(f"LLM: {response['answer']}\n")
        print(f"The response time for this prompt is {round(response_time, 2)} seconds")

        # Note: the memory is managed by the chain via get_session_history,
        # so messages do not need to be added manually here.

In [None]:
pipeline_combined()

In [None]:
conversational_rag_chain.invoke(
    {"input": "Explain transformers?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

In [None]:
conversational_rag_chain.invoke(
    {"input": "What materials are they made of?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

In [None]:
print(get_session_history("abc123").messages)