In [2]:
from dotenv import load_dotenv
import os

# Read variables from the local .env file, then require the Groq key to be set.
load_dotenv()
groq_api_key = os.environ.get('GROQ_API_KEY')
if groq_api_key is None or groq_api_key == "":
    raise ValueError("Groq API key not found in .env file")

: 

In [2]:
# imports for processing text
import os
import pickle
import re

# imports for creating pipeline for rag
from langchain.embeddings import HuggingFaceEmbeddings      # for embeddings
from langchain.vectorstores import Chroma                  # for vector store
from langchain.document_loaders import TextLoader        # for loading text
from langchain.text_splitter import RecursiveCharacterTextSplitter  # acting as base class for splitting text 
from langchain_groq import ChatGroq                         # for initializing LLM from groq
from langchain.schema import Document                   # converting simple text to document object

# For compressing the context of retrieved documents
from langchain.retrievers import ContextualCompressionRetriever     # for compressing retrieved documents context
from langchain.retrievers.document_compressors import LLMChainExtractor     # used in compression

# For creating the retrieval chain for chat history
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# For creating history aware retriever
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage     # for messages in history aware retriever


# Extra for now
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [3]:
# Embedding model shared by the rest of the notebook (HuggingFace MiniLM).
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [4]:
def load_documents(file_path):
    """Load a text file through ``TextLoader`` and return the loaded documents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Whatever ``TextLoader(file_path).load()`` returns (the notebook indexes
        the first element and reads its ``page_content``).
    """
    # Use the caller-supplied path; previously a hard-coded path was always
    # loaded and the argument was silently ignored.
    loader = TextLoader(file_path)
    return loader.load()

In [5]:
# Custom RecursiveCharacterTextSplitter with regex patterns for subtopics and chapters
class CustomTextSplitter(RecursiveCharacterTextSplitter):
    """Split a textbook into chapter chunks, then into subtopic chunks.

    Chapters are delimited by a fixed "chapter end" marker line. Inside each
    chapter chunk, every dotted number such as ``1.2`` or ``3.4.1`` starts a
    new subtopic.

    NOTE: unlike the base class, ``split_text`` returns ``Document`` objects
    (with ``chapter`` / ``subtopic`` metadata) rather than plain strings.
    NOTE: the subtopic pattern matches *any* dotted number, so a decimal value
    inside running text (e.g. ``3.14``) also starts a new chunk.
    """

    def __init__(self, **kwargs):
        """Configure the chapter separator; ``kwargs`` go to the base splitter."""
        chapter_separator = 'chapter end -------------------------------------'

        # The chapter marker is the only separator handed to the base class.
        super().__init__(separators=[chapter_separator], **kwargs)
        self.subtopic_pattern = re.compile(r'(\d+(\.\d+)+)')

    def split_text(self, text):
        """Return subtopic-level ``Document`` objects for ``text``.

        The base splitter produces the chapter-level chunks; each chunk is
        numbered from 1 in the order returned and split further by subtopic.
        """
        documents = []
        for chapter_number, chapter in enumerate(super().split_text(text), start=1):
            documents.extend(self._split_by_subtopic(chapter, chapter_number))
        return documents

    def _split_by_subtopic(self, text, chapter_number):
        """Cut one chapter chunk at every subtopic-number match.

        Args:
            text: Text of a single chapter chunk.
            chapter_number: 1-based position of the chunk, stored in metadata.

        Returns:
            A list of ``Document`` objects. Chunks that are empty after
            stripping whitespace are skipped so blank documents never reach
            the vector store; the numbering of the remaining chunks is
            unaffected by the skipped ones.
        """
        matches = list(self.subtopic_pattern.finditer(text))
        if not matches:
            # No subtopic numbers: keep the chunk whole, unless it is blank.
            content = text.strip()
            if not content:
                return []
            return [Document(page_content=content, metadata={"chapter": chapter_number})]

        subtopics = []
        start_idx = 0
        subtopic_number = 1

        # Each match start closes the previous piece; len(text) closes the last.
        boundaries = [match.start() for match in matches] + [len(text)]
        for end_idx in boundaries:
            content = text[start_idx:end_idx].strip()
            if content:
                subtopics.append(Document(
                    page_content=content,
                    metadata={"chapter": chapter_number, "subtopic": subtopic_number}
                ))
            start_idx = end_idx
            # The counter advances per boundary, even when a piece was skipped.
            subtopic_number += 1

        return subtopics


In [6]:
# Create embeddings and handle storage
def embed_documents(split_docs, embedding_model):
    """Embed the chunks, caching the vectors in ``embeddings/emb01.pkl``.

    Args:
        split_docs: Iterable of objects exposing ``page_content``.
        embedding_model: Object exposing ``embed_documents(texts)``.

    Returns:
        The embeddings, either read from the cache file or freshly computed.
        The cache is reused only when it can be unpickled and holds exactly one
        entry per chunk; otherwise it is rebuilt and overwritten.
    """
    EMBEDDINGS_FOLDER = "embeddings"
    EMBEDDINGS_FILE = os.path.join(EMBEDDINGS_FOLDER, "emb01.pkl")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(EMBEDDINGS_FOLDER, exist_ok=True)

    texts = [doc.page_content for doc in split_docs]

    if os.path.exists(EMBEDDINGS_FILE):
        print(f"Loading existing embeddings from {EMBEDDINGS_FILE}...")
        try:
            # SECURITY: pickle.load can execute arbitrary code. Only load a
            # cache file this notebook wrote itself, never one from elsewhere.
            with open(EMBEDDINGS_FILE, 'rb') as f:
                embedded_docs = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # A truncated or corrupt cache must not block every later run.
            embedded_docs = None

        # A cache with a different number of vectors was built from other chunks.
        if hasattr(embedded_docs, "__len__") and len(embedded_docs) == len(texts):
            print("Embeddings loaded successfully.")
            return embedded_docs
        print("Cached embeddings are unreadable or do not match the documents; recreating...")

    print("Creating new embeddings...")
    embedded_docs = embedding_model.embed_documents(texts)

    with open(EMBEDDINGS_FILE, 'wb') as f:
        pickle.dump(embedded_docs, f)
    print(f"Embeddings saved to {EMBEDDINGS_FILE}")

    return embedded_docs


In [7]:
# Store embeddings in Chroma vector store
def store_embeddings(split_docs, embedding_model):
    """Build a Chroma vector store from the chunks and return it.

    Args:
        split_docs: Documents to index.
        embedding_model: Embedding object passed to Chroma for the documents.
    """
    return Chroma.from_documents(documents=split_docs, embedding=embedding_model)

In [8]:
def getting_retriever(llm, vector_store):
    """Build a contextual-compression retriever on top of the vector store.

    Option 01 (implemented): an ``LLMChainExtractor`` created from ``llm`` is
    used as the compressor, so only the relevant part of each retrieved chunk
    is kept instead of the whole chunk. Maximum Marginal Relevance (``mmr``)
    search, which would return a more diverse set of documents, is currently
    left disabled.
    Option 02 (not implemented): SelfQueryRetrieval for filtering by source.

    Args:
        llm: Chat model handed to ``LLMChainExtractor.from_llm``.
        vector_store: Store whose ``as_retriever`` supplies the base retriever.

    Returns:
        The configured ``ContextualCompressionRetriever``.
    """
    extractor = LLMChainExtractor.from_llm(llm)
    # Base retriever is created with k=10; search_type="mmr" is not enabled.
    base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    return ContextualCompressionRetriever(
        base_compressor=extractor,
        base_retriever=base_retriever,
    )

In [9]:
# Initialize the LLM
def initialize_llm(model_name="llama-3.1-70b-versatile", temperature=0):
    """Create the Groq chat model used by the chains.

    Args:
        model_name: Groq model identifier passed as ``model``.
        temperature: Sampling temperature passed through unchanged.

    Returns:
        A ``ChatGroq`` instance.
    """
    return ChatGroq(model=model_name, temperature=temperature)

In [10]:

# Read the cleaned textbook from disk.
documents = load_documents(file_path='resources/9thComputerScience_cleaned.txt')

# Break the first loaded document into chapter/subtopic chunks.
text_splitter = CustomTextSplitter(chunk_overlap=100, chunk_size=1000)
split_docs = text_splitter.split_text(text=documents[0].page_content)

# Index the chunks in the vector store and create the chat model.
# (embed_documents(split_docs, embedding_model) is an alternative that is not used here.)
vector_store = store_embeddings(split_docs=split_docs, embedding_model=embedding_model)
llm = initialize_llm()

# Wrap the vector store in the contextual-compression retriever.
retriever = getting_retriever(llm=llm, vector_store=vector_store)


In [11]:
# History-free chain, kept only to test plain retrieval + answering.
# The system prompt ends with a {context} placeholder.
system_prompt = (
    "Act as a conversational assistant similar to ChatGPT. Engage in natural dialogue and answer questions based on the context provided through the chat history or retrieved using Retrieval-Augmented Generation (RAG). If the relevant context is not found either in the conversation or via RAG, respond by stating that the information is unavailable or ask for more clarification from the user. Do not provide speculative or out-of-context information. Always ensure responses are precise and contextually relevant."
    "\n\n{context}"
)

# Two-message prompt: system instructions followed by the user's input.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Answering chain first, then the retrieval chain that feeds it from `retriever`.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [12]:
# result = rag_chain.invoke({"input": "can you give me an interesting topic from 1st chapter"})
# result

In [13]:
# result = rag_chain.invoke({"input": "guide me in each step further"})
# result

## Implementing Chat history for context

In [14]:
# Prompt for the query-rewriting step of the history-aware retriever.
# The text generated from this prompt is used as the retrieval query, so it
# must only rephrase the latest user message into a standalone question.
# The previous wording asked the model to answer the question, which made the
# retriever search with an answer instead of a question.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: rewriting instructions, prior turns, then the new user message.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Create a new retriever that is aware of the chat history; everything else
# (LLM and underlying retriever) is the same as before.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [15]:
# System instructions for the answering step; the prompt ends with a
# {context} placeholder.
system_prompt = (
    "Act as a conversational assistant similar to ChatGPT. Engage in natural dialogue and answer questions based on the context provided through the chat history or retrieved using Retrieval-Augmented Generation (RAG). If the relevant context is not found either in the conversation or via RAG, respond by stating that the information is unavailable or ask for more clarification from the user. Do not provide speculative or out-of-context information. Always ensure responses are precise and contextually relevant."
    "\n\n{context}"
)

# Prompt layout: system instructions, prior turns, then the new user message.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Answering chain built from the QA prompt, fed by the history-aware retriever.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

### Chat History Working

In [16]:
# Conversation transcript; each query cell appends a user message and the reply.
chat_history = list()

In [17]:
query = "Divide and conquer"
result = rag_chain.invoke({"input": query, "chat_history": chat_history})

# Record this turn (user query, then assistant answer) in the transcript.
chat_history += [
    HumanMessage(content=query),
    AIMessage(content=result["answer"]),
]

# Print each section of the result, with a divider line between sections.
report = [
    ("Overall Result Object: ", result),
    ("Answer:", result["answer"]),
    ("Context:", result["context"]),
    ("Chat History:", chat_history),
]
for position, (label, value) in enumerate(report):
    if position > 0:
        print("-----------------------------------")
    print(label, value)

Overall Result Object:  {'input': 'Divide and conquer', 'chat_history': [HumanMessage(content='Divide and conquer', additional_kwargs={}, response_metadata={}), AIMessage(content="It seems like you're referring to the Divide and Conquer strategy. This is a problem-solving approach that involves breaking down a complex problem into smaller, more manageable sub-problems. Each sub-problem is then solved individually, and the solutions are combined to form the final solution to the original problem.\n\nCan you tell me more about the context in which you're interested in applying the Divide and Conquer strategy? Are you working on a specific problem or project?", additional_kwargs={}, response_metadata={})], 'context': [Document(metadata={'chapter': 5, 'subtopic': 33}, page_content='e Divide and Conquer: This strategy divides a complex problem into smaller problems.'), Document(metadata={'chapter': 1, 'subtopic': 6}, page_content='e Divide and Conquer: This strategy divides a complex proble

In [18]:
# Number of documents returned as context, followed by one document per line.
retrieved_docs = result['context']
print(len(retrieved_docs))

for doc in retrieved_docs:
    print(doc)

2
page_content='e Divide and Conquer: This strategy divides a complex problem into smaller problems.' metadata={'chapter': 5, 'subtopic': 33}
page_content='e Divide and Conquer: This strategy divides a complex problem into smaller problems.' metadata={'chapter': 1, 'subtopic': 6}


In [19]:
# Show only the answer text of the last result.
answer_text = result["answer"]
print(answer_text)

It seems like you're referring to the Divide and Conquer strategy. This is a problem-solving approach that involves breaking down a complex problem into smaller, more manageable sub-problems. Each sub-problem is then solved individually, and the solutions are combined to form the final solution to the original problem.

Can you tell me more about the context in which you're interested in applying the Divide and Conquer strategy? Are you working on a specific problem or project?


In [20]:
# # Follow up question without mentioning any particular term to test the memory
# query = "chemistry"
# result = rag_chain.invoke({"input": query,"chat_history": chat_history})
# chat_history.extend(
#     [
#         HumanMessage(content=query),
#         AIMessage(content=result["answer"]),
#     ]
# )
# print("Overall Result Object: ",result)
# print("-----------------------------------")
# print("Answer:",result["answer"])
# print("-----------------------------------")
# print("Context:",result["context"])
# print("-----------------------------------")
# print("Chat History:",chat_history)