In [1]:
from dotenv import load_dotenv
import os

# Load .env file
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')

if not groq_api_key:
    raise ValueError("Groq API key not found in .env file")

In [2]:
import os
import pickle
import re
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.schema import Document

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo



# Initialize the HuggingFace Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load documents from a text file and add metadata
def load_documents(file_path):
    loader = TextLoader(file_path)
    documents = loader.load()
    
    # Add metadata to each document (e.g., file name)
    for doc in documents:
        doc.metadata["source"] = file_path
    return documents


In [4]:
# Define your subtopic and chapter separators

# Custom RecursiveCharacterTextSplitter with regex patterns for subtopics and chapters
class CustomTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, **kwargs):
        subtopic_pattern = re.compile(r'(\d+(\.\d+)+)')
        chapter_separator = 'chapter end -------------------------------------'

        # Initialize with any other parameters, and add your separators
        super().__init__(separators=[chapter_separator], **kwargs)
        self.subtopic_pattern = subtopic_pattern

    def split_text(self, text):
        # First, split by chapters
        texts = super().split_text(text)
        documents = []
        
        # For each chapter, split by subtopic using the subtopic regex
        chapter_number = 1
        for chapter in texts:
            subtopic_splits = self._split_by_subtopic(chapter, chapter_number)
            documents.extend(subtopic_splits)
            chapter_number += 1
        
        return documents

    def _split_by_subtopic(self, text, chapter_number):
        # Use the subtopic regex to split text
        matches = list(self.subtopic_pattern.finditer(text))
        if not matches:
            # No subtopics found, return the full text as a single Document
            return [Document(page_content=text.strip(), metadata={"chapter": chapter_number})]
        
        subtopics = []
        start_idx = 0
        subtopic_number = 1
        
        for match in matches:
            end_idx = match.start()
            if start_idx != end_idx:
                subtopics.append(Document(
                    page_content=text[start_idx:end_idx].strip(),
                    metadata={"chapter": chapter_number, "subtopic": subtopic_number}
                ))
            start_idx = end_idx
            subtopic_number += 1
            
        # Append the remaining part as a subtopic
        subtopics.append(Document(
            page_content=text[start_idx:].strip(),
            metadata={"chapter": chapter_number, "subtopic": subtopic_number}
        ))
        
        return subtopics


In [5]:
# Create embeddings and handle storage
def embed_documents(split_docs, embedding_model):
    EMBEDDINGS_FOLDER = "embeddings"
    EMBEDDINGS_FILE = os.path.join(EMBEDDINGS_FOLDER, "emb01.pkl")

    if not os.path.exists(EMBEDDINGS_FOLDER):
        os.makedirs(EMBEDDINGS_FOLDER)

    if os.path.exists(EMBEDDINGS_FILE):
        print(f"Loading existing embeddings from {EMBEDDINGS_FILE}...")
        with open(EMBEDDINGS_FILE, 'rb') as f:
            embedded_docs = pickle.load(f)
    else:
        print("Creating new embeddings...")
        texts = [doc.page_content for doc in split_docs]
        embedded_docs = embedding_model.embed_documents(texts)

        with open(EMBEDDINGS_FILE, 'wb') as f:
            pickle.dump(embedded_docs, f)

    return embedded_docs


In [6]:
# Store embeddings in Chroma vector store
def store_embeddings(split_docs, embedding_model):
    vector_store = Chroma.from_documents(split_docs, embedding_model) 
    return vector_store

In [7]:

def getting_retriever(llm,vector_store):
    # Opiton 01:
    # Creating ContextualCompressionRetriever
    compressor = LLMChainExtractor.from_llm(llm)
    compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vector_store.as_retriever(search_type = "mmr")
    )
    return compression_retriever
    # Contextual Compression will find the relevant records and only contains the relevant data from chunks instead of whole chunks
    # Maximum Marginal Relevance (mmr) is used to get diverse set of documents

    # # Option 02:
    # document_content_description = "Content from text book"
    # metadata_field_info = [
    #     AttributeInfo(
    #         name="source",
    #         description="The chapter number from which the topic is taken from",
    #         type="string",
    #     )
    # ]
    # retriever = SelfQueryRetriever.from_llm(
    #     llm,
    #     vector_store,
    #     document_content_description,
    #     metadata_field_info,
    #     # verbose=True
    # )
    # return retriever



In [8]:

# Initialize the LLM
def initialize_llm():
    llm = ChatGroq(
        model="llama-3.1-70b-versatile",
        temperature=0.1,
    )
    return llm

In [9]:
# Querying the retriever and LLM
def query_llm(llm, retriever, query, qa):
    # Retrieve relevant documents
    results = retriever.get_relevant_documents(query)

    # Capture the actual text chunks used
    relevant_texts = [doc.page_content for doc in results]

    # Use the LLM to process the retrieved documents
    if results:
        # Combine results for the LLM prompt, and track their sources
        context = "\n".join(relevant_texts)
        prompt = f"""Use relevant information from 9th to 12th-grade textbooks to answer the student's query. If the context is helpful, incorporate it; otherwise, provide a general explanation. Avoid mentioning irrelevance and instead say, "I cannot find relevant data from your book but I will explain the general concept." Encourage the student to ask follow-up questions related to the topics in the books.
        
        Context:
        {context}
        Student Query:
        {query}
        """
        
        response = qa({"question": prompt})
        
        # Return both the response and relevant texts
        return response, relevant_texts
    else:
        return "No relevant documents found.", []


In [10]:
# Main execution flow
if __name__ == "__main__":
    # Load your document
    # documents = load_documents('resources/9thComputerScience_cleaned.txt')
    with open('resources/9thComputerScience_cleaned.txt', 'r') as file:
        documents = file.read()
    # Split the text into smaller chunks
    text_splitter = CustomTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_text(documents)
    # print(split_docs[10])
    embedded_docs = embed_documents(split_docs, embedding_model)
    vector_store = store_embeddings(split_docs, embedding_model)
    llm = initialize_llm()
    retriever = getting_retriever(llm,vector_store)
    


Loading existing embeddings from embeddings\emb01.pkl...


In [18]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
system_prompt = (
    "Use the following pieces of retrieved context to answer "
    "the question. If foudn relevant to user query then do use it otherwise give it  "
    "from your own knowledge."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [19]:
result = rag_chain.invoke({"input": "can you give me an interesting topic from 1st chapter"})

In [20]:
result

{'input': 'can you give me an interesting topic from 1st chapter',
 'context': [Document(metadata={'chapter': 1, 'subtopic': 6}, page_content='1.1.3 Planning a Solution\nAfter analyzing a problem, we formulate a plan that may lead us towards the solution of a problem. This phase includes finding the right strategy for problem solving. Some of the strategies are:\ne Divide and Conquer: This strategy divides a complex problem into\nsmaller problems. Figure 1-3 Planning for success\ne Guess, Check and Improve: The designer guesses a solution to a problem and then checks the correctness of the solution. If the solution is not according to expectations, then he/she refines the solution. The refinement is an iterative process.\ne Act it Out: In this strategy the designer defines the list of “to-do” tasks. Afterwards he/she performs the task.\ne Prototype (Draw): This technique draws a pictorial representation of the solution. It is not the final solution. However, it may help a designer to u

In [None]:
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Also ask the follow up questions for related topics or another topic from the same chapter. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

In [16]:
# Example query 
query = "can you tell me an interesting topic from chapter 1"
memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True
        )
qa = ConversationalRetrievalChain.from_llm(
            llm,
            retriever=retriever,
            memory=memory,
            return_source_documents=True,
            # metadata={"output_key": "answer"},
            # output_messages_key = 'answer'
            # chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
        )
# response, relevant_texts = query_llm(llm, retriever, query, qa)
result = qa({"question": query})

# Output response and relevant text chunks
print(result)
# print("==============================================")
# print("\nRelevant text chunks used in the response:")
# for text in relevant_texts:
#     print("Chunk: ==============================")
#     print(text[:300])

ValidationError: 1 validation error for ConversationalRetrievalChain
return_only_outputs
  Extra inputs are not permitted [type=extra_forbidden, input_value=True, input_type=bool]
    For further information visit https://errors.pydantic.dev/2.9/v/extra_forbidden

In [22]:
for key, val in result.items():
    print(key, val)

question can you tell me an interesting topic from chapter 1
chat_history [HumanMessage(content='can you tell me an interesting topic from chapter 1', additional_kwargs={}, response_metadata={}), AIMessage(content='One interesting topic from chapter 1 is the different strategies for problem-solving, specifically the "Divide and Conquer" strategy, which involves breaking down a complex problem into smaller, more manageable problems.', additional_kwargs={}, response_metadata={})]
answer One interesting topic from chapter 1 is the different strategies for problem-solving, specifically the "Divide and Conquer" strategy, which involves breaking down a complex problem into smaller, more manageable problems.


In [23]:
result = qa({"question": "Yes tell me how to apply them"})
for key, val in result.items():
    print(key,':', val)

question : Yes tell me how to apply them
chat_history : [HumanMessage(content='can you tell me an interesting topic from chapter 1', additional_kwargs={}, response_metadata={}), AIMessage(content='One interesting topic from chapter 1 is the different strategies for problem-solving, specifically the "Divide and Conquer" strategy, which involves breaking down a complex problem into smaller, more manageable problems.', additional_kwargs={}, response_metadata={}), HumanMessage(content='Yes tell me how to apply them', additional_kwargs={}, response_metadata={}), AIMessage(content='The "Divide and Conquer" strategy is a problem-solving approach that involves breaking down a complex problem into smaller, more manageable sub-problems. Here\'s a step-by-step guide on how to apply this strategy:\n\n1. **Identify the problem**: Clearly define the problem you want to solve. Understand the problem\'s scope, constraints, and goals.\n2. **Break down the problem**: Divide the problem into smaller sub-

In [12]:
response.keys()

dict_keys(['question', 'chat_history', 'answer'])

In [13]:
for text in relevant_texts:
    print("=====================================")
    print(text)

1.1.3 Planning a Solution
After analyzing a problem, we formulate a plan that may lead us towards the solution of a problem. This phase includes finding the right strategy for problem solving. Some of the strategies are:
e Divide and Conquer: This strategy divides a complex problem into
smaller problems. Figure 1-3 Planning for success
e Guess, Check and Improve: The designer guesses a solution to a problem and then checks the correctness of the solution. If the solution is not according to expectations, then he/she refines the solution. The refinement is an iterative process.
e Act it Out: In this strategy the designer defines the list of “to-do” tasks. Afterwards he/she performs the task.
e Prototype (Draw): This technique draws a pictorial representation of the solution. It is not the final solution. However, it may help a designer to understand the important components of the solution.


In [12]:
query = "yes tell me how to apply them"
response, relevant_texts = query_llm(llm, retriever, query, qa)

# Output response and relevant text chunks
print(response['answer'])
print("==============================================")
print("\nRelevant text chunks used in the response:")
for text in relevant_texts:
    print("Chunk: ==============================")
    print(text[:300])

TypeError: string indices must be integers, not 'str'

In [61]:
response

'No relevant documents found.'

In [55]:
# # Example query 
# query = "who gave the idea of boolean values and on what date"
# response, relevant_texts = query_llm(llm, retriever, query)

# # Output response and relevant text chunks
# print(response.content)
# print("==============================================")
# print("\nRelevant text chunks used in the response:")
# for text in relevant_texts:
#     print("Chunk: ==============================")
#     print(text[:300])

In [56]:
# # Example query 
# query = "how ip4 and ip6 works"
# response, relevant_texts = query_llm(llm, retriever, query)

# # Output response and relevant text chunks
# print(response.content)
# print("==============================================")
# print("\nRelevant text chunks used in the response:")
# for text in relevant_texts:
#     print("Chunk: ==============================")
#     print(text[:300])