In [2]:
from dotenv import load_dotenv
import os

# Pull variables from the local .env file into the process environment
load_dotenv()
groq_api_key = os.environ.get('GROQ_API_KEY')

# Stop immediately when the key is missing or empty
if not groq_api_key:
    raise ValueError("Groq API key not found in .env file")

In [3]:
import os
import pickle
import re
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.schema import Document

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo



# Shared HuggingFace embedding model; it is handed to the embedding and
# vector-store helpers in the main execution cell.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [4]:
# Load documents from a text file and add metadata
def load_documents(file_path):
    """Load a text file with ``TextLoader`` and tag every document with its path.

    Args:
        file_path: Path of the text file handed to ``TextLoader``.

    Returns:
        The documents produced by ``TextLoader.load()``, each with
        ``metadata["source"]`` set to ``file_path``.
    """
    loaded = TextLoader(file_path).load()

    # Record which file each document came from
    for document in loaded:
        document.metadata["source"] = file_path

    return loaded


In [5]:
# Define your subtopic and chapter separators

# Custom RecursiveCharacterTextSplitter with regex patterns for subtopics and chapters
class CustomTextSplitter(RecursiveCharacterTextSplitter):
    """Two-stage splitter: chapter separator first, then numbered subtopic headings.

    The parent splitter is configured with a single separator (the literal
    chapter-end marker). Each piece it returns is then cut at every position
    where ``subtopic_pattern`` (dotted numbers such as ``1.2`` or ``1.2.1``)
    matches.

    Note: ``split_text`` returns ``Document`` objects carrying ``chapter`` /
    ``subtopic`` metadata rather than plain strings.
    NOTE(review): the pattern is not anchored to the start of a line, so any
    dotted number inside running text also acts as a split point - confirm
    this is intended.
    """

    def __init__(self, **kwargs):
        # Remaining keyword arguments (chunk_size, chunk_overlap, ...) go
        # straight to the parent splitter.
        super().__init__(
            separators=['chapter end -------------------------------------'],
            **kwargs,
        )
        self.subtopic_pattern = re.compile(r'(\d+(\.\d+)+)')

    def split_text(self, text):
        """Split ``text`` into chapter pieces, then into subtopic Documents.

        Chapter numbers are assigned sequentially, starting at 1, in the
        order the parent splitter returns its pieces.
        """
        collected = []
        for chapter_number, chapter_text in enumerate(super().split_text(text), start=1):
            collected.extend(self._split_by_subtopic(chapter_text, chapter_number))
        return collected

    def _split_by_subtopic(self, text, chapter_number):
        """Cut one chapter at each subtopic-pattern match.

        Returns a list of Documents. When nothing matches, the list holds a
        single Document with only ``chapter`` metadata. Otherwise the segment
        ending at the i-th match (1-based) is numbered ``i`` and the trailing
        segment is numbered ``len(matches) + 1``.
        """
        boundaries = [found.start() for found in self.subtopic_pattern.finditer(text)]
        if len(boundaries) == 0:
            # No subtopic headings: keep the chapter as one Document
            whole = Document(page_content=text.strip(), metadata={"chapter": chapter_number})
            return [whole]

        pieces = []
        cursor = 0
        for number, boundary in enumerate(boundaries, start=1):
            # A match at the very start of the text leaves nothing to emit
            if boundary != cursor:
                segment = text[cursor:boundary].strip()
                pieces.append(Document(
                    page_content=segment,
                    metadata={"chapter": chapter_number, "subtopic": number},
                ))
            cursor = boundary

        # Everything from the last match to the end of the chapter
        tail = text[cursor:].strip()
        pieces.append(Document(
            page_content=tail,
            metadata={"chapter": chapter_number, "subtopic": len(boundaries) + 1},
        ))
        return pieces


In [6]:
# Create embeddings and handle storage
def embed_documents(split_docs, embedding_model,
                    embeddings_folder="embeddings", embeddings_filename="emb01.pkl"):
    """Embed the documents' text, caching the result on disk as a pickle.

    A cached file is reused only when it can be unpickled and holds exactly
    one entry per document in ``split_docs``; otherwise the embeddings are
    recomputed and the cache file is overwritten.

    Args:
        split_docs: Sequence of objects exposing ``page_content``.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``.
        embeddings_folder: Directory holding the cache (created if missing).
        embeddings_filename: Name of the pickle file inside that directory.

    Returns:
        Whatever ``embedding_model.embed_documents`` returned, either freshly
        computed or loaded from the cache.
    """
    embeddings_file = os.path.join(embeddings_folder, embeddings_filename)

    # exist_ok avoids the separate exists() check and its race window
    os.makedirs(embeddings_folder, exist_ok=True)

    if os.path.exists(embeddings_file):
        print(f"Loading existing embeddings from {embeddings_file}...")
        try:
            # SECURITY: pickle.load can execute arbitrary code; only point this
            # at cache files this notebook wrote itself.
            with open(embeddings_file, 'rb') as f:
                embedded_docs = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # Truncated or corrupt cache (e.g. an interrupted earlier write)
            embedded_docs = None

        try:
            cache_is_current = len(embedded_docs) == len(split_docs)
        except TypeError:
            cache_is_current = False

        if cache_is_current:
            return embedded_docs
        print("Cached embeddings do not match the current documents; regenerating...")

    print("Creating new embeddings...")
    texts = [doc.page_content for doc in split_docs]
    embedded_docs = embedding_model.embed_documents(texts)

    with open(embeddings_file, 'wb') as f:
        pickle.dump(embedded_docs, f)

    return embedded_docs


In [7]:
# Store embeddings in Chroma vector store
def store_embeddings(split_docs, embedding_model):
    """Build a Chroma vector store from the documents.

    Args:
        split_docs: Documents passed to ``Chroma.from_documents``.
        embedding_model: Embedding model passed alongside the documents.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    return Chroma.from_documents(split_docs, embedding_model)

In [8]:

def build_rag_pipeline(llm,vector_store):
    """Wrap the vector store in a contextual-compression retriever.

    The intent of contextual compression is to find the relevant records and
    keep only the relevant data from each chunk instead of the whole chunk.

    Args:
        llm: Language model used to build the ``LLMChainExtractor``.
        vector_store: Store whose ``as_retriever(search_type="mmr")`` serves
            as the base retriever.

    Returns:
        The configured ``ContextualCompressionRetriever``.
    """
    extractor = LLMChainExtractor.from_llm(llm)
    mmr_retriever = vector_store.as_retriever(search_type = "mmr")
    return ContextualCompressionRetriever(
        base_compressor=extractor,
        base_retriever=mmr_retriever,
    )


In [9]:

# Initialize the LLM
def initialize_llm(model="llama-3.1-70b-versatile", temperature=0.1):
    """Create the Groq chat model used for answering.

    Args:
        model: Groq model identifier. The default keeps the notebook's
            original choice.
            NOTE(review): hosted model names get retired over time - confirm
            this one is still served before relying on the default.
        temperature: Sampling temperature passed to ``ChatGroq``.

    Returns:
        A ``ChatGroq`` instance.

    No API key is passed here; presumably ``ChatGroq`` picks the key up from
    the environment populated by ``load_dotenv()`` - TODO confirm.
    """
    return ChatGroq(
        model=model,
        temperature=temperature,
    )

In [10]:
# Querying the retriever and LLM
def query_llm(llm, retriever, query):
    """Retrieve context for ``query`` and ask the LLM to answer with it.

    Args:
        llm: Object exposing ``invoke(prompt)``.
        retriever: Object exposing ``invoke(query)``; retrievers that only
            offer the older ``get_relevant_documents(query)`` are still
            supported.
        query: The student's question.

    Returns:
        ``(response, relevant_texts)`` where ``response`` is whatever
        ``llm.invoke`` returned and ``relevant_texts`` is the list of
        ``page_content`` strings fed into the prompt. When nothing is
        retrieved the LLM is not called and the result is
        ``("No relevant documents found.", [])``. Note the first element is
        a plain string in that case, so callers must not assume ``.content``.
    """
    # Prefer invoke(); get_relevant_documents() triggers a deprecation warning
    if hasattr(retriever, "invoke"):
        results = retriever.invoke(query)
    else:
        results = retriever.get_relevant_documents(query)

    if not results:
        return "No relevant documents found.", []

    # Capture the actual text chunks used so the caller can show them
    relevant_texts = [doc.page_content for doc in results]
    context = "\n".join(relevant_texts)
    prompt = f"""Use relevant information from 9th to 12th-grade textbooks to answer the student's query. If the context is helpful, incorporate it; otherwise, provide a general explanation. Avoid mentioning irrelevance and instead say, "I cannot find relevant data from your book but I will explain the general concept." Encourage the student to ask follow-up questions related to the topics in the books.
        
        Context:
        {context}
        Student Query:
        {query}
        """

    response = llm.invoke(prompt)

    # Return both the response and relevant texts
    return response, relevant_texts


In [11]:
# Main execution flow: read the textbook, split it, embed it, and build the retriever
if __name__ == "__main__":
    # Read the cleaned textbook as one string. The encoding is explicit because
    # the text contains curly quotes and em dashes, which the platform default
    # encoding may not decode correctly.
    # NOTE(review): assumes the file is stored as UTF-8 - confirm.
    with open('resources/9thComputerScience_cleaned.txt', 'r', encoding='utf-8') as file:
        documents = file.read()

    # Split by chapter, then by numbered subtopic
    text_splitter = CustomTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_text(documents)

    # Preview one chunk as a sanity check; guarded so a short input cannot raise IndexError
    if len(split_docs) > 10:
        print(split_docs[10])

    # Cached embeddings (stored on disk) and the Chroma store used for retrieval
    embedded_docs = embed_documents(split_docs, embedding_model)
    vector_store = store_embeddings(split_docs, embedding_model)

    llm = initialize_llm()
    retriever = build_rag_pipeline(llm,vector_store)
    


page_content='1.2.1 Definition
A flowchart is a graphical presentation of the steps to solve
Take shoes and socks
Wear socks Wear shoes
a problem. We use symbols for each step, and these symbols are connected with the help of arrows to show the flow of processing.
Figure 1-6 shows a flowchart for the simple problem of wearing shoes with socks. It shows that not only the steps - | are important but also the order to complete a process. A Figure 1-6
Sample flowchart
Unit 1 — Problem Solving
flowchart is used to visually communicate the steps in a process.' metadata={'chapter': 1, 'subtopic': 11}
Creating new embeddings...


In [12]:
# Example query
query = "explain boolean proposition"
response, relevant_texts = query_llm(llm, retriever, query)

# Output the response and the relevant text chunks.
# query_llm returns a plain string when nothing was retrieved, so fall back to
# the object itself when it has no .content attribute.
print(getattr(response, "content", response))
print("==============================================")
print("\nRelevant text chunks used in the response:")
for text in relevant_texts:
    print("Chunk: ==============================")
    print(text[:300])

  results = retriever.get_relevant_documents(query)


A Boolean proposition is a statement that can be either true or false. It's a sentence that has a clear and specific meaning, and its truth value can be determined. In other words, a Boolean proposition is a statement that can be classified as either true (T) or false (F).

To be considered a Boolean proposition, a statement must meet two conditions:

1. It must be a declarative sentence, meaning it must state something that can be verified as true or false.
2. It must have a clear and specific meaning, leaving no room for ambiguity or uncertainty.

Examples of Boolean propositions include:

* "Someone from our school can join the Pakistani Cricket Team." (This statement is either true or false.)
* "I will get an A+ grade in the board exam." (This statement is either true or false.)
* "This year's Pakistan Super League (PSL) final match will be played in Lahore." (This statement is either true or false.)

On the other hand, sentences that are not Boolean propositions include:

* Questi

In [13]:
# Dump every retrieved chunk in full, each preceded by a divider line
for text in relevant_texts:
    print("=====================================", text, sep="\n")

2.5.1 Boolean Proposition
A proposition is a sentence that can either be true or false. For example, the following sentences are propositions.
1. “Someone from our school can join Pakistani Cricket Team”
2. “I will get A+ grade in board exam"
3. “I want to excel in mathematics”
4. “This year Pakistan Super League (PSL) final match will be played in Lahore”
5. “I play chess”.
But the following sentences are not propositions 1. How are you? 2. Close the door. 3. Is it hot outside?
True and False are called Boolean values. The idea was given by George Boole (2 November 1815 —8 December 1864) in his book “The Laws of Thought”.


In [14]:
# # Example query 
# query = "who gave the idea of boolean values and on what date"
# response, relevant_texts = query_llm(llm, retriever, query)

# # Output response and relevant text chunks
# print(response.content)
# print("==============================================")
# print("\nRelevant text chunks used in the response:")
# for text in relevant_texts:
#     print("Chunk: ==============================")
#     print(text[:300])

In [15]:
# # Example query 
# query = "how ip4 and ip6 works"
# response, relevant_texts = query_llm(llm, retriever, query)

# # Output response and relevant text chunks
# print(response.content)
# print("==============================================")
# print("\nRelevant text chunks used in the response:")
# for text in relevant_texts:
#     print("Chunk: ==============================")
#     print(text[:300])