In [1]:
from dotenv import load_dotenv
import os

# Read environment variables from the local .env file, then fail fast when
# the Groq credential is missing or empty.
load_dotenv()

groq_api_key = os.environ.get('GROQ_API_KEY')
if groq_api_key in (None, ""):
    raise ValueError("Groq API key not found in .env file")

In [2]:
import os
import pickle
import re
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.schema import Document


# Embedding model instance that later cells pass to embed_documents() and
# store_embeddings().
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load documents from a text file and add metadata
def load_documents(file_path, encoding=None):
    """Load a text file through ``TextLoader`` and tag each document with its source.

    Args:
        file_path: Path of the text file to load.
        encoding: Optional text encoding forwarded to ``TextLoader``. The
            default ``None`` leaves the loader's own default in effect, which
            is what this function did before the parameter existed. Pass an
            explicit value (e.g. ``"utf-8"``) to avoid depending on the
            platform default.

    Returns:
        The list produced by ``TextLoader.load()``, with
        ``metadata["source"]`` of every item set to ``file_path``.
    """
    loader = TextLoader(file_path, encoding=encoding)
    documents = loader.load()

    # Record where each document came from so retrieved chunks can be traced.
    for doc in documents:
        doc.metadata["source"] = file_path
    return documents


In [4]:
# Splitting markers used by CustomTextSplitter:
#   subtopic_pattern  - dotted section numbers such as "1.2" or "2.5.5"
#   chapter_separator - literal line that marks the end of a chapter
subtopic_pattern = re.compile(r"(\d+(\.\d+)+)")
chapter_separator = "chapter end -------------------------------------"


# Chapter- and subtopic-aware splitter layered on RecursiveCharacterTextSplitter
class CustomTextSplitter(RecursiveCharacterTextSplitter):
    """Split text on the chapter separator first, then on subtopic-number matches.

    NOTE(review): ``split_text`` here returns ``Document`` objects rather than
    plain strings. Confirm that no base-class helper expecting string chunks
    is used together with this subclass.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the base splitter, using the chapter separator as its only separator."""
        super().__init__(separators=[chapter_separator], **kwargs)
        self.subtopic_pattern = subtopic_pattern

    def split_text(self, text):
        """Return the subtopic ``Document`` chunks of ``text``, chapter by chapter.

        Chunks produced by the base splitter are numbered 1, 2, ... in order
        and that number is stored as each resulting document's ``chapter``.
        """
        chapter_chunks = super().split_text(text)
        return [
            piece
            for number, chapter in enumerate(chapter_chunks, start=1)
            for piece in self._split_by_subtopic(chapter, number)
        ]

    def _split_by_subtopic(self, text, chapter_number):
        """Cut one chapter chunk at every position where ``subtopic_pattern`` matches.

        Without any match, the stripped text comes back as a single document
        carrying only ``chapter`` metadata. Otherwise segment 1 is whatever
        precedes the first match and segment ``k + 1`` starts at the k-th
        match; each segment's number is stored under ``subtopic``.
        """
        heading_starts = [m.start() for m in self.subtopic_pattern.finditer(text)]
        if not heading_starts:
            return [Document(page_content=text.strip(), metadata={"chapter": chapter_number})]

        segment_starts = [0] + heading_starts
        segment_ends = heading_starts + [len(text)]

        pieces = []
        for number, (begin, end) in enumerate(zip(segment_starts, segment_ends), start=1):
            if begin == end:
                # The first match sits at offset 0, so there is nothing before
                # it to emit; its segment number is still consumed.
                continue
            pieces.append(Document(
                page_content=text[begin:end].strip(),
                metadata={"chapter": chapter_number, "subtopic": number},
            ))
        return pieces


In [5]:
# Create embeddings and handle storage
def embed_documents(split_docs, embedding_model):
    """Embed every chunk's text, reusing the on-disk pickle cache when it still fits.

    The cache lives at ``embeddings/emb01.pkl`` relative to the working
    directory. It is reused only when it holds exactly one entry per item of
    ``split_docs``; otherwise it is treated as stale, recomputed and
    overwritten. Note that an edit which keeps the chunk count unchanged is
    not detected by this check.

    Args:
        split_docs: Sequence of objects exposing a ``page_content`` string.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``.

    Returns:
        The embeddings, one entry per chunk, as returned by
        ``embedding_model.embed_documents`` (or as loaded from the cache).
    """
    EMBEDDINGS_FOLDER = "embeddings"
    EMBEDDINGS_FILE = os.path.join(EMBEDDINGS_FOLDER, "emb01.pkl")

    # exist_ok avoids a separate existence check before creating the folder.
    os.makedirs(EMBEDDINGS_FOLDER, exist_ok=True)

    if os.path.exists(EMBEDDINGS_FILE):
        print(f"Loading existing embeddings from {EMBEDDINGS_FILE}...")
        # SECURITY: unpickling can execute arbitrary code. Only keep cache
        # files in this folder that were written by this function.
        with open(EMBEDDINGS_FILE, 'rb') as f:
            embedded_docs = pickle.load(f)

        if len(embedded_docs) == len(split_docs):
            return embedded_docs
        print("Cached embeddings do not match the current chunks; recomputing...")

    print("Creating new embeddings...")
    texts = [doc.page_content for doc in split_docs]
    embedded_docs = embedding_model.embed_documents(texts)

    with open(EMBEDDINGS_FILE, 'wb') as f:
        pickle.dump(embedded_docs, f)

    return embedded_docs


In [6]:
# Store embeddings in Chroma vector store
def store_embeddings(split_docs, embedding_model):
    """Return the vector store created by ``Chroma.from_documents(split_docs, embedding_model)``."""
    return Chroma.from_documents(split_docs, embedding_model)

In [7]:
# Build the RAG pipeline
def build_rag_pipeline(vector_store):
    """Return the retriever obtained from ``vector_store.as_retriever()``."""
    return vector_store.as_retriever()


In [8]:

# Initialize the LLM
def initialize_llm(model="llama-3.1-70b-versatile", temperature=0.1):
    """Create the Groq chat model used to answer queries.

    Args:
        model: Groq model identifier. Defaults to the id this notebook has
            always used.
            NOTE(review): hosted model ids get retired over time; confirm
            this one is still served before relying on the default.
        temperature: Sampling temperature passed to ``ChatGroq``.

    Returns:
        The configured ``ChatGroq`` instance.
    """
    return ChatGroq(
        model=model,
        temperature=temperature,
    )

In [9]:
# Querying the retriever and LLM
class _FallbackResponse(str):
    """Plain-string reply that also exposes ``.content`` like a chat message.

    Lets callers treat the "nothing retrieved" reply the same way as a model
    response (``response.content``) while it still compares equal to the
    original string.
    """

    @property
    def content(self):
        return str(self)


def query_llm(llm, retriever, query):
    """Retrieve chunks for ``query`` and ask the LLM to answer using them.

    Args:
        llm: Object exposing ``invoke(prompt)``.
        retriever: Object exposing ``invoke(query)``; retrievers that only
            offer the older ``get_relevant_documents(query)`` are still
            accepted.
        query: The student's question.

    Returns:
        ``(response, relevant_texts)``. ``response`` is whatever
        ``llm.invoke`` returned, or, when nothing was retrieved, the string
        ``"No relevant documents found."`` (which also offers ``.content``).
        ``relevant_texts`` is the list of retrieved chunk texts, empty when
        nothing was retrieved.
    """
    # Prefer the current retriever entry point; fall back to the legacy one.
    fetch = getattr(retriever, "invoke", None)
    if fetch is None:
        fetch = retriever.get_relevant_documents
    results = fetch(query)

    if not results:
        return _FallbackResponse("No relevant documents found."), []

    # The actual text chunks handed to the model, also returned to the caller.
    relevant_texts = [doc.page_content for doc in results]
    context = "\n".join(relevant_texts)
    prompt = f"""Use relevant information from 9th to 12th-grade textbooks to answer the student's query. If the context is helpful, incorporate it; otherwise, provide a general explanation. Avoid mentioning irrelevance and instead say, "I cannot find relevant data from your book but I will explain the general concept." Encourage the student to ask follow-up questions related to the topics in the books.
        
        Context:
        {context}
        Student Query:
        {query}
        """

    response = llm.invoke(prompt)
    return response, relevant_texts


In [10]:
# Main execution flow
if __name__ == "__main__":
    # Load your document
    # documents = load_documents('resources/9thComputerScience_cleaned.txt')
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm the file's encoding and pass it explicitly if it differs.
    with open('resources/9thComputerScience_cleaned.txt', 'r') as file:
        documents = file.read()

    # Split the text into chapter/subtopic chunks
    text_splitter = CustomTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_text(documents)
    if not split_docs:
        raise ValueError("The splitter produced no chunks from the input text")

    # Sanity preview: show chunk 10, or the last chunk when there are fewer,
    # instead of failing with IndexError on short inputs.
    print(split_docs[min(10, len(split_docs) - 1)])

    # NOTE(review): embedded_docs is not used again in this cell; the vector
    # store below is built from split_docs and embedding_model directly.
    embedded_docs = embed_documents(split_docs, embedding_model)
    vector_store = store_embeddings(split_docs, embedding_model)
    retriever = build_rag_pipeline(vector_store)

    # Initialize the LLM
    llm = initialize_llm()


page_content='1.2.1 Definition
A flowchart is a graphical presentation of the steps to solve
Take shoes and socks
Wear socks Wear shoes
a problem. We use symbols for each step, and these symbols are connected with the help of arrows to show the flow of processing.
Figure 1-6 shows a flowchart for the simple problem of wearing shoes with socks. It shows that not only the steps - | are important but also the order to complete a process. A Figure 1-6
Sample flowchart
Unit 1 — Problem Solving
flowchart is used to visually communicate the steps in a process.' metadata={'chapter': 1, 'subtopic': 11}
Creating new embeddings...


In [11]:
# Example query
query = "explain boolean proposition"
response, relevant_texts = query_llm(llm, retriever, query)

# Output response and relevant text chunks.
# query_llm can hand back a plain string instead of a message object when
# nothing was retrieved, so read .content only when it exists.
print(getattr(response, "content", response))
print("==============================================")
print("\nRelevant text chunks used in the response:")
for text in relevant_texts:
    print("Chunk: ==============================")
    # Only the first 300 characters of each chunk are shown here.
    print(text[:300])

  results = retriever.get_relevant_documents(query)


A Boolean proposition is a statement that can either be true or false. It's a sentence that has a clear meaning and can be classified as either true (T) or false (F). 

In the context of your textbook, a proposition is defined as a sentence that can either be true or false. For example:

1. "Someone from our school can join Pakistani Cricket Team" is a proposition because it can be either true or false.
2. "I will get A+ grade in board exam" is also a proposition because it can be either true or false.

On the other hand, sentences like "How are you?" or "Close the door" are not propositions because they are questions or commands, and they don't have a clear true or false value.

In Boolean algebra, propositions are often represented by letters, such as P or Q. For example:

P = "I play chess"
Q = "I want to excel in mathematics"

When we say P, it means that we are referring to the proposition "I play chess", and when we say Q, it means that we are referring to the proposition "I want

In [12]:
# Print every retrieved chunk in full, each preceded by a rule line.
for text in relevant_texts:
    print("=====================================", text, sep="\n")

2.5 Boolean Algebra
2.7
Draw the truth table to verify A+ (B-C) = (A+ B): (A+ C0)
e Identity Law If a variable is OR’ed with a False, the result is always equal to that variable. And if a variable is AND‘ed with a True, the result is always equal to that variable.
a) A OR False = A, A variable OR’ed with False is always equal to that variable
b) A AND True =A, A variable AND’ed with True is always equal to that variable
2.5.5 Laws of Boolean Algebra The laws of Boolean Algebra help us to simplify complex Boolean expressions. Some laws are discussed in the following.
e Commutative Law Commutative Law states that the order of application of two separate propositions is not important. So,
a) A.B=B8B.A (The order in which two variables are AND’ed makes no difference.)
b) A+B=B+A _ (The order in which two variables are OR’ed makes no difference.)
We can use truth tables (Table 2-13a, Table 2-13b) to verify this law for AND and OR operations respectively.
Table 2-13a Table 2-13b
We can obser

In [13]:
# # Example query 
# query = "who gave the idea of boolean values and on what date"
# response, relevant_texts = query_llm(llm, retriever, query)

# # Output response and relevant text chunks
# print(response.content)
# print("==============================================")
# print("\nRelevant text chunks used in the response:")
# for text in relevant_texts:
#     print("Chunk: ==============================")
#     print(text[:300])

In [14]:
# # Example query 
# query = "how ip4 and ip6 works"
# response, relevant_texts = query_llm(llm, retriever, query)

# # Output response and relevant text chunks
# print(response.content)
# print("==============================================")
# print("\nRelevant text chunks used in the response:")
# for text in relevant_texts:
#     print("Chunk: ==============================")
#     print(text[:300])