In [141]:
from dotenv import load_dotenv
import os
from pathlib import Path
from typing import List, Dict
import pickle
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.schema import Document
# Read variables from a local .env file, then look up the Groq credential.
load_dotenv()
groq_api_key = os.environ.get('GROQ_API_KEY')

# Stop immediately when the key is missing or empty.
if groq_api_key is None or groq_api_key == "":
    raise ValueError("Groq API key not found in .env file")

In [142]:
# Embedding model shared by the cells below (HuggingFace sentence-transformers checkpoint).
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)



In [143]:
# def get_all_text_files(base_folder: str) -> List[str]:
#     text_files = []
#     for chapter_folder in os.listdir(base_folder):
#         chapter_path = os.path.join(base_folder, chapter_folder)
#         if os.path.isdir(chapter_path):
#             for file_name in os.listdir(chapter_path):
#                 if file_name.endswith(".txt"):
#                     text_files.append(os.path.join(chapter_path, file_name))
#     return sorted(text_files)

In [144]:
# def read_text_from_file(file_path: str) -> str:
#     with open(file_path, 'r', encoding='utf-8') as file:
#         return file.read()

In [145]:
# def create_chunks_by_files(base_folder: str) -> Dict[str, List[str]]:
#     text_files = get_all_text_files(base_folder)
#     chapters = {f"Chapter{num}": [] for num in range(1, 6)}
#     for file_path in text_files:
#         chapter_name = Path(file_path).parent.name
#         file_content = read_text_from_file(file_path)
#         chapters[chapter_name].append(file_content)
    
#     return chapters

In [146]:
# def split_documents(base_folder: str):
#     chunks = []
#     chapter_chunks = create_chunks_by_files(base_folder)
#     for chapter, chunks in chapter_chunks.items():
#         print(f"Chunks for {chapter} (Total: {len(chunks)}):\n")
#         for i, chunk in enumerate(chunks):
#             chunks.append(f"Chunk {i+1}:\n{chunk}\n{'-'*50}\n")
#     return chunks

In [147]:
import os
from pathlib import Path
from typing import List, Dict

def get_all_text_files(base_folder: str) -> List[str]:
    """
    Collects all text files inside the sub-folders of the specified base folder.

    Only the immediate sub-directories of `base_folder` are scanned (one level
    deep); files placed directly in `base_folder` are ignored.

    Parameters:
    - base_folder: The path to the main folder containing chapter subfolders.

    Returns:
    - Sorted list of paths to the regular files whose name ends with ".txt".

    Raises:
    - OSError (e.g. FileNotFoundError) from os.listdir if `base_folder` cannot be listed.
    """
    text_files = []
    for chapter_folder in os.listdir(base_folder):
        chapter_path = os.path.join(base_folder, chapter_folder)
        # Ensure only directories are considered as chapter folders
        if not os.path.isdir(chapter_path):
            continue
        for file_name in os.listdir(chapter_path):
            file_path = os.path.join(chapter_path, file_name)
            # A directory can also be named "*.txt"; keep regular files only so
            # that every returned path can actually be opened and read.
            if file_name.endswith(".txt") and os.path.isfile(file_path):
                text_files.append(file_path)
    return sorted(text_files)

def read_text_from_file(file_path: str) -> str:
    """
    Returns the full contents of one text file, decoded as UTF-8.

    Parameters:
    - file_path: Location of the text file to read.

    Returns:
    - The file's text as a single string.
    """
    return Path(file_path).read_text(encoding='utf-8')

def create_chunks_by_files(base_folder: str) -> List[Dict[str, str]]:
    """
    Turns every text file under the chapter subfolders into one chunk record.

    Each file is read in full and is not split any further.

    Parameters:
    - base_folder: Path to the folder containing chapter subfolders.

    Returns:
    - A list with one dictionary per file: 'page_content' holds the file's
      text and 'metadata' holds {'source': <path of the file>}.
    """
    return [
        {
            'page_content': read_text_from_file(path),  # whole file as one chunk
            'metadata': {'source': path},               # remember where it came from
        }
        for path in get_all_text_files(base_folder)
    ]

def split_documents(base_folder: str) -> "List[Document]":
    """
    Main function to read text files and wrap each one in a `Document`.

    Parameters:
    - base_folder: Path to the folder containing chapter subfolders.

    Returns:
    - A list of `Document` objects, one per text file, built from the
      'page_content' and 'metadata' of each chunk produced by
      `create_chunks_by_files`.

    Raises:
    - ValueError: if `base_folder` does not exist or is not a directory.
    """
    if not os.path.exists(base_folder):
        raise ValueError(f"Error: The folder '{base_folder}' does not exist.")
    # An existing regular file would otherwise only fail later inside os.listdir
    # with a less helpful error.
    if not os.path.isdir(base_folder):
        raise ValueError(f"Error: '{base_folder}' is not a folder.")

    return [
        Document(page_content=chunk['page_content'], metadata=chunk['metadata'])
        for chunk in create_chunks_by_files(base_folder)
    ]


In [148]:
def embed_documents(split_docs, embedding_model):
    """
    Returns embeddings for `split_docs`, cached on disk as a pickle file.

    If "embeddings/emb02.pkl" already exists, its contents are returned as-is
    and neither `split_docs` nor `embedding_model` is consulted. Otherwise the
    `page_content` of every document is embedded and the result is written to
    that file.

    Parameters:
    - split_docs: Iterable of objects exposing a `page_content` attribute.
    - embedding_model: Object exposing `embed_documents(texts)`.

    Returns:
    - The value returned by `embedding_model.embed_documents` (or the value
      previously pickled into the cache file).
    """
    EMBEDDINGS_FOLDER = "embeddings"
    EMBEDDINGS_FILE = os.path.join(EMBEDDINGS_FOLDER, "emb02.pkl")

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(EMBEDDINGS_FOLDER, exist_ok=True)

    if os.path.exists(EMBEDDINGS_FILE):
        print(f"Loading existing embeddings from {EMBEDDINGS_FILE}...")
        # SECURITY: pickle.load can execute arbitrary code; only load cache
        # files that this notebook itself produced.
        # NOTE(review): the cache is not keyed to the input documents or the
        # model, so delete the file after changing either one.
        with open(EMBEDDINGS_FILE, 'rb') as f:
            embedded_docs = pickle.load(f)
    else:
        print("Creating new embeddings...")
        texts = [doc.page_content for doc in split_docs]

        embedded_docs = embedding_model.embed_documents(texts)

        with open(EMBEDDINGS_FILE, 'wb') as f:
            pickle.dump(embedded_docs, f)

    return embedded_docs


In [149]:
def store_embeddings(split_docs, embedding_model):
    """
    Builds a Chroma vector store from the given documents.

    Parameters:
    - split_docs: Documents handed to `Chroma.from_documents`.
    - embedding_model: Embedding object handed to `Chroma.from_documents`.

    Returns:
    - The vector store created by `Chroma.from_documents`.
    """
    return Chroma.from_documents(split_docs, embedding_model)

In [150]:
def build_rag_pipeline(vector_store, k=None):
    """
    Creates a retriever on top of the given vector store.

    Parameters:
    - vector_store: Object exposing `as_retriever(...)`.
    - k: Optional number of documents the retriever should return per query.
      When None (default) `as_retriever()` is called without arguments, so the
      vector store's own default applies.

    Returns:
    - The retriever returned by `vector_store.as_retriever`.
    """
    if k is None:
        return vector_store.as_retriever()
    # The result count is configured on the retriever through search_kwargs.
    return vector_store.as_retriever(search_kwargs={"k": k})

In [151]:
def initialize_llm(model="llama-3.1-70b-versatile", temperature=0):
    """
    Creates the Groq chat model used to answer queries.

    Parameters:
    - model: Groq model identifier passed to `ChatGroq`.
    - temperature: Sampling temperature passed to `ChatGroq`.

    Returns:
    - A `ChatGroq` instance configured with the given model and temperature.
    """
    # NOTE(review): hosted model IDs get retired over time; confirm the default
    # is still served by Groq, or pass a current ID via `model`.
    return ChatGroq(
        model=model,
        temperature=temperature,
    )

In [158]:
def query_llm(llm, retriever, query):
    """
    Answers a student query with the LLM, using retrieved chunks as context.

    Parameters:
    - llm: Object exposing `invoke(prompt)`.
    - retriever: Object exposing `get_relevant_documents(query, ...)`.
    - query: The student's question.

    Returns:
    - A tuple `(response, relevant_texts)`. When documents were retrieved,
      `response` is whatever `llm.invoke` returns and `relevant_texts` is the
      list of their `page_content`. When nothing was retrieved, the tuple is
      `("No relevant documents found.", [])` and the LLM is not called.
    """
    # NOTE(review): `num_chunks` is passed as an extra keyword; whether the
    # retriever honours it depends on its implementation - verify.
    retrieved_docs = retriever.get_relevant_documents(query,num_chunks=10)
    if not retrieved_docs:
        # NOTE(review): this branch yields a plain string, unlike the branch below.
        return "No relevant documents found.", []

    relevant_texts = [doc.page_content for doc in retrieved_docs]
    context = "\n".join(relevant_texts)
    # The indentation inside this literal is part of the prompt text sent to the LLM.
    prompt = f"""Please consider information from 9th to 12th-grade textbooks. If the provided context is relevant to the student's query, use it to give a precise answer. Otherwise, answer based on your own knowledge, considering the material covered in books for these grades. Don't mention in response that the context is not relevant. You can simply say "I cannot find relevant data from your book but I will explain you the general concept about" and so on. At the end, encourage student to ask conversational questions related to the topics in the book based on the query.
        
        Context:
        {context}
        Student Query:
        {query}
        """
    return llm.invoke(prompt), relevant_texts

In [155]:
if __name__ == "__main__":
    # Load every chapter text file as one Document and show what was loaded.
    split_docs = split_documents('resources/9ComputerScience')
    for item in split_docs:
        print(item)
    # embedded_docs = embed_documents(split_docs, embedding_model)
    # The query cell below needs `retriever` and `llm`; create them here so the
    # notebook works from a fresh kernel instead of relying on leftover state.
    vector_store = store_embeddings(split_docs, embedding_model)
    retriever = build_rag_pipeline(vector_store)
    llm = initialize_llm()


page_content='Defining a Problem
A well-defined problem is the one that does not contain ambiguities. All the conditions are clearly specified and it has a clear goal. It is easy to understand and solve.
Given a problem statement, first we need to see whether the problem is defined well or not. If the problem is not defined well then we can use one of the following strategies to define the problem.
Gain Background Knowledge: We try to know the situation and circumstances in which the problem is happening. In this way, we can identify the given state. It also helps to know what a good solution will look like. How we shall be able to measure the solution.
Use Guesses: We try to guess the unknown information through appropriate guesses. These guesses may be bases upon our past experiences.
Draw a Picture: If the problem is not well-defined, we can draw a picture and fill the undefined information. Figure 1-1 shows pictorial representation of a problem.
FOdTTSSGTOdo ded ecedes Can you solv

In [159]:
query = "explain boolean proposition"
response, relevant_texts = query_llm(llm, retriever, query)

# query_llm returns a plain string instead of a message object when nothing is
# retrieved, so fall back to the value itself when it has no `.content`.
print(getattr(response, "content", response))
print("==============================================")
print("\nRelevant text chunks used in the response:")
for text in relevant_texts:
    print("Chunk: ==============================")
    # Show only the first 300 characters of each chunk to keep the output short.
    print(text[:300])

In Boolean Algebra, a Boolean proposition is a statement that can be either true (T) or false (F). It's a fundamental concept in logic and computer science. A Boolean proposition is typically represented by a single variable, such as p or q, and can be combined with other propositions using logical operators like AND (∧), OR (∨), and NOT (¬).

For example, consider the following Boolean propositions:

p: It is raining.
q: The sky is blue.

These propositions can be combined using logical operators to form more complex statements, such as:

p ∧ q: It is raining and the sky is blue. (This statement is true only if both p and q are true.)
p ∨ q: It is raining or the sky is blue. (This statement is true if either p or q is true.)
¬p: It is not raining. (This statement is true if p is false.)

Boolean propositions are used to represent and analyze logical statements in a systematic and rigorous way. They form the basis of digital electronics, computer programming, and many other fields that