In [43]:
import os
import pandas as pd
from dotenv import load_dotenv

In [44]:
# Load environment variables
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
if not groq_api_key:
    raise ValueError("GROQ API key not found in .env file")

os.environ["GROQ_API_KEY"] = groq_api_key
# BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
BASE_DIR = os.getcwd()

In [45]:
# Function to load all CSV files from input_data folder
def load_data_from_files(folder_path):
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    all_data = pd.DataFrame()
    
    # Load each CSV file and concatenate
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        paper_data = pd.read_csv(file_path)
        all_data = pd.concat([all_data, paper_data], ignore_index=True)
        all_data = all_data.dropna()
    return all_data

# Creating Rag Chains for different subjects

In [46]:
import pickle
import re

# imports for creating pipeline for rag
from langchain.embeddings import HuggingFaceEmbeddings      # for embeddings
from langchain.vectorstores import Chroma                  # for vector store
from langchain.document_loaders import TextLoader        # for loading text
from langchain.text_splitter import RecursiveCharacterTextSplitter  # acting as base class for splitting text 
from langchain_groq import ChatGroq                         # for initializing LLM from groq
from langchain.schema import Document                   # converting simple text to document object

# For compressing the context of retrieved documents
from langchain.retrievers import ContextualCompressionRetriever     # for compressing retrieved documents context
from langchain.retrievers.document_compressors import LLMChainExtractor     # used in compression

# For creating the retrieval chain for chat history
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# For creating history aware retriever
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage     # for messages in history aware retriever

In [47]:
embedding_model = None
documents = None
text_splitter = None
split_docs = None
vector_store = None
embedded_docs = None
llm = None
retriever = None
contextualize_q_system_prompt = None
contextualize_q_prompt = None
history_aware_retriever = None
system_prompt = None
qa_prompt = None
question_answer_chain = None
book_file_path = 'input_data/9thComputerScience_cleaned.txt'

In [48]:
def initialize_embeddings_model():
    global embedding_model
    if embedding_model is None:
        embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Load the Text Data and Create Documents
def load_documents(file_path):
    loader = TextLoader(file_path,encoding='utf-8')
    documents = loader.load()
    return documents

# Custom RecursiveCharacterTextSplitter with regex patterns for subtopics and chapters
class CustomTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, **kwargs):
        subtopic_pattern = re.compile(r'(\d+(\.\d+)+)')
        chapter_separator = 'chapter end -------------------------------------'

        # Initialize with any other parameters, and add your separators
        super().__init__(separators=[chapter_separator], **kwargs)
        self.subtopic_pattern = subtopic_pattern

    def split_text(self, text):
        # First, split by chapters
        texts = super().split_text(text)
        documents = []
        
        # For each chapter, split by subtopic using the subtopic regex
        chapter_number = 1
        for chapter in texts:
            subtopic_splits = self._split_by_subtopic(chapter, chapter_number)
            documents.extend(subtopic_splits)
            chapter_number += 1
        
        return documents

    def _split_by_subtopic(self, text, chapter_number):
        # Use the subtopic regex to split text
        matches = list(self.subtopic_pattern.finditer(text))
        if not matches:
            # No subtopics found, return the full text as a single Document
            return [Document(page_content=text.strip(), metadata={"chapter": chapter_number})]
        
        subtopics = []
        start_idx = 0
        subtopic_number = 1
        
        for match in matches:
            end_idx = match.start()
            if start_idx != end_idx:
                subtopics.append(Document(
                    page_content=text[start_idx:end_idx].strip(),
                    metadata={"chapter": chapter_number, "subtopic": subtopic_number}
                ))
            start_idx = end_idx
            subtopic_number += 1
            
        # Append the remaining part as a subtopic
        subtopics.append(Document(
            page_content=text[start_idx:].strip(),
            metadata={"chapter": chapter_number, "subtopic": subtopic_number}
        ))
        
        return subtopics

# Create embeddings and handle storage
def embed_documents(split_docs, embedding_model):
    EMBEDDINGS_FOLDER = "entrytest\\embeddings"
    EMBEDDINGS_FILE = os.path.join(EMBEDDINGS_FOLDER, "9thComputerScience.pkl")

    if not os.path.exists(EMBEDDINGS_FOLDER):
        os.makedirs(EMBEDDINGS_FOLDER)

    if os.path.exists(EMBEDDINGS_FILE):
        print(f"Loading existing embeddings from {EMBEDDINGS_FILE}...")
        with open(EMBEDDINGS_FILE, 'rb') as f:
            embedded_docs = pickle.load(f)
            print("Embeddings loaded successfully.")
    else:
        print("Creating new embeddings...")
        texts = [doc.page_content for doc in split_docs]
        embedded_docs = embedding_model.embed_documents(texts)

        with open(EMBEDDINGS_FILE, 'wb') as f:
            pickle.dump(embedded_docs, f)
            print(f"Embeddings saved to {EMBEDDINGS_FILE}")

    return embedded_docs

# Store embeddings in Chroma vector store
def store_embeddings(split_docs, embedding_model):
    vector_store = Chroma.from_documents(split_docs, embedding_model) 
    return vector_store

def getting_retriever(llm,vector_store):
    """Opiton 01: Creating ContextualCompressionRetriever
    Contextual Compression will find the relevant records and only contains the relevant data from chunks instead of whole chunks
    Maximum Marginal Relevance (mmr) is used to get diverse set of documents.
    Option 02: SelfQueryRetrieval for filtering based on sources"""
    # Option 01
    compressor = LLMChainExtractor.from_llm(llm)
    compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vector_store.as_retriever(search_kwargs={"k": 10})
    #search_type = "mmr"
    )
    return compression_retriever

# Initialize the LLM
def initialize_llm(model_name="llama-3.1-70b-versatile", temperature=0):
    llm = ChatGroq(
        model= model_name,
        temperature=temperature,
    )
    return llm

def initialize_documents(book_file_path):
    global documents
    # Load your document
    documents = load_documents(book_file_path)
def initialize_text_splitter():
    global text_splitter
    text_splitter = CustomTextSplitter(chunk_size=1000, chunk_overlap=0)

def intialize_split_docs():
    global split_docs, documents, text_splitter
    split_docs = text_splitter.split_text(documents[0].page_content)

def initialize_vector_store():
    global vector_store, split_docs, embedding_model
    vector_store = store_embeddings(split_docs, embedding_model)
def initialize_embedded_docs():
    global embedded_docs, split_docs, embedding_model
    embedded_docs = embed_documents(split_docs, embedding_model)

def initialize_chat_llm():
    global llm
    llm = initialize_llm()

def initialize_retriever():
    # Create the retriever having contextual compression
    global retriever, llm, vector_store
    retriever = getting_retriever(llm, vector_store)

def initialize_contextualize_q_system_prompt():
    global contextualize_q_system_prompt
    contextualize_q_system_prompt = (
        "Act as a conversational assistant similar to ChatGPT. Engage in natural dialogue and answer questions based on the context provided through the chat history or retrieved using Retrieval-Augmented Generation (RAG). If the relevant context is not found either in the conversation or via RAG, respond by stating that the information is unavailable or ask for more clarification from the user. Do not provide speculative or out-of-context information. Always ensure responses are precise and contextually relevant."
    )

def initialize_contextualize_q_prompt():
    global contextualize_q_prompt
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

def initialize_history_aware_retriever():
    # Creating a new retirever that is aware of the chat history. Rest of the things are same.
    global llm, retriever, contextualize_q_prompt, history_aware_retriever
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

def initialize_system_prompt():
    global system_prompt
    system_prompt = (
        "Act as a conversational assistant similar to ChatGPT. Engage in natural dialogue and answer questions based on the context provided through the chat history or retrieved using Retrieval-Augmented Generation (RAG). If the relevant context is not found either in the conversation or via RAG, respond by stating that the information is unavailable or ask for more clarification from the user. Do not provide speculative or out-of-context information. Always ensure responses are precise and contextually relevant."
        "\n\n"
        "{context}"
    )

def initialize_qa_prompt():
    global qa_prompt
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

def initialize_question_answer_chain():
    global llm, qa_prompt, question_answer_chain
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

In [49]:
class RAGChainInitializer:
    _instance = None
    _rag_chain = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(RAGChainInitializer, cls).__new__(cls)
        return cls._instance

    def initialize_chain(self, book_file_path):
        if self._rag_chain is None:
            initialize_embeddings_model()
            initialize_documents(book_file_path)
            initialize_text_splitter()
            intialize_split_docs()
            initialize_vector_store()
            initialize_chat_llm()
            initialize_retriever()
            initialize_system_prompt()
            initialize_qa_prompt()
            initialize_question_answer_chain()
            self._rag_chain = create_retrieval_chain(retriever, question_answer_chain)
        return self._rag_chain

# usage
def create_rag_chain(book_file_path):
    initializer = RAGChainInitializer()
    return initializer.initialize_chain(book_file_path)

In [50]:
rag_chain = create_rag_chain('input_data/9thComputerScience_cleaned.txt')



In [51]:
result = rag_chain.invoke({"input": "can you give me an interesting topic from 1st chapter"})
result

{'input': 'can you give me an interesting topic from 1st chapter',
 'context': [Document(metadata={'chapter': 1, 'subtopic': 2}, page_content='1.1 Problem Solving Steps\nIn order to solve a problem, it is important to follow a systematic approach. In the following we discuss different steps that we can follow to solve a problem systematically.'),
  Document(metadata={'chapter': 1, 'subtopic': 2}, page_content='1.1 Problem Solving Steps\nIn order to solve a problem, it is important to follow a systematic approach. In the following we discuss different steps that we can follow to solve a problem systematically.')],
 'answer': 'Based on the provided text, it seems like we\'re discussing the topic of "Problem Solving Steps" from the 1st chapter. \n\nA potentially interesting topic from this chapter could be the actual steps involved in solving a problem systematically. Unfortunately, the provided text doesn\'t go into detail about the specific steps. However, I can try to provide some gene

In [52]:
result = rag_chain.invoke({"input": "can you give concise answer of previous question"})
result

{'input': 'can you give concise answer of previous question',
 'context': [],
 'answer': "There is no previous question to provide an answer for. This conversation has just started. Please feel free to ask a question, and I'll do my best to provide a concise answer."}

In [53]:
# Define the number of questions for each subject
subjects_questions = {
    "Biology": 68,
    "Chemistry": 54,
    "Physics": 54,
    "English": 18,
    "Logical Reasoning": 6
}

# Load all data from input_data folder
# data_folder = os.path.join(BASE_DIR, 'entrytest/input_data')
data_folder = 'input_data'
data = load_data_from_files(data_folder)

# Sample questions for each subject
biology_questions = data[data['Subject'] == 'Biology'].sample(n=subjects_questions["Biology"])
chemistry_questions = data[data['Subject'] == 'Chemistry'].sample(n=subjects_questions["Chemistry"])
physics_questions = data[data['Subject'] == 'Physics'].sample(n=subjects_questions["Physics"])
english_questions = data[data['Subject'] == 'English'].sample(n=subjects_questions["English"])
logical_reasoning_questions = data[data['Subject'] == 'Logical Reasoning'].sample(n=subjects_questions["Logical Reasoning"])
# Combine the sampled questions while maintaining subject order
combined_questions = pd.concat([
    biology_questions, chemistry_questions, 
    physics_questions, english_questions, 
    logical_reasoning_questions
])
# Assign sequential IDs to the questions
combined_questions['ID'] = range(1, len(combined_questions) + 1)
# Convert to a list of dictionaries for the response
mcq_paper = []
answer_map = {
        'A': 0,
        'B': 1,
        'C': 2,
        'D': 3
    }
for _, item in combined_questions.iterrows():
    mcq_paper.append({
        "id": item['ID'],
        "question": item['Question'],
        "options": [
            item['Option 1'],
            item['Option 2'],
            item['Option 3'],
            item['Option 4']
        ],
        "subject": item['Subject'],
        "answer": answer_map[item['Answers']],
    })

In [None]:
mcq_paper

[{'id': 1,
  'question': 'Which of the following is a muscle component that act as store for energy?',
  'options': ['Myoglobin', 'Creatine-PO4', 'ATP', 'Creatinine-PO4'],
  'subject': 'Biology',
  'answer': 2},
 {'id': 2,
  'question': 'Most of the monosaccharides form a ring structure when in solution. For example ribose will form a five cornered ring known as:',
  'options': ['Ribofuranose',
   'Acetaldehyde',
   'Glucopyranose',
   'Glyceraldehyde'],
  'subject': 'Biology',
  'answer': 2},
 {'id': 3,
  'question': 'Zinc ion attached at the active site of the enzyme carboxypeptidase. The zinc ion functions are',
  'options': ['An activator',
   'A coenzyme molecule',
   'An inhibitor molecule',
   'Controller or Allosteric site'],
  'subject': 'Biology',
  'answer': 2},
 {'id': 4,
  'question': 'Only those genes can assort independently whose loci are on:',
  'options': ['Homologous chromosomes',
   'Same chromosomes',
   'Non-homologous chromosomes',
   'Same chromatids'],
  'subje