In [38]:

import os
import pandas as pd
from langchain.schema import HumanMessage

In [42]:

def load_data_from_files(folder_path):
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    all_data = pd.DataFrame()
    
    # Load each CSV file and concatenate
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        paper_data = pd.read_csv(file_path)
        all_data = pd.concat([all_data, paper_data], ignore_index=True)
        all_data = all_data.dropna()
    return all_data
# Load all data from input_data folder
data_folder = 'input_data'
data = load_data_from_files(data_folder)

# Sample questions for each subject
biology_questions = data[data['Subjects'] == 'Biology'].sample(n=5)


In [51]:
def generate_related_mcq(llm, question, vector_store, k=1):
    """
    Generates a new MCQ based on the semantic search result.
    
    Parameters:
    - llm: The language model instance (e.g., ChatGroq).
    - question: The input question to search related content.
    - vector_store: The vector store for semantic search.
    - k: Number of relevant chunks to retrieve.

    Returns:
    - A new MCQ generated by the language model.
    """
    # Perform semantic search to get relevant chunks
    search_results = vector_store.similarity_search(question, k=k)
    
    # Extract the most relevant chunk's content (can concatenate if k > 1)
    response_chunk = "\n".join([doc.page_content for doc in search_results])
    
    print('=' * 10)  # For debugging
    print(response_chunk)  # Print the retrieved chunk
    print('=' * 10)

    # Formulate the prompt to generate a new MCQ
    prompt = f"""
    Based on the following information: {response_chunk}, generate a new multiple-choice question related to the topic in the following JSON format:
    
    {{
        "question": "Your new question here",
        "options": [
            "Option A",
            "Option B",
            "Option C",
            "Option D"
        ],
        "answer": "A"  # Correct option as one of A, B, C, or D
    }}
    Ensure the question and options are relevant to the topic provided.
    """

    # Use the LLM to generate the new MCQ
    new_mcq_response = llm.invoke([HumanMessage(content=prompt)])

    return new_mcq_response.content

In [52]:
import os
import glob
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma  # Or use FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings

# Load environment variables
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
if not groq_api_key:
    raise ValueError("GROQ API key not found in .env file")

os.environ["GROQ_API_KEY"] = groq_api_key

# Set base directory and folder containing the text files
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "ocrbooks")

# Load all text files from the folder
def load_files_from_directory(directory):
    text_files = glob.glob(os.path.join(directory, "*.txt"))
    docs = []
    for file in text_files:
        try:
            with open(file, "r", encoding="utf-8") as f:
                text = f.read()
        except UnicodeDecodeError:
            with open(file, "r", encoding="ISO-8859-1") as f:  # Fallback to ISO-8859-1
                text = f.read()
        docs.append(text)
    return docs


# Load and read all text files in the ocrbooks folder
documents = load_files_from_directory(DATA_DIR)

# Split documents into semantic chunks using RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust chunk size based on document size
    chunk_overlap=50,  # Overlap for context continuity
)

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
chunks = []
for doc in documents:
    chunks.extend(text_splitter.split_text(doc))


# Create ChromaDB vector store with the chunks
persist_directory = "bio_embeddings"

# # Load or Create Chroma Vector Store
if not os.path.exists(persist_directory):
    vector_store = Chroma.from_texts(chunks, embedding_model, persist_directory=persist_directory)
else:
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)





llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.7)
# Create a RetrievalQA chain using a language model and vector store
retrieval_chain = RetrievalQA.from_chain_type(
    llm= llm, 
    retriever=vector_store.as_retriever(),
    chain_type="stuff"
)

new_mcqs = []

# Iterate through the biology_questions DataFrame
for index, row in biology_questions.iterrows():
    question = row["Question"]
    new_mcq = generate_related_mcq(llm, question, vector_store)
    new_mcqs.append(new_mcq)

# Assuming you want to store new MCQs in a DataFrame
new_mcqs_df = pd.DataFrame(new_mcqs, columns=["New MCQ"])

# Combine the original and new MCQs (if needed)
combined_df = pd.concat([biology_questions, new_mcqs_df], axis=1)

# Display the combined DataFrame
combined_df.head()



Bone : It is the most rigid form of connective tissue. The collagen fibers of bone are hardened by
deposit of calcium phosphate. Bones supporting your arms and legs consist of an outer shell of
compact bone, with spongy bone in the interior. Compact bone is dense and strong and provides
an attachment site for a muscle. Spongy bone is light, rich in blood vessels, and highly porous. The
cavities of spongy bone contain bone marrow where blood cells are formed. There are three types
of cells associated with bone:

Bone-forming cell (osteoblast ), mature bone cell (osteocyte ), and bone dissolving cells (osteoclast

).

Stem Cell Osteoblast

Fig. 16.3 Cells of bone

@)

16. Support and Movement eLearn.Punjab

Early in development, when bone is replacing cartilage, the osteoclasts invade and dissolve the
cartilage. Then osteoblasts replace it with bone. As bones grow, the matrix of bone is hardened
and the osteoblasts are gradually entrapped within it.
1. Receptors

The neuron fibres and ce

Unnamed: 0,ID,Question,Option 1,Option 2,Option 3,Option 4,Subjects,Answers,New MCQ
66,67.0,Bone forming cells are:,Osteocytes,Osteons,Osteoblasts,Osteoclasts,Biology,A,
28,29.0,Taste receptor is an example of,Chemo receptors,Mechano receptors,Nociceptor,Photo receptors,Biology,D,
65,66.0,Which of the following is NOT a bone of upper ...,Ulna,Radius,Humerus,Femur,Biology,C,
13,14.0,Polysaccharides in plants are synthesized by t...,Oxidation,Condensation,Glycolysis,Hydrolysis,Biology,C,
31,32.0,Induced fit model of enzyme activity suggests ...,Can catalyze related reaction,Cannot modify its active sites,Usually belongs to non-regulatory enzyme,Can bind to a single substrate,Biology,C,


In [53]:
combined_df.to_csv('biology_new_mcqs.csv', index=False)