In [1]:
# from dotenv import load_dotenv
# import os
# import pandas as pd
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import Chroma
# from langchain_groq import ChatGroq
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.chains import RetrievalQA
# from langchain.schema import Document
# # from langchain.schema import Metadata

In [2]:
# # Load .env file
# load_dotenv()
# groq_api_key = os.getenv('GROQ_API_KEY')
# if not groq_api_key:
#     raise ValueError("Groq API key not found in .env file")

In [3]:


# # Step 1: Load CSV Data
# df = pd.read_csv("UHS_24.csv")  # Adjust delimiter if needed

# # Step 2: Prepare Chunks for Each MCQ
# def create_chunks(row):
#     question = f"Q: {row['Question']}\n"
#     options = f"Options:\nA. {row['Option 1']}\nB. {row['Option 2']}\nC. {row['Option 3']}\nD. {row['Option 4']}\n"
#     answer = f"Answer: {row['Answers']}\n"
#     chunk = question + options + answer
#     metadata = {"subject": row["Subjects"]}
#     return Document(page_content=chunk, metadata=metadata)

# # Step 3: Create a List of Documents (Chunks)
# docs = [create_chunks(row) for index, row in df.iterrows() if row['Answers'] != "Deleted"]

# # Step 4: Create HuggingFace Embeddings
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# # Step 5: Store Embeddings in Chroma
# persist_directory = "uhs_24_chroma_store"  # Directory to store Chroma DB
# vector_store = Chroma.from_documents(docs, embedding_model, persist_directory=persist_directory)



In [4]:
# # Step 1: Set up Embeddings and Vector Store
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# persist_directory = "uhs_24_chroma_store"  # Ensure this directory has the stored vector DB
# vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

# # Step 2: Initialize the LLM and RetrievalQA System
# llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.7)
# retriever = vector_store.as_retriever()
# qa_system = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# # Step 3: Generate Questions with Structured Formatting
# def generate_question_paper(qa_system, num_questions=10):
#     questions = []
#     for i in range(num_questions):
#         prompt = (
#             "Generate a new multiple choice question based on past paper content. "
#             "The question should align with subject textbooks and be structured clearly for an exam, "
#             "with options labeled as A, B, C, D. Provide a challenging but fair question."
#         )
#         response = qa_system.run(prompt)
#         questions.append(f"Q{i + 1}:\n{response}\n\n")

#     return questions

# # Step 4: Save the Questions to a File
# def save_question_paper(questions, filename="MCQ_Paper.txt"):
#     with open(filename, "w") as file:
#         file.writelines(questions)

#     print(f"Question paper generated and saved as '{filename}'.")

# # Step 5: Generate and Save the Question Paper
# questions = generate_question_paper(qa_system, num_questions=10)
# save_question_paper(questions)


In [2]:
from dotenv import load_dotenv
import os
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.schema import Document

In [3]:
# Load environment variables
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
if not groq_api_key:
    raise ValueError("Groq API key not found in .env file.")

In [4]:
# Step 1: Load CSV Data
df = pd.read_csv("input_data/UHS_24.csv")  # Ensure the correct delimiter is used (default is ',').

# Step 2: Prepare Chunks for Each MCQ
def create_chunks(row):
    question = f"Q: {row['Question']}\n"
    options = (
        f"Options:\nA. {row['Option 1']}\nB. {row['Option 2']}\n"
        f"C. {row['Option 3']}\nD. {row['Option 4']}\n"
    )
    answer = f"Answer: {row['Answers']}\n"
    chunk = question + options + answer
    metadata = {"subject": row["Subjects"]}
    return Document(page_content=chunk, metadata=metadata)

# Step 3: Create a List of Documents (Chunks)
docs = [create_chunks(row) for _, row in df.iterrows() if row['Answers'] != "Deleted"]

print(docs)

[Document(metadata={'subject': 'Biology'}, page_content='Q: Example of viruses having a polyhedral capsid that is with 252 capsomeres is :\nOptions:\nA. Bacteriophage\nB. Influenza virus\nC. Tobacco Mosaic Virus\nD. Adenovirus\nAnswer: A\n'), Document(metadata={'subject': 'Biology'}, page_content='Q: The causative organisms of measles is\nOptions:\nA. Picovirus\nB. Papuvirus\nC. Paramyxovirus\nD. Poxivirus\nAnswer: D\n'), Document(metadata={'subject': 'Biology'}, page_content='Q: In the life cycle of bacteriophage, the lysozymes are required in which of the following steps of infection process\nOptions:\nA. Replication\nB. Penetration\nC. Genome injection\nD. Adsorption\nAnswer: B\n'), Document(metadata={'subject': 'Biology'}, page_content='Q: _____ is transmitted through infected blood and hypodermic syringes\nOptions:\nA. Morbilli virus ( Measles )\nB. HIV\nC. Vibrio cholerae ( Cholera )\nD. Influenza virus\nAnswer: A\n'), Document(metadata={'subject': 'Biology'}, page_content='Q: In

In [5]:
# Step 4: Set up Embeddings and Vector Store
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
persist_directory = "uhs_24_chroma_store"

# # Load or Create Chroma Vector Store
if not os.path.exists(persist_directory):
    vector_store = Chroma.from_documents(docs, embedding_model, persist_directory=persist_directory)
else:
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [6]:
# # Step 5: Initialize the LLM and RetrievalQA System
llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.7)

In [7]:
retriever = vector_store.as_retriever()
qa_system = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [8]:
# # Step 6: Generate Questions with Structured Formatting
def generate_question_paper(qa_system, num_questions=10):
    for i in range(num_questions):
        prompt = (
            f"Generate {num_questions} multiple choice question based on retrieved multiple choice questions "
            "The question should be structured clearly for an exam, "
            "with options labeled as A, B, C, D. Provide a challenging but fair question."
        )
        # try:
        #     response = qa_system.run(prompt)
        #     questions.append(f"Q{i + 1}:\n{response}\n\n")
        # except Exception as e:
        #     print(f"Error generating question {i + 1}: {e}")
        #     questions.append(f"Q{i + 1}:\n[Error generating this question]\n\n")
    response = qa_system.run(prompt)
    return response

In [9]:
# Step 7: Save the Questions to a File
def save_question_paper(questions, filename="MCQ_Paper.txt"):
    with open(filename, "w") as file:
        file.writelines(questions)
    print(f"Question paper generated and saved as '{filename}'.")

In [10]:
# Step 8: Generate and Save the Question Paper
questions = generate_question_paper(qa_system, num_questions=10)
save_question_paper(questions)


  response = qa_system.run(prompt)


Question paper generated and saved as 'MCQ_Paper.txt'.
