In [23]:
import os
import PyPDF2
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.enums import TA_JUSTIFY


In [24]:
# Read configuration (expected to live in a .env file) and fail fast when the
# Groq API key is missing, so later cells do not fail midway through a run.
from dotenv import load_dotenv
import os

load_dotenv()

groq_api_key = os.environ.get('GROQ_API_KEY')
if not groq_api_key:
    # An unset variable and an empty value are both treated as "missing".
    raise ValueError("Groq API key not found in .env file.")

In [25]:
# Module-level cache populated by create_embeddings() so that the embedding
# model and the vector store are built at most once per kernel session.
# Both stay None until create_embeddings() has completed successfully.
GLOBAL_EMBEDDINGS = None  # embedding model instance used to build GLOBAL_DB
GLOBAL_DB = None  # vector store returned by create_embeddings()

In [26]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string containing the extracted text of all pages, with no
        separator inserted between pages. Pages that yield no text contribute
        an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe should extract_text() hand back None
        # for a page without a text layer (e.g. a scanned image).
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)

In [27]:
def create_embeddings(pdf_paths):
    """Build the vector store for the given PDFs once and cache it globally.

    The first successful call extracts text from every path, splits it into
    chunks, embeds the chunks and stores the result in ``GLOBAL_DB`` (and the
    embedding model in ``GLOBAL_EMBEDDINGS``).

    Note: once ``GLOBAL_DB`` is set, every later call returns the cached store
    and ignores ``pdf_paths`` entirely; reset ``GLOBAL_DB`` to ``None`` to
    rebuild from a different set of files.

    Args:
        pdf_paths: Iterable of PDF file paths to index.

    Returns:
        The Chroma vector store built from the extracted text.

    Raises:
        ValueError: If no text chunks could be produced (empty ``pdf_paths``
            or PDFs without extractable text).
    """
    global GLOBAL_EMBEDDINGS
    global GLOBAL_DB

    # If embeddings are already created, just return them
    if GLOBAL_DB is not None:
        print("Using already created embeddings.")
        return GLOBAL_DB

    print("Creating new embeddings...")

    # Each document's text is followed by a newline, as a document boundary.
    all_texts = "".join(
        extract_text_from_pdf(pdf_path) + "\n" for pdf_path in pdf_paths
    )

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_text(all_texts)

    # Fail with a clear message before loading the embedding model instead of
    # handing an empty corpus to the vector store.
    if not texts:
        raise ValueError(
            "No text could be extracted from the provided PDF files; "
            "cannot create embeddings."
        )

    embeddings = HuggingFaceEmbeddings()
    db = Chroma.from_texts(texts, embeddings)

    # Store embeddings in the global variables so that they can be reused
    GLOBAL_EMBEDDINGS = embeddings
    GLOBAL_DB = db

    return db

In [28]:
def create_qa_system(db):
    """Wrap an existing vector store in a retrieval-augmented QA chain.

    Args:
        db: Vector store exposing ``as_retriever``; the three best-matching
            chunks (``k=3``) are retrieved per query.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` driven by a Groq chat model.
    """
    chat_model = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.7)
    top_k_retriever = db.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=top_k_retriever,
    )


In [29]:
def generate_objective_questions(qa_system):
    """Ask the QA chain for a 10-question multiple-choice paper.

    Args:
        qa_system: Object exposing ``run(prompt)``.

    Returns:
        Whatever ``qa_system.run`` returns for the MCQ prompt.
    """
    mcq_prompt = (
        "Generate a new multiple choice exam which has 10 questions based on the content of the past papers, "
        "ensuring it's within the scope of the subject books. "
        "The questions should be challenging but fair. "
        "Format the questions in a structured manner suitable for an exam paper, including clear instructions. "
        "Give MCQs in the form of a, b, c, d options. "
        "Also, space between each question should be at least 2 lines in proper format."
    )
    return qa_system.run(mcq_prompt)

In [30]:
def generate_subjective_questions(qa_system):
    """Ask the QA chain for a two-section subjective question paper.

    Args:
        qa_system: Object exposing ``run(prompt)``.

    Returns:
        Whatever ``qa_system.run`` returns for the subjective-paper prompt.
    """
    subjective_prompt = (
        "Generate a new subjective question paper based on the contents of the past papers, "
        "ensuring it's within the scope of the subject books. "
        "There should be two sections in the paper. "
        "In the first section, there are three parts of short questions, "
        "and in each part of short questions, there are six short questions. "
        "Then in the second section, there are three long questions. "
        "The questions should be challenging but fair. "
        "Format the questions in a structured manner suitable for an exam paper. "
        "Also, space between each question should be at least 2 lines. "
        "Do not repeat the questions."
    )
    return qa_system.run(subjective_prompt)

In [31]:
def _append_question_section(content, heading, questions, heading_style, body_style):
    """Append a headed section with one paragraph per non-blank line of ``questions``."""
    # Local import keeps this cell self-contained (stdlib only).
    from xml.sax.saxutils import escape

    content.append(Paragraph(heading, heading_style))
    content.append(Spacer(1, 12))
    for line in questions.split('\n'):
        if line.strip():
            # Paragraph interprets its text as XML-like markup, so literal
            # '&', '<' and '>' coming from the model must be escaped.
            content.append(Paragraph(escape(line), body_style))
        # A small spacer follows every source line, blank or not.
        content.append(Spacer(1, 6))


def create_combined_question_paper_pdf(objective_questions, subjective_questions, output_path):
    """Write the generated objective and subjective questions to a single PDF.

    The document has a title, then "Section A: Objective Questions" and
    "Section B: Subjective Questions". Each non-blank line of the input text
    becomes its own justified paragraph.

    Args:
        objective_questions: Newline-separated text for section A.
        subjective_questions: Newline-separated text for section B.
        output_path: Destination path of the PDF file.
    """
    doc = SimpleDocTemplate(output_path, pagesize=letter)
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name='Justify', alignment=TA_JUSTIFY))

    content = [
        Paragraph("Generated Question Paper", styles['Title']),
        Spacer(1, 12),
    ]

    # Objective Questions Section
    _append_question_section(
        content,
        "Section A: Objective Questions",
        objective_questions,
        styles['Heading1'],
        styles['Justify'],
    )

    content.append(Spacer(1, 12))

    # Subjective Questions Section
    _append_question_section(
        content,
        "Section B: Subjective Questions",
        subjective_questions,
        styles['Heading1'],
        styles['Justify'],
    )

    doc.build(content)

In [32]:
def get_pdf_files_from_folder(folder_path):
    """Return the paths of all PDF files directly inside ``folder_path``.

    Matching is case-insensitive on the ``.pdf`` suffix. Only regular files
    are returned (a directory whose name ends in ``.pdf`` is skipped), and the
    result is sorted by file name so that repeated runs see the documents in
    the same order.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        List of ``folder_path``-joined paths, sorted by file name.
    """
    pdf_files = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for entry in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, entry)
        if entry.lower().endswith('.pdf') and os.path.isfile(full_path):
            pdf_files.append(full_path)
    return pdf_files

In [None]:
def main():
    """Run the full pipeline: index the PDFs, generate both papers, write one PDF.

    Inputs are two folders of past papers plus the subject book(s); the output
    is a single combined question paper saved in the working directory.
    """
    # --- Inputs and output -------------------------------------------------
    objective_past_paper_folder = "chemistry_mcqs_output_pdfs"  # objective past paper PDFs
    subjective_past_paper_folder = "chemistry_sub_output_pdfs"  # subjective past paper PDFs
    subject_book_paths = ["chemistry9.pdf"]  # subject book PDFs
    output_pdf_path = "2_combined_question_paper.pdf"

    objective_past_paper_paths = get_pdf_files_from_folder(objective_past_paper_folder)
    subjective_past_paper_paths = get_pdf_files_from_folder(subjective_past_paper_folder)

    # --- Index everything once ---------------------------------------------
    corpus_paths = [
        *objective_past_paper_paths,
        *subjective_past_paper_paths,
        *subject_book_paths,
    ]
    vector_db = create_embeddings(corpus_paths)

    # --- One QA chain per paper type, both on the same vector store --------
    objective_chain = create_qa_system(vector_db)
    subjective_chain = create_qa_system(vector_db)

    # Arguments are evaluated left to right: objective first, then subjective.
    create_combined_question_paper_pdf(
        generate_objective_questions(objective_chain),
        generate_subjective_questions(subjective_chain),
        output_pdf_path,
    )

    print(f"Generated combined question paper has been saved to {output_pdf_path}")
    print(f"Used {len(objective_past_paper_paths)} objective past papers from the folder '{objective_past_paper_folder}'")
    print(f"Used {len(subjective_past_paper_paths)} subjective past papers from the folder '{subjective_past_paper_folder}'")


if __name__ == "__main__":
    main()
