In [1]:
import re
import os
import json
from math import ceil
from pdfminer.high_level import extract_text
from reportlab.lib.pagesizes import letter
from reportlab.platypus import Paragraph, SimpleDocTemplate, PageBreak, KeepTogether
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT
from reportlab.lib.units import inch
from reportlab.lib import colors

def extract_pdf_text(pdf_path):
    """Extract the text of the PDF at ``pdf_path`` and return it cleaned.

    The raw text produced by ``extract_text`` is passed straight through
    ``clean_text`` so callers only ever see the normalised form.
    """
    return clean_text(extract_text(pdf_path))

def clean_text(text):
    """Strip page/topic markers and normalise whitespace in ``text``.

    Steps, in order:
      1. delete every 'Page N' / 'Page N of M' marker,
      2. delete every 'Topic N' marker,
      3. collapse runs of spaces and tabs into a single space,
      4. trim each line and drop lines that end up empty.

    Returns the surviving lines joined with newlines.
    """
    # (pattern, replacement) pairs applied one after another.
    substitutions = (
        (r'Page \d+( of \d+)?', ''),
        (r'Topic \d+', ''),
        (r'[ \t]+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    kept_lines = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return '\n'.join(kept_lines).strip()

def split_questions(text):
    """Break ``text`` into one record per 'Question #<number>' heading.

    Returns a list of dicts with two keys: 'heading' (the stripped heading
    text) and 'content' (everything between that heading and the next one,
    unmodified). Text before the first heading is discarded.
    """
    # The capturing group makes re.split keep the headings, so the result
    # alternates: preamble, heading, body, heading, body, ...
    pieces = re.split(r'(Question #\d+)', text)
    headings = pieces[1::2]
    bodies = pieces[2::2]
    return [
        {
            'heading': title.strip(),
            'content': bodies[pos] if pos < len(bodies) else '',
        }
        for pos, title in enumerate(headings)
    ]

def parse_question_content(question):
    """Split one question record into its stem and its lettered options.

    Args:
        question: Mapping with a 'content' key holding the text that
            followed the question heading.

    Returns:
        A ``(question_body, options)`` tuple. ``question_body`` is the
        stripped text before the first option; ``options`` maps each option
        letter to its text with internal whitespace collapsed to single
        spaces. When no option is found, ``options`` is empty and the whole
        stripped content is returned as the body.
    """
    content = question['content']

    # An option only counts when its letter starts a line. Searching for
    # "<letter>. " anywhere in the text would cut the stem at things like
    # "U.S. " or "Region A. " and invent a bogus option from the remainder.
    answer_start = re.search(r'^[A-Z]\.\s', content, re.MULTILINE)
    if answer_start is None:
        return content.strip(), {}

    question_body = content[:answer_start.start()].strip()
    answer_choices_text = content[answer_start.start():].strip()

    # Remove any 'Page N' footers that landed between the options.
    answer_choices_text = re.sub(r'Page \d+( of \d+)?', '', answer_choices_text)

    options = {}
    # Zero-width split in front of every line-initial "<letter>. ", so an
    # option that wraps over several lines stays in one piece.
    for choice in re.split(r'^(?=[A-Z]\.\s)', answer_choices_text, flags=re.MULTILINE):
        choice = choice.strip()
        if not choice:
            continue
        option_match = re.match(r'^([A-Z])\.\s+(.*)', choice, re.DOTALL)
        if option_match:
            option_letter = option_match.group(1)
            # Collapse line breaks and repeated spaces inside the option text.
            options[option_letter] = ' '.join(option_match.group(2).split())
    return question_body, options

def read_answer_key(answer_key_path):
    """Load an answer key file into a ``{question_number: [letters]}`` dict.

    Each non-blank line is expected to look like ``12. A`` or ``12. A, C``
    (letters are matched case-insensitively and returned upper-cased).
    Lines that do not fit are reported with a printed warning and skipped.
    A missing file is reported with a printed message and yields an empty
    dict.
    """
    key_line = r'^(\d+)\.\s*([A-Z](?:,\s*[A-Z])*)$'
    answers_by_number = {}
    try:
        with open(answer_key_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                entry = raw_line.strip()
                if not entry:
                    continue
                parsed = re.match(key_line, entry, re.IGNORECASE)
                if parsed is None:
                    print(f"Warning: Could not parse line in answer key: {entry}")
                    continue
                number_text, letters = parsed.groups()
                answers_by_number[int(number_text)] = [
                    letter_text.strip().upper()
                    for letter_text in letters.split(',')
                ]
    except FileNotFoundError:
        print(f"Answer key file not found: {answer_key_path}")
    return answers_by_number

def save_questions_to_json(questions_by_part, output_json_path):
    """Write ``questions_by_part`` to ``output_json_path`` as JSON.

    The file is UTF-8 encoded, indented by four spaces, and non-ASCII
    characters are written as-is rather than escaped. A confirmation line is
    printed once the file has been written.
    """
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        serialized = json.dumps(questions_by_part, indent=4, ensure_ascii=False)
        json_file.write(serialized)
    print(f"Questions saved to {output_json_path}")
def divide_questions(questions, num_pdfs):
    """Cut ``questions`` into ``num_pdfs`` consecutive groups.

    Every group holds ``ceil(len(questions) / num_pdfs)`` items except the
    trailing ones, which hold whatever is left (possibly nothing). The
    returned list always has ``num_pdfs`` entries.
    """
    chunk_size = ceil(len(questions) / num_pdfs)
    # Slicing clamps at the end of the sequence, so short or empty trailing
    # groups need no special handling.
    return [
        questions[part * chunk_size:(part + 1) * chunk_size]
        for part in range(num_pdfs)
    ]
def create_pdf_formatted(questions, output_path, pdf_title):
    """Render ``questions`` into a PDF at ``output_path``, one per page.

    Args:
        questions: Iterable of dicts with 'heading' and 'content' keys, as
            produced by ``split_questions``.
        output_path: Destination path of the PDF file.
        pdf_title: Title stored in the PDF document metadata.

    Each question gets a bold heading, a justified body and one indented
    paragraph per answer option. Every page carries a 'Page N' footer.
    """
    # Imported locally so this function does not depend on a file-level
    # import being added elsewhere.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_path, pagesize=letter,
                            rightMargin=inch, leftMargin=inch,
                            topMargin=inch, bottomMargin=inch,
                            title=pdf_title)

    styles = getSampleStyleSheet()
    # Define custom styles
    styles.add(ParagraphStyle(name='QuestionHeading', fontSize=14, leading=16, spaceAfter=12, alignment=TA_LEFT, fontName='Helvetica-Bold'))
    styles.add(ParagraphStyle(name='QuestionBody', fontSize=12, leading=14, spaceAfter=12, alignment=TA_JUSTIFY))
    styles.add(ParagraphStyle(name='AnswerOption', fontSize=12, leading=14, spaceAfter=6, leftIndent=20, alignment=TA_JUSTIFY))

    # Options are recognised only at the start of a line. Matching
    # "<letter>. " anywhere would split on text such as "U.S. " inside the
    # question body or inside an option.
    first_option = re.compile(r'^[A-Z]\.\s', re.MULTILINE)
    option_boundary = re.compile(r'^(?=[A-Z]\.\s)', re.MULTILINE)
    option_parts = re.compile(r'^([A-Z]\.\s)(.*)', re.DOTALL)

    elements = []

    for idx, question in enumerate(questions):
        # Start each question on a new page
        if idx > 0:
            elements.append(PageBreak())

        # clean_text also removes 'Page N' / 'Topic N' markers and blank lines
        content = clean_text(question['content'])

        answer_start = first_option.search(content)
        if answer_start:
            question_body = content[:answer_start.start()].strip()
            answer_choices_text = content[answer_start.start():].strip()
        else:
            # If no answer choices found, treat the whole content as question body
            question_body = content
            answer_choices_text = ''

        # Paragraph interprets its text as markup, so literal '<', '>' and
        # '&' coming from the source PDF are escaped before being rendered.
        question_flowables = [
            Paragraph(escape(question['heading']), styles['QuestionHeading']),
            Paragraph(escape(question_body), styles['QuestionBody']),
        ]

        if answer_choices_text:
            for choice in option_boundary.split(answer_choices_text):
                choice = choice.strip()
                if not choice:
                    continue
                option_match = option_parts.match(choice)
                if option_match:
                    option_letter = option_match.group(1)
                    option_text = option_match.group(2).strip()
                    formatted_option = f'<b>{escape(option_letter)}</b> {escape(option_text)}'
                else:
                    formatted_option = escape(choice)
                question_flowables.append(Paragraph(formatted_option, styles['AnswerOption']))

        # Use KeepTogether to try to keep the question on the same page
        elements.append(KeepTogether(question_flowables))

    def draw_page_number(canvas, _document):
        # Footer drawn inside the bottom margin, aligned with the right margin.
        canvas.saveState()
        canvas.setFont('Helvetica', 9)
        canvas.drawRightString(letter[0] - inch, 0.5 * inch, f'Page {canvas.getPageNumber()}')
        canvas.restoreState()

    # Build the PDF with page numbers; without this call nothing is written.
    doc.build(elements, onFirstPage=draw_page_number, onLaterPages=draw_page_number)
def main():
    """Run the whole pipeline: extract, split, render PDFs, export JSON.

    Reads the hard-coded source PDF, splits its text into questions, divides
    them into a fixed number of parts, renders one PDF per part, pairs each
    part with its ``part<N>_answers.txt`` key when that file exists in the
    output directory, and finally writes all parts to
    ``questions_by_part.json``. Progress and warnings are printed.
    """
    # Path to the large PDF
    source_pdf = r'C:\Users\beain\Documents\AWS Certified Solutions Architect - Associate SAA-C03_Questions.pdf'
    # Directory to store output PDFs and JSON
    output_dir = r'C:\Users\beain\Documents\AWS\output_pdfs'  # Update this path if needed
    os.makedirs(output_dir, exist_ok=True)
    part_count = 12  # Number of parts/pdfs
    questions_by_part = {}

    # Step 1: pull the text out of the source PDF
    print("Extracting text from the large PDF...")
    text = extract_pdf_text(source_pdf)
    if not text.strip():
        print("No text extracted from the PDF. The PDF might be scanned or image-based.")
        return

    # Step 2: cut the text at every question heading
    print("Splitting text into questions...")
    questions = split_questions(text)
    if not questions:
        print("No questions found. Please check the regular expression and ensure it matches your question headings.")
        return

    # Step 3: distribute the questions over the parts
    print(f"Dividing {len(questions)} questions into {part_count} PDFs...")
    divided_questions = divide_questions(questions, part_count)

    # Step 4: one PDF per part, plus the structured records for the JSON file
    print("Creating formatted PDF files and extracting questions...")
    for part_num, question_group in enumerate(divided_questions, start=1):
        output_pdf_path = os.path.join(output_dir, f'practice_exam_part_{part_num}.pdf')
        create_pdf_formatted(question_group, output_pdf_path, f'Practice Exam Part {part_num}')
        print(f'Created {output_pdf_path}')

        # The answer key is optional; without it every answer list is empty.
        answer_key_path = os.path.join(output_dir, f'part{part_num}_answers.txt')
        answer_key = {}
        if os.path.exists(answer_key_path):
            print(f"Reading answer key for Part {part_num}")
            answer_key = read_answer_key(answer_key_path)
        else:
            print(f"Answer key for Part {part_num} not found.")

        part_records = []
        # Question numbers restart at 1 within each part.
        for question_number, question in enumerate(question_group, start=1):
            question_body, options = parse_question_content(question)
            correct_answer = answer_key.get(question_number, [])

            if not correct_answer:
                print(f"Warning: No correct answer found for question {question_number} in Part {part_num}. Check the answer key.")

            part_records.append({
                'question_number': question_number,
                'question_text': question_body,
                'options': options,
                'correct_answer': correct_answer,  # list of option letters
            })

        questions_by_part[f'Part {part_num}'] = part_records

    if questions_by_part:
        output_json_path = os.path.join(output_dir, 'questions_by_part.json')
        save_questions_to_json(questions_by_part, output_json_path)
    else:
        print("No questions were extracted from any PDF files.")

    print("All PDFs have been created and questions extracted successfully.")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()


Extracting text from the large PDF...
Splitting text into questions...
Dividing 1016 questions into 12 PDFs...
Creating formatted PDF files and extracting questions...
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_1.pdf
Reading answer key for Part 1
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_2.pdf
Reading answer key for Part 2
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_3.pdf
Reading answer key for Part 3
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_4.pdf
Reading answer key for Part 4
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_5.pdf
Reading answer key for Part 5
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_6.pdf
Reading answer key for Part 6
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_7.pdf
Reading answer key for Part 7
Created C:\Users\beain\Documents\AWS\output_pdfs\practice_exam_part_8.pdf
Reading answer key for Part 8
