In [2]:
# 1. Install required libraries and system tools
# 'poppler-utils' is required by pdf2image to convert PDF pages into images.
!apt-get install -y poppler-utils
!pip install openai pypdf pdf2image

import os
import base64
import io
import re
from openai import OpenAI
from pypdf import PdfReader
from pdf2image import convert_from_path

# --- Configuration ---
# Model identifiers passed as the `model` argument of the chat-completions call.
# Change them here; the rest of the notebook only refers to these constants.

# 1. Standard Vision Model
# Used for pages routed to vision (image) analysis when no override is requested.
VISION_MODEL_VERSION = "gpt-4o"

# 2. Latest/SOTA Model
# Replaces the standard vision model only when the caller passes
# use_latest_model=True; text-only pages never use it.
# NOTE(review): the original comment calls this a hypothetical version -
# confirm the identifier is available to your account before enabling the override.
LATEST_MODEL_VERSION = "gpt-5.1"

# 3. Cost-Effective Text Model
# Used for pages routed to text-only analysis, to keep cost and latency low.
TEXT_MODEL_VERSION = "gpt-4o-mini"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libpoppler-dev libpoppler-private-dev libpoppler118
The following NEW packages will be installed:
  poppler-utils
The following packages will be upgraded:
  libpoppler-dev libpoppler-private-dev libpoppler118
3 upgraded, 1 newly installed, 0 to remove and 95 not upgraded.
Need to get 1,469 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libpoppler-private-dev amd64 22.02.0-2ubuntu0.12 [199 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libpoppler-dev amd64 22.02.0-2ubuntu0.12 [5,186 B]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libpoppler118 amd64 22.02.0-2ubuntu0.12 [1,079 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 

In [3]:
# --- API Key Setup ---
# Retrieves the OpenAI API key from Kaggle Secrets when available and exports it
# as the 'OPENAI_API_KEY' environment variable.
# If running locally, ensure 'OPENAI_API_KEY' is already set in your environment.
# The key itself is never printed; only a status message is shown.

try:
    # kaggle_secrets only exists inside Kaggle kernels; the import fails elsewhere.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value = user_secrets.get_secret("OPENAI_API_KEY")

    # Export the key so that it can be read back from the environment below.
    os.environ["OPENAI_API_KEY"] = secret_value
    print("Successfully retrieved API key from Kaggle Secrets.")
except ImportError:
    # Not running on Kaggle (e.g., local machine): keep whatever the environment provides.
    print("Kaggle Secrets not found. Relying on local environment variables.")
except Exception as e:
    # Any other failure while reading the secret is reported but not re-raised,
    # so execution continues to the client creation below.
    print(f"An error occurred while loading secrets: {e}")

# Initialize the OpenAI client.
# os.environ.get returns None when the variable is unset.
# NOTE(review): confirm how the client behaves when api_key is None.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

Successfully retrieved API key from Kaggle Secrets.


In [4]:
# --- Utility Functions ---

def encode_image_to_base64(pil_image):
    """
    Encodes a PIL image to a Base64 string for API transmission.

    The image is serialized as a JPEG (quality 70) into an in-memory buffer and
    the resulting bytes are Base64-encoded. Images whose mode is neither 'RGB'
    nor 'L' (for example 'RGBA' or palette 'P' images) are converted to 'RGB'
    first, because saving such modes as JPEG fails.

    Args:
        pil_image (PIL.Image): The image object to encode. It is not modified;
            a converted copy is used when a mode conversion is required.

    Returns:
        str: Base64 encoded string of the JPEG bytes.
    """
    # JPEG has no alpha channel or palette, so normalize unsupported modes up
    # front instead of letting save() raise.
    if pil_image.mode not in ("RGB", "L"):
        pil_image = pil_image.convert("RGB")

    buffered = io.BytesIO()
    # Compress as JPEG to reduce token usage and latency.
    # Quality=70 provides a good balance between clarity and file size.
    pil_image.save(buffered, format="JPEG", quality=70)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

# Compiled once when the cell runs. Every alternative is anchored on a word
# boundary so that keywords embedded in ordinary words ("paragraph",
# "geography", "mapping", "configure 1", "vegetable 3") do not count as hits.
_VISUAL_KEYWORD_PATTERN = re.compile(
    r"\b(?:"
    r"Fig(?:ure|\.)\s*\d"      # 'Figure 1', 'Fig. 2', 'Fig.2'
    r"|Tab(?:le|\.)\s*\d"      # 'Table 1', 'Tab. 3'
    r"|Eq\.\s*\d"              # 'Eq. 4'
    r"|Equations?\b"           # 'Equation', 'Equations'
    r"|(?:Graph|Chart|Diagram|Plot|Schematic|Map)s?\b"  # standalone nouns, singular or plural
    r")",
    re.IGNORECASE,
)


def has_visual_keywords(text):
    """
    Scans the extracted text for specific keywords that indicate the presence of
    visual content such as figures, equations, or diagrams.

    Matching is case-insensitive and restricted to whole words: numbered
    references ('Figure 1', 'Fig. 2', 'Table 3', 'Tab. 4', 'Eq. 5', with optional
    whitespace before the number) and the standalone words Equation, Graph,
    Chart, Diagram, Plot, Schematic and Map (singular or plural).

    Args:
        text (str): The raw text extracted from a PDF page. May be None or empty.

    Returns:
        bool: True if visual keywords are found, False otherwise (including
        when text is None or empty).
    """
    if not text:
        return False

    return _VISUAL_KEYWORD_PATTERN.search(text) is not None

def prepare_pdf_resources(pdf_path, start_page, end_page):
    """
    Prepares PDF resources by converting pages to images and initializing the text reader.

    The page range is validated before any work is done. Callers pair image
    index i with text page (start_page + i), so a start_page below 1 would turn
    into a negative index on the text reader and silently pair an image with
    the text of a different page.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): The starting page number (1-based index, must be >= 1).
        end_page (int): The ending page number (1-based index, must be >= start_page).

    Returns:
        tuple: (pdf_images_list, pdf_text_reader), or (None, None) when the page
        range is invalid or the PDF cannot be converted/opened. Failures are
        reported with a printed message rather than raised.
    """
    if start_page < 1 or end_page < start_page:
        print(
            f"Error preparing PDF resources: invalid page range {start_page}-{end_page} "
            "(pages are 1-based and end_page must be >= start_page)."
        )
        return None, None

    try:
        # Convert the specified range of pages into images.
        # Note: pdf2image uses 1-based indexing for 'first_page' and 'last_page'.
        images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page)

        # Initialize the standard PDF text reader.
        reader = PdfReader(pdf_path)

        return images, reader
    except Exception as e:
        # Best-effort: report the failure and let the caller abort gracefully.
        print(f"Error preparing PDF resources: {e}")
        return None, None

def construct_prompt_messages(mode, content_data):
    """
    Builds the chat message list sent to the OpenAI API for one page.

    Args:
        mode (str): 'vision' selects the image prompt; any other value selects
            the text-only prompt.
        content_data (dict): Page payload. The text prompt reads
            content_data['text']; the vision prompt reads content_data['image']
            (a Base64-encoded JPEG).

    Returns:
        list: Two message dictionaries (system, then user) formatted for the
        OpenAI chat-completions API.
    """
    # Text-only prompt: the page text is embedded between triple quotes.
    if mode != "vision":
        page_text = content_data['text']
        system_message = {
            "role": "system",
            "content": "You are a quiz generator based on text context.",
        }
        user_message = {
            "role": "user",
            "content": f"Context:\n\"\"\"{page_text}\"\"\"\n\nGenerate 1 multiple-choice question based on this text. Format:\nQuestion:\nOptions:\n[Correct Answer]:",
        }
        return [system_message, user_message]

    # Vision prompt: an instruction part followed by the page image as a data URL.
    encoded_page = content_data['image']
    instruction_part = {
        "type": "text",
        "text": "Based on this page image, generate 1 multiple-choice question. Priority: Focus on interpreting diagrams, charts, or equations if they exist. Format:\nQuestion:\nOptions:\n[Correct Answer]:",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_page}", "detail": "high"},
    }
    system_message = {
        "role": "system",
        "content": "You are a quiz generator. Analyze the provided page image (containing text, diagrams, or equations) and generate a high-quality multiple-choice question.",
    }
    user_message = {"role": "user", "content": [instruction_part, image_part]}
    return [system_message, user_message]

def call_llm_api(messages, model_name):
    """
    Sends one chat-completions request and returns the text of the first choice.

    Args:
        messages (list): The list of prompt messages.
        model_name (str): The specific model version to use (e.g., 'gpt-4o', 'gpt-4o-mini').

    Returns:
        str: The content generated by the LLM, or None if an error occurs
        (the error is printed, not raised).
    """
    # Fixed request settings: cap the reply at 500 tokens to bound usage, and
    # sample with temperature 0.7.
    # NOTE(review): confirm that every configured model accepts 'max_tokens' and
    # 'temperature'; check the API reference before enabling newer model families.
    request_options = {
        "model": model_name,
        "messages": messages,
        "max_tokens": 500,
        "temperature": 0.7,
    }

    try:
        completion = client.chat.completions.create(**request_options)
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        print(f"API Call Error (Model: {model_name}): {err}")
        return None

In [5]:
# --- Main Controller Function ---

def generate_quiz_hybrid(pdf_path, start_page=1, end_page=1, use_latest_model=False):
    """
    Main controller function for Hybrid (Text/Vision) Quiz Generation with Manual Override.

    This function iterates through the specified pages of a PDF, decides whether to use
    Vision (Image) analysis or Text analysis, and selects the appropriate LLM model to
    optimize for both cost and performance.

    Routing per page:
      * Vision mode when the extracted text contains visual keywords, or when the
        page yields no extractable text at all (e.g. a scanned page) - a text
        prompt with an empty context would only produce an invented question.
      * Text mode otherwise, always with the cost-effective text model.

    Args:
        pdf_path (str): Path to the target PDF file.
        start_page (int): Page number to start processing (1-based).
        end_page (int): Page number to stop processing (1-based).
        use_latest_model (bool): If True, overrides the standard vision model with the
                                 LATEST_MODEL_VERSION (SOTA) for pages handled in vision mode.

    Returns:
        list: A list of generated quiz question strings, one per successfully
        processed page. Pages whose text extraction or API call fails are
        skipped; an empty list is returned when the PDF cannot be loaded.
    """
    print(f"Starting Quiz Generation for {pdf_path} (Pages {start_page}-{end_page})...")

    # 1. Prepare Resources (Images and Text Reader)
    pdf_images, pdf_reader = prepare_pdf_resources(pdf_path, start_page, end_page)

    # An empty image list or a missing reader both mean there is nothing to process.
    if not pdf_images or pdf_reader is None:
        print("Failed to load PDF resources. Aborting operation.")
        return []

    all_questions = []

    # 2. Iterate through each page in the specified range
    # 'pdf_images' is a list where index 0 corresponds to 'start_page'.
    for i, image in enumerate(pdf_images):
        current_page_num = start_page + i

        try:
            # Extract text from the page. pypdf uses 0-based indexing.
            raw_text = pdf_reader.pages[current_page_num - 1].extract_text()
        except Exception as e:
            print(f"Warning: Skipping page {current_page_num} due to text extraction error: {e}")
            continue

        print(f"\n--- Processing Page {current_page_num} ---")

        # 3. Decision Logic: Determine Mode (Vision vs. Text) and Select Model
        has_text = bool(raw_text and raw_text.strip())

        if has_visual_keywords(raw_text) or not has_text:
            if has_text:
                print(">> [Mode: Vision] Visual keywords detected.")
            else:
                # Nothing to build a text prompt from; let the model read the page image.
                print(">> [Mode: Vision] No extractable text found; falling back to the page image.")

            # [Manual Toggle Check] Did the user explicitly request the SOTA model?
            if use_latest_model:
                selected_model = LATEST_MODEL_VERSION
                print(f"   -> [Override] Using LATEST Model ({selected_model}) as requested.")
            else:
                selected_model = VISION_MODEL_VERSION
                print(f"   -> Using Standard Vision Model ({selected_model}).")

            mode = "vision"
        else:
            # For pages with only text, we stick to the cost-effective model to avoid unnecessary expense.
            # (Note: You could also add logic here to use LATEST model for text if needed)
            print(">> [Mode: Text] Text-only content detected.")
            print(f"   -> Using Cost-Effective Model ({TEXT_MODEL_VERSION}).")
            mode = "text"
            selected_model = TEXT_MODEL_VERSION

        # 4. Prepare Data Payload and Prompt
        content_data = {'text': raw_text}
        if mode == "vision":
            # Only encode image if we are in vision mode to save processing time.
            content_data['image'] = encode_image_to_base64(image)

        messages = construct_prompt_messages(mode, content_data)

        # 5. Execute API Call with the Selected Model
        # We pass the 'selected_model' determined in step 3.
        generated_question = call_llm_api(messages, model_name=selected_model)

        if generated_question:
            print(f"[Generated Question Preview]: {generated_question[:100]}...")
            all_questions.append(generated_question)
        else:
            print("Failed to generate question for this page.")

    return all_questions

In [6]:
# --- Execution Example ---
# Runs the generator twice on the same PDF - once with the standard models and
# once with the latest-model override - then prints every generated question.

# Define the path to your PDF file (Adjust path as needed)
TARGET_PDF = "/kaggle/input/qa-test-pdf/2_.pdf" # Example path for Kaggle
# TARGET_PDF = "your_document.pdf" 

if os.path.exists(TARGET_PDF):

    # Case A: Standard execution (Cost-effective + Standard Vision)
    print("=== Running Standard Mode ===")
    # Process page 3 only
    pg_st_eff = 3
    pg_en_eff = 3
    questions_standard = generate_quiz_hybrid(TARGET_PDF, start_page=pg_st_eff, end_page=pg_en_eff, use_latest_model=False)


    # Case B: High-performance execution (Forces LATEST model for visual pages)
    # Use this for critical sections containing complex diagrams or when standard vision fails.
    # The override only takes effect on pages routed to vision mode.
    print("\n=== Running High-Performance Mode ===")
    # Process page 3 only (the same page as Case A).
    # NOTE(review): this comment previously said "page 4" while the code uses
    # page 3 - confirm which page is intended.
    pg_st_maxP = 3
    pg_en_maxP = 3
    questions_latest = generate_quiz_hybrid(TARGET_PDF, start_page=pg_st_maxP, end_page=pg_en_maxP, use_latest_model=True)

    # Print Results: standard-mode questions first, then latest-model questions,
    # each followed by a separator line.
    print("\n=== Final Results ===")
    for q in questions_standard + questions_latest:
        print(q)
        print("-"*40)
else:
    # Nothing is generated when the PDF is missing; only this hint is printed.
    print(f"File not found: {TARGET_PDF}. Please upload a PDF to test.")

=== Running Standard Mode ===
Starting Quiz Generation for /kaggle/input/qa-test-pdf/2_.pdf (Pages 3-3)...

--- Processing Page 3 ---
>> [Mode: Text] Text-only content detected.
   -> Using Cost-Effective Model (gpt-4o-mini).
[Generated Question Preview]: Question: What is the potential consequence of using materials from 메타코드 without permission for comm...

=== Running High-Performance Mode ===
Starting Quiz Generation for /kaggle/input/qa-test-pdf/2_.pdf (Pages 3-3)...

--- Processing Page 3 ---
>> [Mode: Text] Text-only content detected.
   -> Using Cost-Effective Model (gpt-4o-mini).
[Generated Question Preview]: Question: What is the main responsibility of the materials mentioned in the text regarding quiz ques...

=== Final Results ===
Question: What is the potential consequence of using materials from 메타코드 without permission for commercial purposes?  
Options:  
A) No consequences  
C) Legal action may be taken  
[Correct Answer]: C) Legal action may be taken
-------------------