<a href="https://colab.research.google.com/github/armelida/MELIDA/blob/main/src/questions-scraper/MELIDA_PDF_Scraper_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install pdfplumber pandas --quiet

In [2]:
# Install gdown for Google Drive access
!pip install gdown --quiet

# Configure paths
import os

# Use a local directory structure in Colab instead of MyDrive
BASE_PATH = "/content/MELIDA"  # Changed from MyDrive path
PDF_DIR = os.path.join(BASE_PATH, "data/raw/exams")
OUTPUT_DIR = os.path.join(BASE_PATH, "data/questions")

# Create directories if they don't exist
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"PDF Directory: {PDF_DIR}")
print(f"Output Directory: {OUTPUT_DIR}")

# Download files from the shared Google Drive folder
folder_id = "1QXwB1AXaV8TlgqoN41PIdwn1VSDJI1zg"
print(f"Downloading PDFs from Google Drive folder ID: {folder_id}")

# List and download files from the shared folder
!gdown --folder --id {folder_id} -O {PDF_DIR}

# List downloaded PDFs
pdf_files = [f for f in os.listdir(PDF_DIR) if f.lower().endswith('.pdf')]
print(f"\nFound {len(pdf_files)} PDF files in {PDF_DIR}:")
for i, file in enumerate(pdf_files, 1):
    print(f"{i}. {file}")

PDF Directory: /content/MELIDA/data/raw/exams
Output Directory: /content/MELIDA/data/questions
Downloading PDFs from Google Drive folder ID: 1QXwB1AXaV8TlgqoN41PIdwn1VSDJI1zg
Retrieving folder contents
Processing file 12-eLFeor5K8RGn8jcIstT-ZvSBLG29SA Cuaderno_2024_MEDICINA_0_C.pdf
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=12-eLFeor5K8RGn8jcIstT-ZvSBLG29SA
To: /content/MELIDA/data/raw/exams/Cuaderno_2024_MEDICINA_0_C.pdf
100% 797k/797k [00:00<00:00, 41.9MB/s]
Download completed

Found 1 PDF files in /content/MELIDA/data/raw/exams:
1. Cuaderno_2024_MEDICINA_0_C.pdf


In [3]:
import re
import json
import os
import pdfplumber
import pandas as pd
from typing import Dict, List, Tuple, Optional

def normalize_spaces(text: str) -> str:
    """
    Remove double spaces and other whitespace issues from text
    """
    if not text:
        return ""
    # Replace all whitespace sequences (including tabs, newlines) with a single space
    normalized = re.sub(r'\s+', ' ', text)
    # Trim leading/trailing whitespace
    normalized = normalized.strip()
    return normalized

def get_exam_metadata(filename: str) -> Tuple[str, str]:
    """
    Extracts exam type and year from filename
    Example: 'Cuaderno_2024_MEDICINA_0_C.pdf' -> ('MIR', '2024')
    Adjust patterns as needed for different exam types
    """
    # Default values
    exam_type = "MIR"
    year = "UNKNOWN"

    # Extract year
    year_match = re.search(r'(\d{4})', filename)
    if year_match:
        year = year_match.group(1)

    # Extract exam type if present (customize based on your naming conventions)
    if "MEDICINA" in filename.upper():
        exam_type = "MIR"
    elif "ENFERMERIA" in filename.upper():
        exam_type = "EIR"
    # Add more exam types as needed

    return exam_type, year


def format_question_id(exam_type: str, year: str, version: str = "v01",
                       question_type: str = "t01", question_num: int = 0) -> str:
    """
    Creates standardized question ID
    Format: {exam_type}-{year}-{version}-{question_type}-Q{question_num:03d}
    Example: MIR-2024-v01-t01-Q026
    """
    return f"{exam_type}-{year}-{version}-{question_type}-Q{question_num:03d}"


def create_formatted_question(exam_type: str, year: str, qnum: int,
                             qtext: str, options: List[str],
                             source_file: str, page_num: int) -> Dict:
    """
    Creates a question dict with the required format with improved cleaning
    """
    # Clean the question text thoroughly
    text = normalize_spaces(qtext)
    text = re.sub(r'(\w)-\s+(\w)', r'\1\2', text)  # Merge hyphen-split
    text = re.sub(r'\[(.*?)\]', r'\1', text)       # Remove bracket formatting

    # Format options as required dictionary with A, B, C, D keys
    option_dict = {}
    option_keys = ["A", "B", "C", "D"]

    for i, opt in enumerate(options):
        if i < len(option_keys):
            # Clean each option text
            opt = normalize_spaces(opt)
            opt = re.sub(r'(\w)-\s+(\w)', r'\1\2', opt)  # Merge hyphen-split
            option_dict[option_keys[i]] = opt

    # Ensure all options exist even if empty
    for key in option_keys:
        if key not in option_dict:
            option_dict[key] = ""

    # Create the question object in the required format
    return {
        "id": format_question_id(exam_type, year, question_num=qnum),
        "question_text": text,
        "options": option_dict,
        # Metadata fields (not in final output but useful for debugging)
        "_metadata": {
            "source_file": source_file,
            "page_number": page_num,
            "original_number": qnum
        }
    }


def extract_lines_with_x0(page, bbox):
    """
    Crops to bbox, extracts text lines as (text, x0).
    Merges lines ending in '-'.
    """
    lines = []
    hyphen_buffer = ""

    cropped = page.crop(bbox)
    raw_lines = cropped.extract_text_lines()

    for ln in raw_lines:
        text = ln["text"].strip()
        x0_val = ln["x0"]

        if hyphen_buffer:
            text = hyphen_buffer + text
            hyphen_buffer = ""

        if re.search(r'-\s*$', text):
            hyphen_buffer = re.sub(r'-\s*$', '', text)
            continue

        lines.append((text, x0_val))
    return lines


def split_markers(line_text):
    """
    Splits on every '(\\d+)\\. ' pattern.
    Returns [(num_str_or_None, snippet)].
    """
    pattern = re.compile(r'(\d+)\.\s')
    tokens = pattern.split(line_text)

    results = []
    leftover = tokens[0].strip()
    if leftover:
        results.append((None, leftover))

    i = 1
    while i < len(tokens):
        num_str = tokens[i]
        i += 1
        snippet = tokens[i].strip() if i < len(tokens) else ""
        i += 1
        results.append((num_str, snippet))
    return results


def parse_lines(
    lines, page_idx, exam_type, year, source_file,
    questions,
    current_qnum, current_qtext, current_opts,
    expected_next_qnum=None
):
    """
    Enhanced state machine logic to extract questions and options
    With space normalization throughout
    """
    MAX_QNUM = 210
    parsing_warnings = []

    # Initialize expected next question number if not provided
    if expected_next_qnum is None and current_qnum is not None:
        expected_next_qnum = current_qnum + 1
    elif expected_next_qnum is None:
        expected_next_qnum = 1

    for (full_line, _x0) in lines:
        if not full_line.strip():
            continue

        segments = split_markers(full_line)

        for (num_str, snippet) in segments:
            # Normalize spaces in the snippet
            snippet = normalize_spaces(snippet)

            if not snippet:  # Skip empty snippets
                continue

            if num_str is not None:
                val = int(num_str)

                # CASE A: Potential option if 1..4
                if 1 <= val <= 4:
                    # If we have a current question
                    if current_qnum is not None:
                        # Check if this is likely a valid next option
                        exists_already = any(opt.startswith(f"{val}. ") for opt in current_opts)
                        is_next_option = (len(current_opts) == 0 and val == 1) or \
                                         (len(current_opts) > 0 and val <= len(current_opts) + 1)

                        if len(current_opts) < 4 and not exists_already and is_next_option:
                            # Before adding first option, ensure question ends properly
                            if len(current_opts) == 0 and not (current_qtext.endswith(':') or current_qtext.endswith('?')):
                                # Look for : or ? in the snippet
                                colon_pos = snippet.find(':')
                                qmark_pos = snippet.find('?')

                                if colon_pos >= 0:
                                    # Split at colon - move text before colon to question
                                    current_qtext = normalize_spaces(current_qtext + " " + snippet[:colon_pos+1])
                                    snippet = normalize_spaces(snippet[colon_pos+1:])
                                elif qmark_pos >= 0:
                                    # Split at question mark - move text before question mark to question
                                    current_qtext = normalize_spaces(current_qtext + " " + snippet[:qmark_pos+1])
                                    snippet = normalize_spaces(snippet[qmark_pos+1:])
                                else:
                                    # No ending found, add a colon
                                    current_qtext = normalize_spaces(current_qtext)
                                    if current_qtext and not (current_qtext.endswith(':') or current_qtext.endswith('?')):
                                        current_qtext = current_qtext.rstrip() + ":"
                                    warning = f"Question {current_qnum} has no proper ending before options, added colon"
                                    parsing_warnings.append(warning)
                                    print(f"WARNING: {warning} (page {page_idx+1})")

                            # Add the option with number prefix
                            current_opts.append(f"{val}. {snippet}")
                        # Check if this is likely a new question starting with option 1
                        elif val == 1 and len(current_opts) > 0:
                            # Finalize the current question

                            # Process options for consistency
                            normalized_opts = []
                            collected_nums = set()

                            for opt in current_opts:
                                opt = normalize_spaces(opt)
                                opt_match = re.match(r'(\d+)\.\s+(.*)', opt)
                                if opt_match:
                                    opt_num = int(opt_match.group(1))
                                    collected_nums.add(opt_num)
                                    normalized_opts.append(f"{opt_num}. {normalize_spaces(opt_match.group(2))}")
                                else:
                                    # No number prefix, try to assign
                                    for n in range(1, 5):
                                        if n not in collected_nums:
                                            collected_nums.add(n)
                                            normalized_opts.append(f"{n}. {normalize_spaces(opt)}")
                                            break

                            # Fill in any missing options
                            current_opts = normalized_opts
                            for n in range(1, 5):
                                if n not in collected_nums:
                                    current_opts.append(f"{n}. ")

                            # Sort by option number
                            current_opts.sort(key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else 0)

                            # Check for missing options
                            if len(current_opts) < 4:
                                warning = f"Incomplete options: Question {current_qnum} has only {len(current_opts)} options (expected 4), filled gaps"
                                parsing_warnings.append(warning)
                                print(f"WARNING: {warning} (page {page_idx+1})")

                            # Ensure question ends properly
                            current_qtext = normalize_spaces(current_qtext)
                            if not (current_qtext.endswith(':') or current_qtext.endswith('?')):
                                current_qtext = current_qtext.rstrip() + ":"
                                warning = f"Question {current_qnum} has no proper ending, added colon"
                                parsing_warnings.append(warning)
                                print(f"WARNING: {warning} (page {page_idx+1})")

                            # Extract final options without numbering
                            final_opts = []
                            for opt in current_opts[:4]:  # Ensure exactly 4 options
                                opt_match = re.match(r'\d+\.\s+(.*)', opt)
                                if opt_match:
                                    final_opts.append(normalize_spaces(opt_match.group(1)))
                                else:
                                    final_opts.append(normalize_spaces(opt))

                            # Finalize the question
                            questions.append(create_formatted_question(
                                exam_type,
                                year,
                                current_qnum,
                                current_qtext,
                                final_opts,
                                source_file,
                                page_idx + 1
                            ))

                            # Start a new question - guess number based on expected sequence
                            if expected_next_qnum > current_qnum + 1:
                                # If we've seen a gap already, keep the expected number
                                current_qnum = expected_next_qnum
                            else:
                                # Otherwise, assume it's the next in sequence
                                current_qnum = current_qnum + 1

                            current_qtext = normalize_spaces(snippet)
                            current_opts = []
                            expected_next_qnum = current_qnum + 1
                        else:
                            # This is likely not an option but part of the text
                            if current_opts:
                                current_opts[-1] += f" {val}. {snippet}"
                            else:
                                current_qtext = normalize_spaces(current_qtext + f" {val}. {snippet}")
                    else:
                        # No active question, but found option - treat as new question
                        current_qnum = expected_next_qnum
                        current_qtext = normalize_spaces(snippet)
                        current_opts = []
                        expected_next_qnum = current_qnum + 1

                # CASE B: potential question number (5..210)
                elif 5 <= val <= MAX_QNUM:
                    # This is definitely a new question number

                    # Finalize old question if any
                    if current_qnum is not None:
                        # Process options for consistency
                        normalized_opts = []
                        collected_nums = set()

                        for opt in current_opts:
                            opt = normalize_spaces(opt)
                            opt_match = re.match(r'(\d+)\.\s+(.*)', opt)
                            if opt_match:
                                opt_num = int(opt_match.group(1))
                                collected_nums.add(opt_num)
                                normalized_opts.append(f"{opt_num}. {normalize_spaces(opt_match.group(2))}")
                            else:
                                # No number prefix, try to assign
                                for n in range(1, 5):
                                    if n not in collected_nums:
                                        collected_nums.add(n)
                                        normalized_opts.append(f"{n}. {normalize_spaces(opt)}")
                                        break

                        # Fill in any missing options
                        current_opts = normalized_opts
                        for n in range(1, 5):
                            if n not in collected_nums:
                                current_opts.append(f"{n}. ")

                        # Sort by option number
                        current_opts.sort(key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else 0)

                        # Check for missing options
                        if len(current_opts) < 4:
                            warning = f"Incomplete options: Question {current_qnum} has only {len(current_opts)} options (expected 4), filled gaps"
                            parsing_warnings.append(warning)
                            print(f"WARNING: {warning} (page {page_idx+1})")

                        # Ensure question ends properly
                        current_qtext = normalize_spaces(current_qtext)
                        if not (current_qtext.endswith(':') or current_qtext.endswith('?')):
                            current_qtext = current_qtext.rstrip() + ":"
                            warning = f"Question {current_qnum} has no proper ending, added colon"
                            parsing_warnings.append(warning)
                            print(f"WARNING: {warning} (page {page_idx+1})")

                        # Extract final options without numbering
                        final_opts = []
                        for opt in current_opts[:4]:  # Ensure exactly 4 options
                            opt_match = re.match(r'\d+\.\s+(.*)', opt)
                            if opt_match:
                                final_opts.append(normalize_spaces(opt_match.group(1)))
                            else:
                                final_opts.append(normalize_spaces(opt))

                        # Finalize the question
                        questions.append(create_formatted_question(
                            exam_type,
                            year,
                            current_qnum,
                            current_qtext,
                            final_opts,
                            source_file,
                            page_idx + 1
                        ))

                    # Check for expected sequence
                    if val != expected_next_qnum:
                        # Log the gap or out-of-sequence number
                        if val > expected_next_qnum:
                            warning = f"Question number gap: Expected {expected_next_qnum}, found {val} (gap of {val - expected_next_qnum})"
                        else:
                            warning = f"Out of order question: Expected {expected_next_qnum}, found {val} (backwards by {expected_next_qnum - val})"

                        parsing_warnings.append(warning)
                        print(f"WARNING: {warning} (page {page_idx+1})")

                    # Start new question
                    current_qnum = val
                    current_qtext = normalize_spaces(snippet)
                    current_opts = []
                    expected_next_qnum = val + 1  # Update expected next number
                else:
                    # Number out of range => just leftover text
                    if current_opts:
                        current_opts[-1] = normalize_spaces(current_opts[-1] + f" {val}. {snippet}")
                    elif current_qnum is not None:
                        current_qtext = normalize_spaces(current_qtext + f" {val}. {snippet}")

            else:
                # leftover text without a number prefix
                if current_qnum is not None:
                    # If we don't have a properly ended question yet and there are no options
                    if not (current_qtext.endswith(':') or current_qtext.endswith('?')) and not current_opts:
                        # Look for ":" or "?" in the snippet
                        colon_pos = snippet.find(':')
                        qmark_pos = snippet.find('?')

                        if colon_pos >= 0:
                            # Split the snippet at the colon
                            current_qtext = normalize_spaces(current_qtext + " " + snippet[:colon_pos+1])
                            remaining = normalize_spaces(snippet[colon_pos+1:])
                            if remaining:
                                # Start first option with remaining text
                                current_opts.append(f"1. {remaining}")
                            continue
                        elif qmark_pos >= 0:
                            # Split the snippet at the question mark
                            current_qtext = normalize_spaces(current_qtext + " " + snippet[:qmark_pos+1])
                            remaining = normalize_spaces(snippet[qmark_pos+1:])
                            if remaining:
                                # Start first option with remaining text
                                current_opts.append(f"1. {remaining}")
                            continue

                    # Normal case - append to appropriate place
                    if current_opts:
                        current_opts[-1] = normalize_spaces(current_opts[-1] + " " + snippet)
                    else:
                        current_qtext = normalize_spaces(current_qtext + " " + snippet)
                # else we have no active question => ignore

    return current_qnum, current_qtext, current_opts, expected_next_qnum, parsing_warnings


def process_pdf(pdf_path: str, start_page: int = 2) -> tuple:
    """
    Process PDF and extract questions with options
    Returns both questions and warnings about extraction issues
    """
    questions = []
    warnings = []
    current_qnum = None
    current_qtext = ""
    current_opts = []
    expected_next_qnum = 1  # Start expecting question #1
    MAX_QNUM = 210  # Define MAX_QNUM within this function

    # Extract metadata from filename
    filename = os.path.basename(pdf_path)
    exam_type, year = get_exam_metadata(filename)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            source_file = os.path.basename(pdf_path)
            num_pages = len(pdf.pages)

            for i in range(start_page, num_pages):
                page = pdf.pages[i]
                w, h = page.width, page.height
                mid_x = w / 2.0

                # Split page into left and right columns
                left_bbox = (0, 0, mid_x, h)
                right_bbox = (mid_x, 0, w, h)

                left_lines = extract_lines_with_x0(page, left_bbox)
                right_lines = extract_lines_with_x0(page, right_bbox)

                # Parse left column
                current_qnum, current_qtext, current_opts, expected_next_qnum, page_warnings = parse_lines(
                    left_lines, i, exam_type, year, source_file,
                    questions,
                    current_qnum, current_qtext, current_opts, expected_next_qnum
                )
                warnings.extend(page_warnings)

                # Parse right column
                current_qnum, current_qtext, current_opts, expected_next_qnum, page_warnings = parse_lines(
                    right_lines, i, exam_type, year, source_file,
                    questions,
                    current_qnum, current_qtext, current_opts, expected_next_qnum
                )
                warnings.extend(page_warnings)

        # Add the last question if not already added
        if current_qnum is not None:
            # Normalize the question text
            current_qtext = normalize_spaces(current_qtext)

            # Check for incomplete options
            if len(current_opts) < 4:
                warning = f"Incomplete options: Final question {current_qnum} has only {len(current_opts)} options (expected 4)"
                warnings.append(warning)
                print(f"WARNING: {warning}")

                # Fill in missing options
                normalized_opts = []
                collected_nums = set()

                for opt in current_opts:
                    # Normalize the option text
                    opt = normalize_spaces(opt)
                    opt_match = re.match(r'(\d+)\.\s+(.*)', opt)
                    if opt_match:
                        opt_num = int(opt_match.group(1))
                        collected_nums.add(opt_num)
                        normalized_opts.append(f"{opt_num}. {normalize_spaces(opt_match.group(2))}")
                    else:
                        # No number prefix, try to assign
                        for n in range(1, 5):
                            if n not in collected_nums:
                                collected_nums.add(n)
                                normalized_opts.append(f"{n}. {opt}")
                                break

                # Fill in any missing options
                current_opts = normalized_opts
                for n in range(1, 5):
                    if n not in collected_nums:
                        current_opts.append(f"{n}. ")

                # Sort by option number
                current_opts.sort(key=lambda x: int(x.split('.')[0]) if x.split('.')[0].isdigit() else 0)

            # Ensure question ends properly before finalizing
            if not (current_qtext.endswith(':') or current_qtext.endswith('?')):
                current_qtext = current_qtext.rstrip() + ":"
                warning = f"Added missing ending to final question {current_qnum}"
                warnings.append(warning)

            # Extract final options without numbering and normalize spaces
            final_opts = []
            for opt in current_opts[:4]:  # Ensure exactly 4 options
                opt_match = re.match(r'\d+\.\s+(.*)', opt)
                if opt_match:
                    final_opts.append(normalize_spaces(opt_match.group(1)))
                else:
                    final_opts.append(normalize_spaces(opt))

            # Now finalize the question with processed options
            questions.append(create_formatted_question(
                exam_type,
                year,
                current_qnum,
                current_qtext,
                final_opts,
                source_file,
                i + 1
            ))

        # Post-processing to deduplicate questions
        if questions:
            # Sort questions by number
            questions.sort(key=lambda q: q.get('_metadata', {}).get('original_number', 0))

            # Deduplicate based on question number
            deduped_questions = []
            seen_numbers = set()

            for q in questions:
                qnum = q.get('_metadata', {}).get('original_number', 0)

                if qnum not in seen_numbers:
                    seen_numbers.add(qnum)
                    deduped_questions.append(q)
                else:
                    # Duplicate found - check which one seems more complete
                    existing_idx = next(i for i, existing_q in enumerate(deduped_questions)
                                       if existing_q.get('_metadata', {}).get('original_number', 0) == qnum)

                    # Check which has more complete options
                    existing_q = deduped_questions[existing_idx]

                    existing_options = existing_q['options']
                    new_options = q['options']

                    existing_emptiness = sum(1 for opt in existing_options.values() if not opt.strip())
                    new_emptiness = sum(1 for opt in new_options.values() if not opt.strip())

                    if new_emptiness < existing_emptiness:
                        # Replace with more complete version
                        deduped_questions[existing_idx] = q
                        warning = f"Replaced duplicate question {qnum} with more complete version"
                    else:
                        warning = f"Kept first occurrence of duplicate question {qnum}"

                    warnings.append(warning)

            # Update the questions list
            questions = deduped_questions
            print(f"After deduplication: {len(questions)} questions")

            # Final verification of sequential numbering
            question_numbers = [q.get('_metadata', {}).get('original_number', 0) for q in questions]
            question_numbers.sort()

            if question_numbers:
                print(f"Extracted question numbers: {min(question_numbers)} to {max(question_numbers)}")

                # Check for gaps
                expected_numbers = set(range(min(question_numbers), max(question_numbers) + 1))
                found_numbers = set(question_numbers)
                missing_numbers = expected_numbers - found_numbers

                if missing_numbers:
                    warning = f"Missing {len(missing_numbers)} questions in the sequence"
                    warnings.append(warning)
                    print(f"WARNING: {warning}: {sorted(missing_numbers)[:10]}...")

        # Final summary
        if warnings:
            print(f"\nExtraction completed with {len(warnings)} warnings")
            print(f"Successfully extracted {len(questions)} questions")
        else:
            print(f"\nExtraction completed successfully with no warnings")
            print(f"Successfully extracted {len(questions)} questions")

    except Exception as e:
        error_msg = f"Error processing {os.path.basename(pdf_path)}: {str(e)}"
        print(error_msg)
        warnings.append(error_msg)
        # Make sure to initialize questions if an exception occurs
        questions = []

    # Return both the questions and the warnings
    return questions, warnings


def clean_output_for_export(questions: List[Dict]) -> List[Dict]:
    """
    Prepare questions for export by removing metadata fields
    """
    cleaned = []
    for q in questions:
        # Create a copy without the metadata
        cleaned_q = {k: v for k, v in q.items() if not k.startswith('_')}
        cleaned.append(cleaned_q)
    return cleaned


def export_questions(questions: List[Dict], output_dir: str,
                    exam_type: str = "MIR", year: str = "2024",
                    version: str = "v01", question_type: str = "t01") -> Tuple[str, str]:
    """
    Export questions to CSV and JSON files with standardized naming
    """
    if not questions:
        return None, None

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Format the filename according to requirements
    filename_base = f"{exam_type}-{year}-{version}-{question_type}"

    # Clean the questions for export (remove metadata)
    export_questions = clean_output_for_export(questions)

    # Final space normalization pass before export
    for q in export_questions:
        q['question_text'] = normalize_spaces(q['question_text'])
        for key, value in q['options'].items():
            q['options'][key] = normalize_spaces(value)

    # Export to JSON
    json_path = os.path.join(output_dir, f"{filename_base}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(export_questions, f, ensure_ascii=False, indent=2)

    # Export to CSV (flattening the options dictionary)
    df = pd.DataFrame(questions)

    # Extract options from the nested dictionary for CSV format
    if not df.empty and 'options' in df.columns:
        for key in ['A', 'B', 'C', 'D']:
            df[f'option_{key}'] = df['options'].apply(lambda x: x.get(key, ''))
        df = df.drop(columns=['options'])

    # Remove metadata columns
    if '_metadata' in df.columns:
        metadata_df = pd.json_normalize(df['_metadata'])
        df = pd.concat([df.drop(columns=['_metadata']), metadata_df], axis=1)

    # Export to CSV
    csv_path = os.path.join(output_dir, f"{filename_base}.csv")
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')

    return json_path, csv_path


def process_directory(pdf_dir: str, output_dir: str, start_page: int = 2) -> None:
    """
    Process all PDFs in a directory and export questions
    Includes detailed warnings about extraction issues
    """
    all_questions = []
    all_warnings = []

    # Process all PDFs
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_dir, filename)
        exam_type, year = get_exam_metadata(filename)

        try:
            questions, warnings = process_pdf(pdf_path, start_page=start_page)
            all_questions.extend(questions)

            # Record detailed warnings with file information
            file_warnings = [{"file": filename, "warning": w} for w in warnings]
            all_warnings.extend(file_warnings)

            print(f"Extracted {len(questions)} questions from {filename} ({exam_type} {year}) with {len(warnings)} warnings")
        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            print(error_msg)
            all_warnings.append({"file": filename, "warning": error_msg})

    # Group questions by exam type and year
    grouped_questions = {}
    for q in all_questions:
        # Extract exam type and year from question ID
        id_parts = q['id'].split('-')
        if len(id_parts) >= 2:
            exam_type, year = id_parts[0], id_parts[1]
            key = f"{exam_type}-{year}"

            if key not in grouped_questions:
                grouped_questions[key] = []

            grouped_questions[key].append(q)

    # Export each group separately
    for key, questions in grouped_questions.items():
        exam_type, year = key.split('-')

        # Export with standard naming
        json_path, csv_path = export_questions(
            questions,
            output_dir,
            exam_type=exam_type,
            year=year
        )

        if json_path:
            print(f"Exported {len(questions)} {exam_type} {year} questions to:")
            print(f"  - JSON: {os.path.basename(json_path)}")
            print(f"  - CSV: {os.path.basename(csv_path)}")

    # Export warnings to CSV
    if all_warnings:
        warnings_df = pd.DataFrame(all_warnings)
        warnings_path = os.path.join(output_dir, "extraction_warnings.csv")
        warnings_df.to_csv(warnings_path, index=False)
        print(f"\nExported {len(all_warnings)} warnings to {warnings_path}")

In [4]:
# Option to upload PDFs directly to Colab (if not already in Drive)
from google.colab import files
import shutil

def upload_pdfs_to_drive():
    uploaded = files.upload()

    for filename, content in uploaded.items():
        if filename.lower().endswith('.pdf'):
            dest_path = os.path.join(PDF_DIR, filename)
            with open(dest_path, 'wb') as f:
                f.write(content)
            print(f"Saved {filename} to {dest_path}")

# Uncomment the line below to upload PDF files
# upload_pdfs_to_drive()

In [5]:
# List PDF files available for processing
pdf_files = [f for f in os.listdir(PDF_DIR) if f.lower().endswith('.pdf')]
print(f"Found {len(pdf_files)} PDF files in {PDF_DIR}:")
for i, file in enumerate(pdf_files, 1):
    print(f"{i}. {file}")

Found 1 PDF files in /content/MELIDA/data/raw/exams:
1. Cuaderno_2024_MEDICINA_0_C.pdf


In [6]:
# Process all PDFs in the directory
process_directory(PDF_DIR, OUTPUT_DIR, start_page=2)

After deduplication: 209 questions
Extracted question numbers: 1 to 210

Successfully extracted 209 questions
Exported 209 MIR 2024 questions to:
  - JSON: MIR-2024-v01-t01.json
  - CSV: MIR-2024-v01-t01.csv



In [7]:
# VERIFICATION CELL: Check for questions without proper endings before export
def verify_questions_in_json_files(output_dir):
    all_issues = []
    total_questions = 0

    # Process each JSON file in the output directory
    for json_file in [f for f in os.listdir(output_dir) if f.lower().endswith('.json')]:
        file_path = os.path.join(output_dir, json_file)
        file_issues = []

        try:
            # Load the questions from the JSON file
            with open(file_path, 'r', encoding='utf-8') as f:
                questions = json.load(f)

            total_questions += len(questions)

            # Check each question
            for i, q in enumerate(questions):
                question_text = q.get('question_text', '').strip()

                if not (question_text.endswith(':') or question_text.endswith('?')):
                    file_issues.append({
                        'index': i,
                        'id': q.get('id', 'unknown'),
                        'text': question_text,
                        'file': json_file,
                        'source_file': q.get('_metadata', {}).get('source_file', 'unknown') if '_metadata' in q else 'unknown',
                        'page_number': q.get('_metadata', {}).get('page_number', 'unknown') if '_metadata' in q else 'unknown'
                    })

            all_issues.extend(file_issues)

        except Exception as e:
            print(f"Error processing {json_file}: {str(e)}")

    # Report results
    print(f"Total questions checked: {total_questions}")

    if all_issues:
        print(f"WARNING: Found {len(all_issues)} questions without proper endings (: or ?)")
        print("First 5 issues:")
        for i, issue in enumerate(all_issues[:5]):
            print(f"{i+1}. ID: {issue['id']}")
            print(f"   Text: {issue['text']}")
            print(f"   File: {issue['file']}")
            print(f"   Source: {issue['source_file']}, Page: {issue['page_number']}")

        # Export issues to CSV
        issues_df = pd.DataFrame(all_issues)
        issues_csv = os.path.join(output_dir, "question_ending_issues.csv")
        issues_df.to_csv(issues_csv, index=False)
        print(f"Exported all {len(all_issues)} issues to {issues_csv}")
    else:
        print("VERIFICATION PASSED: All questions end with ':' or '?'")

    return all_issues

# Verify questions after processing by reading the output JSON files
print("\nVerifying question formatting...")
question_issues = verify_questions_in_json_files(OUTPUT_DIR)


Verifying question formatting...
Total questions checked: 209
VERIFICATION PASSED: All questions end with ':' or '?'


In [8]:
# List processed output files
json_files = [f for f in os.listdir(OUTPUT_DIR) if f.lower().endswith('.json')]
print(f"Found {len(json_files)} processed JSON files in {OUTPUT_DIR}:")
for i, file in enumerate(json_files, 1):
    print(f"{i}. {file}")

Found 1 processed JSON files in /content/MELIDA/data/questions:
1. MIR-2024-v01-t01.json


In [9]:
# Preview a sample from the first JSON file
if json_files:
    sample_file = os.path.join(OUTPUT_DIR, json_files[0])
    with open(sample_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Sample file: {json_files[0]}")
    print(f"Total questions: {len(data)}")
    print("\nSample questions (first 2):")

    for i, q in enumerate(data[:2], 1):
        print(f"\nQuestion {i}:")
        print(f"ID: {q['id']}")
        print(f"Text: {q['question_text']}")
        print("Options:")
        for key, value in q['options'].items():
            print(f"  {key}: {value}")

Sample file: MIR-2024-v01-t01.json
Total questions: 209

Sample questions (first 2):

Question 1:
ID: MIR-2024-v01-t01-Q001
Text: Pregunta asociada a la imagen 1. Mujer de 42 años que acude a la consulta de genética por un diagnóstico reciente de cáncer de endometrio. En base a los antecedentes familiares que constan en la imagen, ¿cuál de los siguientes síndromes es más probable que presente?:
Options:
  A: Poliposis adenomatosa familiar.
  B: Síndrome de Lynch.
  C: Síndrome de cáncer de mama y ovario hereditario.
  D: Síndrome de Cowden. 2. Pregunta asociada a la imagen 2. Paciente de 65 años que acude a urgencias por disminución brusca de agudeza visual en ojo derecho. La retinografía de dicho ojo se muestra en la imagen. Uno de los siguientes tratamientos está indicado para una enfermedad que es un factor de riesgo para esta situación. Indique cuál:

Question 2:
ID: MIR-2024-v01-t01-Q002
Text: Latanoprost y timolol. 2. Flecainida. 3. Hidroxicloroquina. 4. Complejos vitamínicos y a

In [10]:
# Check for questions that don't end with ":" or "?"
def check_question_endings(json_files_dir):
    issues_found = []
    total_questions = 0

    # Process each JSON file
    for json_file in [f for f in os.listdir(json_files_dir) if f.lower().endswith('.json')]:
        file_path = os.path.join(json_files_dir, json_file)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                questions = json.load(f)

            # Check each question in the file
            for q in questions:
                total_questions += 1
                question_text = q.get('question_text', '').strip()

                if not (question_text.endswith(':') or question_text.endswith('?')):
                    issues_found.append({
                        'id': q.get('id', 'unknown'),
                        'file': json_file,
                        'text': question_text,
                        'last_char': question_text[-1] if question_text else 'empty'
                    })
        except Exception as e:
            print(f"Error processing {json_file}: {str(e)}")

    # Print summary
    print(f"Total questions analyzed: {total_questions}")

    # Calculate percentage with proper error handling
    percentage = (len(issues_found)/total_questions)*100 if total_questions > 0 else 0
    print(f"Questions without proper ending (: or ?): {len(issues_found)} ({percentage:.2f}%)")

    # Print details of problematic questions
    if issues_found:
        print("\nQuestions with improper endings:")
        for i, issue in enumerate(issues_found[:10], 1):  # Show first 10 issues
            print(f"{i}. ID: {issue['id']} (from {issue['file']})")
            print(f"   Text: {issue['text'][:100]}...")
            print(f"   Last character: '{issue['last_char']}'")

        if len(issues_found) > 10:
            print(f"\n... and {len(issues_found) - 10} more issues.")

    return issues_found

# Run the check on the output directory
print("Checking question formatting...")
issues = check_question_endings(OUTPUT_DIR)

# Optional: Export issues to CSV for further analysis
if issues and len(issues) > 0:
    issues_df = pd.DataFrame(issues)
    issues_csv = os.path.join(OUTPUT_DIR, "question_format_issues.csv")
    issues_df.to_csv(issues_csv, index=False)
    print(f"\nExported {len(issues)} issues to {issues_csv}")

Checking question formatting...
Total questions analyzed: 209
Questions without proper ending (: or ?): 0 (0.00%)


In [11]:
# Download a specific file
def download_file(file_path):
    try:
        files.download(file_path)
        print(f"Started download of {os.path.basename(file_path)}")
    except Exception as e:
        print(f"Error downloading file: {str(e)}")

# Example usage - uncomment to download a specific file
# if json_files:
#     download_file(os.path.join(OUTPUT_DIR, json_files[0]))

# Create a zip file with all processed files for easier download
import zipfile

def create_and_download_zip():
    zip_path = os.path.join(BASE_PATH, "processed_questions.zip")

    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in os.listdir(OUTPUT_DIR):
            file_path = os.path.join(OUTPUT_DIR, file)
            if os.path.isfile(file_path):
                zipf.write(file_path, arcname=file)

    download_file(zip_path)
    print(f"All processed files zipped to {zip_path}")

# Uncomment to create and download a zip of all processed files
# create_and_download_zip()