<a href="https://colab.research.google.com/github/armelida/MELIDA/blob/main/src/questions-scraper/MELIDA_PDF_Scraper_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Environment setup, imports, and helper functions

import os
import re
import json
import pdfplumber
import pandas as pd
from typing import Dict, List, Tuple

# Project directory layout. Run this cell before any other one so that the
# path constants below exist and both folders are present on disk.
BASE_PATH = "/content/MELIDA"
PDF_DIR = os.path.join(BASE_PATH, "data/raw/exams")     # exam PDFs to parse
OUTPUT_DIR = os.path.join(BASE_PATH, "data/questions")  # exported results

# exist_ok=True keeps re-running the cell harmless.
for _required_dir in (PDF_DIR, OUTPUT_DIR):
    os.makedirs(_required_dir, exist_ok=True)

print(f"PDF Directory: {PDF_DIR}")
print(f"Output Directory: {OUTPUT_DIR}")

def normalize_spaces(text: str) -> str:
    """Collapse every run of whitespace in *text* into one space.

    Leading and trailing whitespace is removed as well. Falsy input
    (``None`` or an empty string) yields an empty string.
    """
    return re.sub(r'\s+', ' ', text).strip() if text else ""

def get_exam_metadata(filename: str) -> Tuple[str, str]:
    """Infer the exam type and the year from an exam PDF filename.

    Args:
        filename: Name of the PDF file (only the text of the name is used).

    Returns:
        A ``(exam_type, year)`` tuple. ``year`` is the first run of four
        digits found in the name, or ``"UNKNOWN"`` when there is none.
        ``exam_type`` is ``"EIR"`` when the name contains "ENFERMERIA" and
        ``"MIR"`` otherwise (including when it contains "MEDICINA", which is
        checked first). The keyword match ignores case and diacritics, so
        "Enfermería" is recognised whether the accent is stored precomposed
        or as a separate combining mark.
    """
    # Local import keeps this function self-contained within the notebook.
    import unicodedata

    year_match = re.search(r'(\d{4})', filename)
    year = year_match.group(1) if year_match else "UNKNOWN"

    # Decompose accented letters and drop the combining marks so that
    # "ENFERMERÍA" compares equal to "ENFERMERIA".
    decomposed = unicodedata.normalize("NFKD", filename.upper())
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    if "MEDICINA" not in folded and "ENFERMERIA" in folded:
        return "EIR", year
    return "MIR", year

def format_question_id(exam_type: str, year: str, version: str = "v01",
                       question_type: str = "t01", question_num: int = 0) -> str:
    """Build a question identifier such as ``MIR-2024-v01-t01-Q026``.

    The parts are joined with hyphens; the question number is prefixed with
    ``Q`` and zero-padded to at least three digits.
    """
    id_template = "{}-{}-{}-{}-Q{:03d}"
    return id_template.format(exam_type, year, version, question_type, question_num)

def create_formatted_question(exam_type: str, year: str, qnum: int,
                              qtext: str, options: List[str],
                              source_file: str, page_num: int) -> Dict:
    """Assemble the dictionary that represents one extracted question.

    The question text has its whitespace collapsed, words that were split by
    a hyphen followed by whitespace are rejoined, and square brackets are
    removed while their content is kept. The first four entries of *options*
    are cleaned the same way (except for the bracket removal) and stored
    under the keys "A" to "D"; missing options become empty strings and any
    entries beyond the fourth are ignored.

    Returns:
        A dict with the keys ``id``, ``question_text``, ``options`` and
        ``_metadata`` (source file, page number and original number).
    """
    split_word = r'(\w)-\s+(\w)'  # "hiper- tension" -> "hipertension"

    question_text = re.sub(split_word, r'\1\2', normalize_spaces(qtext))
    question_text = re.sub(r'\[(.*?)\]', r'\1', question_text)

    labels = ["A", "B", "C", "D"]
    # zip stops at the shorter input, which caps the options at four.
    option_dict = {
        label: re.sub(split_word, r'\1\2', normalize_spaces(raw_option))
        for label, raw_option in zip(labels, options)
    }
    for label in labels:
        option_dict.setdefault(label, "")

    metadata = {
        "source_file": source_file,
        "page_number": page_num,
        "original_number": qnum
    }
    return {
        "id": format_question_id(exam_type, year, question_num=qnum),
        "question_text": question_text,
        "options": option_dict,
        "_metadata": metadata
    }


PDF Directory: /content/MELIDA/data/raw/exams
Output Directory: /content/MELIDA/data/questions


In [2]:
# Cell 1A: Export functions

def clean_output_for_export(questions: List[Dict]) -> List[Dict]:
    """Return copies of *questions* without their private fields.

    Every key that starts with an underscore (for example ``_metadata``) is
    left out. The copies are shallow: nested values are shared with the
    input dictionaries.
    """
    return [
        {field: value for field, value in question.items() if not field.startswith('_')}
        for question in questions
    ]

def export_questions(questions: List[Dict], output_dir: str,
                     exam_type: str = "MIR", year: str = "2024",
                     version: str = "v01", question_type: str = "t01") -> Tuple[str, str]:
    """Write *questions* to a JSON file and a CSV file inside *output_dir*.

    Both files are named ``{exam_type}-{year}-{version}-{question_type}``
    with the matching extension. The JSON file omits private fields (keys
    starting with an underscore). The CSV file has one column per option
    (``option_A`` .. ``option_D``) and one column per ``_metadata`` entry.
    Question text and option text are whitespace-normalised in both files.
    The dictionaries in *questions* are not modified.

    Args:
        questions: Question dictionaries; each needs ``question_text`` and
            an ``options`` mapping.
        output_dir: Destination folder, created if it does not exist.
        exam_type, year, version, question_type: Parts of the file name.

    Returns:
        ``(json_path, csv_path)``, or ``(None, None)`` when *questions* is
        empty (nothing is written in that case).
    """
    if not questions:
        return None, None
    os.makedirs(output_dir, exist_ok=True)
    filename_base = f"{exam_type}-{year}-{version}-{question_type}"

    # Build fresh, normalised records. The nested options dict is rebuilt
    # rather than edited in place because the cleaned copies are shallow and
    # share it with the caller's question dictionaries.
    export_qs = []
    for cleaned in clean_output_for_export(questions):
        record = dict(cleaned)
        record['question_text'] = normalize_spaces(cleaned['question_text'])
        record['options'] = {
            key: normalize_spaces(value) for key, value in cleaned['options'].items()
        }
        export_qs.append(record)

    json_path = os.path.join(output_dir, f"{filename_base}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(export_qs, f, ensure_ascii=False, indent=2)

    # The CSV keeps the private fields of the original questions but takes
    # its text from the normalised records so both files agree.
    csv_rows = [
        {**original, 'question_text': record['question_text'], 'options': record['options']}
        for original, record in zip(questions, export_qs)
    ]
    df = pd.DataFrame(csv_rows)
    if not df.empty and 'options' in df.columns:
        for key in ['A', 'B', 'C', 'D']:
            df[f'option_{key}'] = df['options'].apply(lambda opts, key=key: opts.get(key, ''))
        df = df.drop(columns=['options'])
    if '_metadata' in df.columns:
        # Expand the metadata dicts into one column per entry.
        metadata_df = pd.json_normalize(df['_metadata'])
        df = pd.concat([df.drop(columns=['_metadata']), metadata_df], axis=1)
    csv_path = os.path.join(output_dir, f"{filename_base}.csv")
    # utf-8-sig writes a byte-order mark at the start of the file.
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')
    return json_path, csv_path


In [3]:
# Cell 2: Extraction helper
def extract_lines_with_x0(page, bbox):
    """Return the text of the lines found inside *bbox* of *page*.

    The page is cropped to *bbox* and its text lines are read in the order
    the cropped page reports them. Each line is stripped; page-number lines
    such as "Página: 1 de 33" are dropped; a line that ends with a hyphen is
    merged with the following line (the hyphen is removed). Despite the
    function name, only the text of each line is returned.

    Args:
        page: Object offering ``crop(bbox)`` whose result offers
            ``extract_text_lines()`` returning dicts with a ``"text"`` key
            (a pdfplumber page in this notebook).
        bbox: Bounding box passed through to ``page.crop``.

    Returns:
        A list of strings, one per (possibly merged) line.
    """
    lines = []
    hyphen_buffer = ""
    for ln in page.crop(bbox).extract_text_lines():
        text = ln["text"].strip()
        # Skip page numbering lines (e.g., "Página: 1 de 33")
        if re.match(r"^Página:\s*\d+\s*de\s*\d+$", text):
            continue
        if hyphen_buffer:
            text = hyphen_buffer + text
            hyphen_buffer = ""
        if re.search(r'-\s*$', text):
            # Hold the line back so it can be glued to the next one.
            hyphen_buffer = re.sub(r'-\s*$', '', text)
            continue
        lines.append(text)
    if hyphen_buffer:
        # The region ended on a hyphenated line: there is no next line to
        # merge with, so emit the held text instead of losing it. The hyphen
        # is restored so later de-hyphenation can still join the word with
        # whatever text follows.
        lines.append(hyphen_buffer + "-")
    return lines


In [4]:
# Cell 3: Extraction using regex with warnings for jumps and option count

def extract_questions_from_text(text: str, exam_type: str, year: str,
                                source_file: str, page_num: int, min_valid: int = 26) -> Tuple[List[Dict], int]:
    """Extract four-option questions from the text of one page.

    A question is recognised as ``<number>. <header ending in ? or :>``
    followed by four options that each start with ``<number>. ``. Matches
    whose number is below *min_valid* are reported and skipped. A message is
    also printed when a question number differs from the expected
    sequential number (which starts at *min_valid*).

    Args:
        text: Page text to search.
        exam_type, year: Used to build each question id.
        source_file, page_num: Stored in each question's metadata and used
            in the printed messages.
        min_valid: Lowest question number that is kept; also the first
            expected number.

    Returns:
        ``(questions, expected_qnum)`` where ``expected_qnum`` is the number
        expected after the last accepted question (or *min_valid* when no
        question was accepted).
    """
    # Regex pattern:
    #  - (?P<orig_qnum>\d+)\.\s+  captures the original question number.
    #  - (?!Pregunta asociada a la imagen) rejects a match whose text starts
    #    with that phrase.
    #  - (?P<qtext>.*?(?:\?|:)) captures the header up to the first '?' or ':'
    #    that lets the rest of the pattern match.
    #  - Four options follow, each starting with a number, a dot and whitespace.
    #  - (?=\d+\.\s|$) ends the fourth option at the next "<number>. " marker
    #    or at the end of the text. The whitespace is required, as it is for
    #    every other marker in the pattern, so that numbers such as "10.000"
    #    or "2.5" inside the option do not cut it short.
    pattern = (
        r"(?P<orig_qnum>\d+)\.\s+"
        r"(?!Pregunta asociada a la imagen)"
        r"(?P<qtext>.*?(?:\?|:))\s+"
        r"(?P<opt1>\d+\.\s+.*?)(?P<opt2>\d+\.\s+.*?)(?P<opt3>\d+\.\s+.*?)(?P<opt4>\d+\.\s+.*?)(?=\d+\.\s|$)"
    )
    questions = []
    expected_qnum = min_valid  # The sequential expected question number
    for m in re.finditer(pattern, text, re.DOTALL):
        # The group only ever contains digits, so int() cannot fail here.
        orig_qnum = int(m.group("orig_qnum"))

        # Skip questions with a number below the valid minimum.
        if orig_qnum < min_valid:
            print(f"Skipping question with number {orig_qnum} (min valid is {min_valid}) on page {page_num}.")
            continue

        # Warn about gaps or repeats in the numbering; the question is kept.
        if orig_qnum != expected_qnum:
            print(f"WARNING: Expected question number {expected_qnum} but found {orig_qnum} on page {page_num}.")

        # The stored question text keeps its original number as a prefix.
        qtext = normalize_spaces(m.group("qtext"))
        header = f"{orig_qnum}. {qtext}"

        # Strip the leading "<number>. " marker from each of the four options.
        opts = [
            normalize_spaces(re.sub(r"^\d+\.\s+", "", m.group(opt_tag)))
            for opt_tag in ("opt1", "opt2", "opt3", "opt4")
        ]

        q = create_formatted_question(exam_type, year, orig_qnum, header, opts, source_file, page_num)
        questions.append(q)
        expected_qnum = orig_qnum + 1  # Expect the next question to have the next number.

    return questions, expected_qnum


In [5]:
# Cell 4: Updated process_pdf using the new extraction function
def process_pdf(pdf_path: str, start_page: int = 2, min_valid: int = 26) -> Tuple[List[Dict], List[str]]:
    """Extract the questions of one exam PDF.

    Each processed page is cut at its horizontal midpoint; the lines of the
    left half are read first, then those of the right half (presumably to
    follow a two-column layout - confirm against the PDFs). The joined text
    of the page is then searched for questions.

    Args:
        pdf_path: Path of the PDF file. Exam type and year are derived from
            its file name.
        start_page: Zero-based index of the first page to process; pages
            before it are ignored. Page numbers stored with the questions
            are one-based (index + 1).
        min_valid: Lowest question number to keep, forwarded to
            ``extract_questions_from_text`` for every page.

    Returns:
        ``(questions, warnings)``. If any error occurs while reading the
        PDF, the error message is printed and returned in ``warnings`` and
        ``questions`` is empty (questions already extracted are discarded).
    """
    questions = []
    warnings = []
    source_file = os.path.basename(pdf_path)
    exam_type, year = get_exam_metadata(source_file)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i in range(start_page, len(pdf.pages)):
                page = pdf.pages[i]
                w, h = page.width, page.height
                mid_x = w / 2.0
                left_lines = extract_lines_with_x0(page, (0, 0, mid_x, h))
                right_lines = extract_lines_with_x0(page, (mid_x, 0, w, h))
                page_text = normalize_spaces(" ".join(left_lines + right_lines))
                # NOTE: the returned "next expected number" is not carried
                # over, so the sequence check restarts at min_valid on every
                # page.
                qs, _ = extract_questions_from_text(
                    page_text, exam_type, year, source_file, i + 1, min_valid=min_valid
                )
                questions.extend(qs)
        print(f"Extraction completed successfully: {len(questions)} questions extracted.")
    except Exception as e:
        # Boundary handler: one unreadable PDF is reported to the caller
        # through the warnings list instead of raising.
        error_msg = f"Error processing {os.path.basename(pdf_path)}: {str(e)}"
        print(error_msg)
        warnings.append(error_msg)
        questions = []
    return questions, warnings


In [6]:
# Cell 5: Process directory using updated parsing and export results
def process_directory(pdf_dir: str, output_dir: str, start_page: int = 2) -> None:
    """Extract questions from every PDF in *pdf_dir* and export them.

    Files are handled in sorted name order; names not ending in ".pdf"
    (case-insensitive) are ignored. The questions of all files are grouped
    by the first two hyphen-separated parts of their id (exam type and
    year) and each group is exported to *output_dir* with
    ``export_questions``. Collected warnings, if any, are written to
    ``extraction_warnings.csv`` in the same folder.
    """
    all_questions = []
    all_warnings = []

    pdf_names = [name for name in sorted(os.listdir(pdf_dir)) if name.lower().endswith(".pdf")]
    for filename in pdf_names:
        pdf_path = os.path.join(pdf_dir, filename)
        exam_type, year = get_exam_metadata(filename)
        try:
            questions, warnings = process_pdf(pdf_path, start_page=start_page)
            all_questions.extend(questions)
            all_warnings.extend([{"file": filename, "warning": w} for w in warnings])
            print(f"Extracted {len(questions)} questions from {filename} ({exam_type} {year})")
        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            print(error_msg)
            all_warnings.append({"file": filename, "warning": error_msg})

    # Bucket the questions under "<exam_type>-<year>", taken from their id.
    grouped_questions = {}
    for q in all_questions:
        id_parts = q['id'].split('-')
        if len(id_parts) < 2:
            continue
        group_key = f"{id_parts[0]}-{id_parts[1]}"
        grouped_questions.setdefault(group_key, []).append(q)

    # One JSON/CSV pair per bucket.
    for key, questions in grouped_questions.items():
        exam_type, year = key.split('-')
        json_path, csv_path = export_questions(
            questions,
            output_dir,
            exam_type=exam_type,
            year=year
        )
        if not json_path:
            continue
        print(f"Exported {len(questions)} {exam_type} {year} questions to:")
        print(f"  - JSON: {os.path.basename(json_path)}")
        print(f"  - CSV: {os.path.basename(csv_path)}")

    if not all_warnings:
        return
    warnings_path = os.path.join(output_dir, "extraction_warnings.csv")
    pd.DataFrame(all_warnings).to_csv(warnings_path, index=False)
    print(f"Exported {len(all_warnings)} warnings to {warnings_path}")

# Run the whole pipeline: parse every PDF found in PDF_DIR and export the
# results to OUTPUT_DIR. start_page=2 is a zero-based page index, so the first
# two pages of each PDF are not parsed.
process_directory(PDF_DIR, OUTPUT_DIR, start_page=2)


Skipping question with number 1 (min valid is 26) on page 3.
Skipping question with number 2 (min valid is 26) on page 3.
Skipping question with number 3 (min valid is 26) on page 3.
Skipping question with number 4 (min valid is 26) on page 3.
Skipping question with number 5 (min valid is 26) on page 3.
Skipping question with number 6 (min valid is 26) on page 3.
Skipping question with number 7 (min valid is 26) on page 4.
Skipping question with number 8 (min valid is 26) on page 4.
Skipping question with number 9 (min valid is 26) on page 4.
Skipping question with number 10 (min valid is 26) on page 4.
Skipping question with number 11 (min valid is 26) on page 4.
Skipping question with number 12 (min valid is 26) on page 4.
Skipping question with number 13 (min valid is 26) on page 5.
Skipping question with number 14 (min valid is 26) on page 5.
Skipping question with number 15 (min valid is 26) on page 5.
Skipping question with number 16 (min valid is 26) on page 5.
Skipping question