In [4]:
pip install pypdf

Collecting pypdf
  Downloading pypdf-6.6.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.6.0-py3-none-any.whl (328 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.6.0
Note: you may need to restart the kernel to use updated packages.


Truncate multi-page PDFs to their first page

In [None]:
import os
from pathlib import Path
from pypdf import PdfReader, PdfWriter

# Root of the full-length PDFs; main() walks the immediate sub-folders of this
# directory and treats each folder name as a year label.
INPUT_DIR = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\pdfs\icss")
# Root of the truncated copies; each output keeps its path relative to INPUT_DIR.
OUTPUT_DIR = Path(r"D:\ITMO Big Data & ML School\semester 3\RI3\pdfs\icss_truncated")


def truncate_pdf(input_path: Path, output_path: Path, max_pages=1):
    """Save a copy of a PDF that contains only its first ``max_pages`` pages.

    Args:
        input_path: PDF to read.
        output_path: Where the truncated PDF is written; missing parent
            directories are created.
        max_pages: Number of leading pages to keep. Must be at least 1.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, reason)`` where
        ``reason`` is ``"empty"`` for a PDF without pages or the text of the
        error that occurred. The function never raises for per-file problems,
        so a batch loop can keep going.
    """
    import io  # local import: the cell's import block does not provide it

    try:
        # A non-positive limit used to produce a zero-page PDF reported as a
        # success; reject it before touching any file.
        if max_pages < 1:
            return False, f"max_pages must be >= 1 (got {max_pages})"

        reader = PdfReader(input_path)
        num_pages = len(reader.pages)
        if num_pages == 0:
            return False, "empty"

        writer = PdfWriter()
        for page_index in range(min(num_pages, max_pages)):
            writer.add_page(reader.pages[page_index])

        # Serialize in memory first so a failure while writing the PDF cannot
        # leave a half-written file on disk.
        buffer = io.BytesIO()
        writer.write(buffer)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(buffer.getvalue())

        return True, None

    except Exception as e:
        return False, str(e)


def main():
    """Truncate every PDF found in the year sub-folders of INPUT_DIR.

    For each immediate sub-folder of INPUT_DIR (its name is used as the year
    label) every ``*.pdf`` file is passed to ``truncate_pdf`` and written to
    the same relative path under OUTPUT_DIR. Per-year counts, the files that
    failed (with the reason returned by ``truncate_pdf``) and overall totals
    are printed. Returns ``None``; prints a message and returns early when
    INPUT_DIR does not exist.
    """
    if not INPUT_DIR.exists():
        print(f"Input directory not found: {INPUT_DIR}")
        return

    print(f"Input: {INPUT_DIR}")
    print(f"Output: {OUTPUT_DIR}")

    # Group files by year folder
    year_folders = sorted(d for d in INPUT_DIR.iterdir() if d.is_dir())

    total_found = 0
    total_converted = 0
    total_failed = 0

    for year_folder in year_folders:
        year = year_folder.name
        pdf_files = sorted(year_folder.glob("*.pdf"))

        if not pdf_files:
            continue

        found = len(pdf_files)
        converted = 0
        failures = []  # (file name, reason) for every file that was not written

        print(f"Year {year}: Found {found} papers")

        for file_path in pdf_files:
            out_path = OUTPUT_DIR / file_path.relative_to(INPUT_DIR)

            success, error = truncate_pdf(file_path, out_path)

            if success:
                converted += 1
            else:
                failures.append((file_path.name, error))

        print(f"Year {year}: Converted {converted} papers")

        # Failures were previously counted but never shown, so unreadable PDFs
        # went unnoticed.
        if failures:
            print(f"Year {year}: Failed {len(failures)} papers")
            for name, reason in failures:
                print(f"  {name}: {reason}")

        total_found += found
        total_converted += converted
        total_failed += len(failures)

    print(f"Total: Found {total_found}, converted {total_converted}, failed {total_failed}")
        
# Run the truncation pass as soon as this cell is executed.
main()

ICCS 2001–2009 and 2018–2025: metadata extraction using Docling

In [1]:
import os
import json
import re
import random
import time
import unicodedata
import gc
import torch
from pathlib import Path

# --- 1. SYSTEM & GPU CONFIGURATION ---
# Expose only GPU index 0 to CUDA and set KMP_DUPLICATE_LIB_OK to work around
# library conflicts.
# NOTE(review): both variables are assigned after `import torch` in this cell;
# presumably they should be set before any CUDA/OpenMP-backed library is
# imported to be guaranteed to take effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# --- 2. CONFIGURATION ---
# Built-in locations, used when the matching environment variable is not set
DEFAULT_ICCS_PDF_DIR = r"D:\ITMO Big Data & ML School\semester 3\RI3\pdfs\icss_truncated"
DEFAULT_ICCS_PARSED_DIR = r"D:\ITMO Big Data & ML School\semester 3\RI3\parsed\icss_docling"

# Effective locations: ICCS_PDF_DIR / ICCS_PARSED_DIR from the environment win
ICCS_PDF_DIR = Path(os.environ.get("ICCS_PDF_DIR", DEFAULT_ICCS_PDF_DIR))
ICCS_PARSED_DIR = Path(os.environ.get("ICCS_PARSED_DIR", DEFAULT_ICCS_PARSED_DIR))

# Conference years to process: 2001-2009 and 2018-2025 (both ranges inclusive)
YEARS = [*range(2001, 2010), *range(2018, 2026)]

# Run settings
RANDOM_SEED = 42
TEST_SAMPLE_SIZE = 5
TEST_MODE = False  # True -> process at most TEST_SAMPLE_SIZE files per year

# --- 3. HELPER FUNCTIONS ---
def safe_mkdir(p: Path) -> None:
    """Create directory ``p`` together with any missing parents; no-op if it exists."""
    p.mkdir(exist_ok=True, parents=True)

def list_pdfs(year_dir: Path) -> list[Path]:
    """Return the ``*.pdf`` files directly inside ``year_dir``, sorted by path.

    A missing directory yields an empty list; entries that match the pattern
    but are not regular files are left out.
    """
    if year_dir.exists():
        pdf_paths = [entry for entry in year_dir.glob("*.pdf") if entry.is_file()]
        pdf_paths.sort()
        return pdf_paths
    return []

def dump_json(path: Path, obj: dict) -> None:
    """Write ``obj`` to ``path`` as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    serialized = json.dumps(obj, indent=2, ensure_ascii=False)
    path.write_text(serialized, encoding="utf-8")

def is_valid_pdf(path: Path) -> bool:
    """Cheap sanity check: is ``path`` a non-empty file that starts with ``%PDF-``?

    Only the first five bytes are inspected; the rest of the file is not
    parsed. Returns ``False`` when the file is empty, lacks the header, or
    cannot be read.
    """
    try:
        if path.stat().st_size == 0:
            return False
        with open(path, "rb") as f:
            # Basic header check
            return f.read(5).startswith(b"%PDF-")
    except (OSError, ValueError):
        # Missing/unreadable file or malformed path. A bare ``except`` here
        # would also swallow KeyboardInterrupt and SystemExit, making a long
        # batch run impossible to interrupt at this point.
        return False

def remove_emails_and_urls(text: str) -> str:
    """Delete e-mail-like tokens first, then http(s)/www links, from ``text``.

    Matches are removed without collapsing the whitespace around them.
    """
    for pattern in (r'[\w\.,]+\s?@\s?[\w\.-]+', r'http[s]?://\S+|www\.\S+'):
        text = re.sub(pattern, '', text)
    return text

def clean_text(text: str) -> str:
    """Normalize a text fragment for metadata fields.

    Applies NFKC normalization, strips e-mails and URLs, drops the markdown
    characters ``*``, ``_`` and ``#``, and collapses every whitespace run to a
    single space. Falsy input gives an empty string.
    """
    if not text:
        return ""
    normalized = unicodedata.normalize('NFKC', text)
    without_contacts = remove_emails_and_urls(normalized)
    # Remove markdown formatting chars that might confuse parsing
    without_markup = re.sub(r'[\*_#]', '', without_contacts)
    # split() without arguments already discards leading/trailing whitespace
    return " ".join(without_markup.split())

# --- 4. ROBUST AFFILIATION DETECTION ---
# Long, unambiguous markers: finding them anywhere in the line is enough.
_AFFIL_SUBSTRING_KEYWORDS = (
    "university", "univ", "department", "dept", "laboratory", "school", "college",
    "institute", "institution", "center", "centre", "division", "faculty",
    "hospital", "foundation", "corporation", "gmbh",
    "engineering", "technology", "association", "academy",
    "zuse", "zentrum", "informationstechnik", "berlin", "forschung",
    "avenue", "street", "road", "drive", "lane", "po box",
    "city", "town", "state", "code",
    "germany", "france", "spain", "italy", "italia", "china", "japan", "korea",
    "russia", "poland", "netherlands", "brazil", "australia", "canada", "india",
    "taiwan", "singapore", "mexico", "sweden", "norway", "denmark", "finland",
    "belgium", "austria", "switzerland", "portugal", "greece", "turkey", "israel",
    "universität", "universite", "universidad", "università", "politecnico", "polytechnic",
    "dipartimento", "departamento", "departement", "institut", "instituto",
    "facoltà", "facultad", "faculte", "école", "ecole", "escuela",
    "strasse", "piazza", "calle", "avenida", "strada",
)

# Tokens of three letters or fewer also occur inside personal names
# ("usa" in "Susan", "uk" in "Suzuki"), so they only count as whole words.
_AFFIL_WHOLE_WORD_KEYWORDS = (
    "ave", "st", "rd", "dr", "via", "box",
    "ltd", "zib", "zip", "usa", "uk", "rue",
)

# "lab" must start a word: "Lab", "Labs", "Laboratoire" match, "collaboration" does not.
_AFFIL_SHORT_TOKEN_RE = re.compile(
    r'\b(?:' + '|'.join(_AFFIL_WHOLE_WORD_KEYWORDS) + r')\b|\blab'
)

# Leading numeric marker (e.g., "1. Department of...", "(2) ...", "[3] ...")
_AFFIL_NUMERIC_MARKER_RE = re.compile(r'^[\(\[]?\d+[\)\]\.]?\s+')


def is_likely_affiliation(text: str) -> bool:
    """Heuristically decide whether a header line is an affiliation/address.

    A line counts as an affiliation when, case-insensitively, it

    * contains one of the long institutional/address/country keywords,
    * contains one of the short tokens (``st``, ``uk``, ``usa``, ...) as a
      whole word, or a word starting with ``lab``, or
    * starts with a numeric marker such as ``1.``, ``(2)`` or ``[3]``.

    Short tokens are matched on word boundaries so that author names such as
    "Susan Suzuki" or "A.V. Walton" are not mistaken for affiliations.

    Args:
        text: One cleaned header line.

    Returns:
        ``True`` if the line looks like an affiliation, ``False`` otherwise.
    """
    text_lower = text.lower()

    if any(keyword in text_lower for keyword in _AFFIL_SUBSTRING_KEYWORDS):
        return True

    if _AFFIL_SHORT_TOKEN_RE.search(text_lower):
        return True

    return _AFFIL_NUMERIC_MARKER_RE.match(text_lower) is not None

# --- 5. HEADER PARSING ---
def parse_header_block(header_text: str) -> dict:
    """Split the text above the abstract into title, authors and affiliations.

    Every line is cleaned and blank results are dropped. The first remaining
    line is taken as the title; each later line goes to ``affiliations`` when
    ``is_likely_affiliation`` accepts it and is otherwise treated as part of
    the author list, which is joined with single spaces.

    Returns:
        A dict with ``title`` (str or None), ``authors`` (str or None) and
        ``affiliations`` (list of str).
    """
    cleaned_lines = (clean_text(raw_line) for raw_line in header_text.split('\n'))
    lines = [line for line in cleaned_lines if line]

    result = {"title": None, "authors": None, "affiliations": []}
    if not lines:
        return result

    # Assume first line is title, everything after it is authors/affiliations
    title, *remaining = lines
    result["title"] = title

    author_lines = []
    affiliation_lines = []
    for line in remaining:
        bucket = affiliation_lines if is_likely_affiliation(line) else author_lines
        bucket.append(line)

    if author_lines:
        result["authors"] = " ".join(author_lines)
    result["affiliations"] = affiliation_lines

    return result

# --- 6. METADATA EXTRACTION (VISUAL LOGIC) ---
# Section markers, each anchored at the start of a line (optionally preceded by
# markdown heading/emphasis characters and, where applicable, a section number).
_ABSTRACT_MARKER_RE = re.compile(
    r'(?i)^(?:#+\s*|[\*\-_]+\s*)?(?:(?:1|I|[0-9])\.?\s*)?(?:Abstract|Summary)[:\.]?(?:[\*\-_]+)?\s*',
    re.MULTILINE,
)
_KEYWORDS_MARKER_RE = re.compile(
    r'(?i)^(?:#+\s*|[\*\-_]+\s*)?Key\s*words[:\.]?\s*',
    re.MULTILINE,
)
_INTRO_MARKER_RE = re.compile(
    r'(?i)^(?:#+\s*|[\*\-_]+\s*)?(?:(?:1|I|[0-9])\.?\s*)?(?:Introduction|Background|Motivation|Overview)',
    re.MULTILINE,
)


def parse_metadata_visual_logic(md_text: str) -> dict:
    """Extract title, authors, affiliations, keywords and abstract from markdown.

    The text is divided by the first "Abstract"/"Summary" marker, the first
    "Keywords" marker and the first "Introduction"-like marker:

    * everything before the earlier of abstract/keywords is the header and is
      handed to ``parse_header_block``;
    * abstract and keywords each run from the end of their own marker to the
      start of the next marker, the last one ending at the introduction (or at
      the end of the text when there is none);
    * with neither abstract nor keywords, the text before the introduction is
      parsed as the header; with no marker at all, the first non-empty line
      becomes the title.

    The introduction marker is only looked for from the abstract onwards (from
    the keywords when there is no abstract), so an earlier line that merely
    starts with "Introduction"/"Overview"/... cannot hide the real section
    heading or truncate the body.

    Args:
        md_text: Markdown export of the (first page of the) paper.

    Returns:
        A dict with the keys ``filename`` (always ``""`` here; the caller
        fills it in), ``title``, ``authors``, ``affiliations``, ``keywords``
        and ``abstract``. Fields that could not be located stay ``None``
        (``affiliations``: empty list).
    """
    metadata = {
        "filename": "", "title": None, "authors": None,
        "affiliations": [], "keywords": None, "abstract": None
    }

    abs_match = _ABSTRACT_MARKER_RE.search(md_text)
    kw_match = _KEYWORDS_MARKER_RE.search(md_text)

    if abs_match is None and kw_match is None:
        intro_match = _INTRO_MARKER_RE.search(md_text)
        if intro_match:
            # Fallback: no Abstract/Keywords found, use Introduction as splitter
            metadata.update(parse_header_block(md_text[:intro_match.start()]))
        else:
            # Fallback: first line that still has content after cleaning
            cleaned_lines = (clean_text(line) for line in md_text.split('\n'))
            metadata["title"] = next((line for line in cleaned_lines if line), None)
        return metadata

    # The body ends at the first introduction heading that follows the
    # abstract (or the keywords, when there is no abstract).
    anchor = abs_match if abs_match is not None else kw_match
    intro_match = _INTRO_MARKER_RE.search(md_text, anchor.start())
    end_idx = intro_match.start() if intro_match else len(md_text)

    # Header = everything before whichever marker comes first
    split_idx = min(m.start() for m in (abs_match, kw_match) if m is not None)
    metadata.update(parse_header_block(md_text[:split_idx]))

    abstract_first = abs_match is not None and (
        kw_match is None or abs_match.start() < kw_match.start()
    )

    if abstract_first:
        if kw_match is not None and kw_match.start() < end_idx:
            # Abstract ... Keywords ... [Introduction]
            metadata["abstract"] = clean_text(md_text[abs_match.end():kw_match.start()])
            metadata["keywords"] = clean_text(md_text[kw_match.end():end_idx])
        else:
            # No keywords, or they only appear after the introduction
            metadata["abstract"] = clean_text(md_text[abs_match.end():end_idx])
    elif abs_match is not None:
        # Keywords ... Abstract ... [Introduction]
        metadata["keywords"] = clean_text(md_text[kw_match.end():abs_match.start()])
        metadata["abstract"] = clean_text(md_text[abs_match.end():end_idx])
    else:
        # Keywords only
        metadata["keywords"] = clean_text(md_text[kw_match.end():end_idx])

    return metadata

# --- 7. MAIN EXECUTION ---
def main():
    """Convert every selected PDF with Docling and store its parsed metadata as JSON.

    Steps:
      1. Check that ICCS_PDF_DIR exists and create ICCS_PARSED_DIR.
      2. Start fresh ``success_log.txt`` / ``error_log.txt`` files (opened in
         "w" mode, so logs from a previous run are overwritten).
      3. Build a Docling ``DocumentConverter`` with OCR enabled and table
         structure detection disabled.
      4. Collect the PDFs of every year in YEARS; with TEST_MODE on, at most
         TEST_SAMPLE_SIZE randomly sampled files per year are kept.
      5. For each PDF: validate the header, convert to markdown, parse the
         metadata, write ``<stem>.json`` into the year's output folder and
         append a line to the matching log.

    Raises:
        FileNotFoundError: if ICCS_PDF_DIR does not exist. Errors on individual
        files are caught, printed and logged; processing continues.

    NOTE(review): an earlier cell of this notebook also defines ``main()``;
    running this cell rebinds the name to this function.
    """
    if not ICCS_PDF_DIR.exists():
        raise FileNotFoundError(f"Source Directory not found: {ICCS_PDF_DIR}")
    safe_mkdir(ICCS_PARSED_DIR)

    success_log_path = ICCS_PARSED_DIR / "success_log.txt"
    error_log_path = ICCS_PARSED_DIR / "error_log.txt"

    # Init Logs ("w" truncates whatever a previous run left behind)
    with open(success_log_path, "w", encoding="utf-8") as f:
        f.write(f"--- SUCCESS LOG (Started: {time.ctime()}) ---\n")
    with open(error_log_path, "w", encoding="utf-8") as f:
        f.write(f"--- ERROR LOG (Started: {time.ctime()}) ---\n")

    # Seed the stdlib RNG; it is only used for the TEST_MODE sampling below
    random.seed(RANDOM_SEED)

    print("\nInitializing Docling (GPU Mode)...")
    if torch.cuda.is_available():
        print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
    else:
        print("GPU not found! Using CPU (Slower).")

    # Configure Pipeline: OCR on, table structure detection off
    pipeline_opts = PdfPipelineOptions()
    pipeline_opts.do_table_structure = False
    pipeline_opts.do_ocr = True

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_opts)}
    )
    print("Model Loaded. gathering files...\n")

    # Gather All Tasks as (pdf_path, output_directory) pairs
    all_tasks = []
    for year in YEARS:
        year_dir = ICCS_PDF_DIR / str(year)
        out_year_dir = ICCS_PARSED_DIR / str(year)
        # The output folder is created even when the year has no PDFs
        safe_mkdir(out_year_dir)

        pdfs = list_pdfs(year_dir)

        # Apply Testing Limit if enabled
        if TEST_MODE and len(pdfs) > TEST_SAMPLE_SIZE:
            pdfs = random.sample(pdfs, TEST_SAMPLE_SIZE)

        for p in pdfs:
            all_tasks.append((p, out_year_dir))

    print(f"Total Files to Process: {len(all_tasks)}")
    if TEST_MODE: print(f"   (Test Mode Active: Max {TEST_SAMPLE_SIZE} files per year)")
    print("-" * 40)

    # Process Loop
    for i, (pdf_path, out_year_dir) in enumerate(all_tasks):
        # No newline here: the result ("✔", "Invalid PDF" or "Error: ...") finishes the line
        print(f"[{i+1}/{len(all_tasks)}] {pdf_path.name}...", end=" ", flush=True)

        try:
            # Skip files that are empty or lack the %PDF- header
            if not is_valid_pdf(pdf_path):
                print("Invalid PDF")
                with open(error_log_path, "a", encoding="utf-8") as f:
                    f.write(f"{pdf_path.name} - Invalid File Structure\n")
                continue

            start_time = time.time()

            # Convert
            res = converter.convert(str(pdf_path))
            md_text = res.document.export_to_markdown()

            # Parse
            extracted = parse_metadata_visual_logic(md_text)
            extracted["filename"] = pdf_path.name

            # Save
            out_file = out_year_dir / f"{pdf_path.stem}.json"
            dump_json(out_file, extracted)

            # Elapsed time covers conversion, parsing and the JSON write
            elapsed = time.time() - start_time
            print(f"✔ ({elapsed:.2f}s)")
            with open(success_log_path, "a", encoding="utf-8") as f:
                f.write(f"{pdf_path.name} - OK ({elapsed:.2f}s)\n")

            # Memory Cleanup (Critical for Batch Processing)
            del res
            del md_text
            # i is zero-based, so the collection runs on tasks 1, 11, 21, ...
            if i % 10 == 0: 
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        except Exception as e:
            # Best effort: report and log the failure, then move on to the next file
            print(f"Error: {e}")
            with open(error_log_path, "a", encoding="utf-8") as f:
                f.write(f"{pdf_path.name} - {type(e).__name__}: {str(e)}\n")

    print("\nDONE. Logs saved in parsed directory.")

# Start the batch when the module-level __name__ is "__main__".
if __name__ == "__main__":
    main()


Initializing Docling (GPU Mode)...
GPU Detected: NVIDIA GeForce RTX 3050 Laptop GPU
Model Loaded. gathering files...

Total Files to Process: 6135
----------------------------------------
[1/6135] 2001_10.1007_3-540-45545-0_1.pdf... 

[32m[INFO] 2026-01-21 05:07:37,712 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-21 05:07:37,729 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-21 05:07:37,747 [RapidOCR] download_file.py:60: File exists and is valid: C:\anaconda3\envs\RI_3\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-21 05:07:37,748 [RapidOCR] main.py:50: Using C:\anaconda3\envs\RI_3\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-21 05:07:38,871 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-21 05:07:38,871 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-21 05:07:38,874 [RapidOCR] download_file.py:60: File exists and is valid: C:\anaconda3\envs\RI_3\Lib\site-packages\rapidocr\models\ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-21 05:07:38,874 [RapidOCR] main.py:50: Using C:\anaconda3\envs\RI_3\Lib\site-pack

✔ (10.15s)
[2/6135] 2001_10.1007_3-540-45545-0_10.pdf... ✔ (0.43s)
[3/6135] 2001_10.1007_3-540-45545-0_100.pdf... ✔ (0.40s)
[4/6135] 2001_10.1007_3-540-45545-0_101.pdf... ✔ (0.52s)
[5/6135] 2001_10.1007_3-540-45545-0_102.pdf... ✔ (0.33s)
[6/6135] 2001_10.1007_3-540-45545-0_103.pdf... ✔ (0.37s)
[7/6135] 2001_10.1007_3-540-45545-0_104.pdf... ✔ (0.42s)
[8/6135] 2001_10.1007_3-540-45545-0_105.pdf... ✔ (0.40s)
[9/6135] 2001_10.1007_3-540-45545-0_106.pdf... ✔ (0.38s)
[10/6135] 2001_10.1007_3-540-45545-0_107.pdf... ✔ (0.60s)
[11/6135] 2001_10.1007_3-540-45545-0_108.pdf... ✔ (0.43s)
[12/6135] 2001_10.1007_3-540-45545-0_109.pdf... ✔ (0.33s)
[13/6135] 2001_10.1007_3-540-45545-0_11.pdf... ✔ (0.35s)
[14/6135] 2001_10.1007_3-540-45545-0_110.pdf... ✔ (0.35s)
[15/6135] 2001_10.1007_3-540-45545-0_111.pdf... ✔ (0.32s)
[16/6135] 2001_10.1007_3-540-45545-0_112.pdf... ✔ (0.35s)
[17/6135] 2001_10.1007_3-540-45545-0_113.pdf... ✔ (0.39s)
[18/6135] 2001_10.1007_3-540-45545-0_114.pdf... ✔ (0.33s)
[19/6135] 200

RapidOCR returned empty result!


✔ (0.60s)
[411/6135] 2002_10.1007_3-540-46080-2_34.pdf... ✔ (0.34s)
[412/6135] 2002_10.1007_3-540-46080-2_35.pdf... ✔ (0.29s)
[413/6135] 2002_10.1007_3-540-46080-2_36.pdf... ✔ (0.34s)
[414/6135] 2002_10.1007_3-540-46080-2_37.pdf... ✔ (0.35s)
[415/6135] 2002_10.1007_3-540-46080-2_38.pdf... ✔ (0.32s)
[416/6135] 2002_10.1007_3-540-46080-2_39.pdf... ✔ (0.32s)
[417/6135] 2002_10.1007_3-540-46080-2_4.pdf... ✔ (0.32s)
[418/6135] 2002_10.1007_3-540-46080-2_40.pdf... ✔ (0.33s)
[419/6135] 2002_10.1007_3-540-46080-2_41.pdf... ✔ (0.35s)
[420/6135] 2002_10.1007_3-540-46080-2_42.pdf... ✔ (0.34s)
[421/6135] 2002_10.1007_3-540-46080-2_43.pdf... ✔ (0.33s)
[422/6135] 2002_10.1007_3-540-46080-2_44.pdf... ✔ (0.31s)
[423/6135] 2002_10.1007_3-540-46080-2_45.pdf... ✔ (0.28s)
[424/6135] 2002_10.1007_3-540-46080-2_46.pdf... ✔ (0.41s)
[425/6135] 2002_10.1007_3-540-46080-2_47.pdf... ✔ (0.39s)
[426/6135] 2002_10.1007_3-540-46080-2_48.pdf... ✔ (0.34s)
[427/6135] 2002_10.1007_3-540-46080-2_49.pdf... ✔ (0.26s)
[428/

RapidOCR returned empty result!


✔ (0.39s)
[3577/6135] 2008_10.1007_978-3-540-69387-1_6.pdf... ✔ (0.27s)
[3578/6135] 2008_10.1007_978-3-540-69387-1_60.pdf... ✔ (0.31s)
[3579/6135] 2008_10.1007_978-3-540-69387-1_61.pdf... ✔ (0.34s)
[3580/6135] 2008_10.1007_978-3-540-69387-1_62.pdf... ✔ (0.31s)
[3581/6135] 2008_10.1007_978-3-540-69387-1_63.pdf... ✔ (0.34s)
[3582/6135] 2008_10.1007_978-3-540-69387-1_64.pdf... ✔ (0.36s)
[3583/6135] 2008_10.1007_978-3-540-69387-1_65.pdf... ✔ (0.42s)
[3584/6135] 2008_10.1007_978-3-540-69387-1_66.pdf... ✔ (0.36s)
[3585/6135] 2008_10.1007_978-3-540-69387-1_67.pdf... ✔ (0.34s)
[3586/6135] 2008_10.1007_978-3-540-69387-1_68.pdf... ✔ (0.34s)
[3587/6135] 2008_10.1007_978-3-540-69387-1_69.pdf... ✔ (0.32s)
[3588/6135] 2008_10.1007_978-3-540-69387-1_7.pdf... ✔ (0.38s)
[3589/6135] 2008_10.1007_978-3-540-69387-1_70.pdf... ✔ (0.41s)
[3590/6135] 2008_10.1007_978-3-540-69387-1_71.pdf... ✔ (0.37s)
[3591/6135] 2008_10.1007_978-3-540-69387-1_72.pdf... ✔ (0.34s)
[3592/6135] 2008_10.1007_978-3-540-69387-1_73.p

Auditing