# PDF Data Extraction and Processing Notebook

This notebook extracts titles and authors from PDF files, cleans the text, and saves the results into both CSV and JSON formats.

In [8]:
# ## 1. Imports and Setup
# Importing required libraries for PDF extraction, natural language processing, and data handling.

import fitz  # PyMuPDF
from transformers import pipeline
import re
import os
import json
import pandas as pd
import unicodedata

# Load the NER pipeline using a pre-trained model
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", grouped_entities=True)

# ## 2. Helper Functions
# Functions for cleaning and normalizing text, handling diacritical marks, and extracting author names.


In [9]:
def clean_hash_prefix(text):
    """Remove or correct sequences where '##' precede diacritic characters."""
    return re.sub(r'##([a-zA-Z])', r'\1', text)

def clean_author_lines(text):
    """Clean author lines by removing unwanted characters, bracketed sequences, numeric characters, and fixing diacritical marks."""
    text = re.sub(r'\[.*?\]', '', text)  # Remove sequences within brackets
    text = re.sub(r'\d+', '', text)  # Remove all numeric characters
    text = re.sub(r'[⋆*]', '', text)  # Remove stars and other unwanted special characters
    text = clean_hash_prefix(text)  # Normalize names with the '##' prefix
    text = fix_diacritics(text)  # Fix diacritical marks
    text = re.sub(r',+', ',', text)  # Replace multiple commas with a single comma
    text = re.sub(r'\s*,\s*', ', ', text).strip()  # Remove extra whitespace around commas
    return text

def fix_diacritics(text):
    """Normalize text to handle diacritical marks from various languages."""
    text = unicodedata.normalize('NFC', text)

    diacritic_symbols = ['ˇ', '´', '`', "'", '^', '~', '¨', ',']
    replacements = {
        'ˇ': {'s': 'š', 'c': 'č', 'z': 'ž', 'n': 'ň', 'r': 'ř', 't': 'ť', 'd': 'ď'},
        '´': {'e': 'é', 'a': 'á', 'i': 'í', 'o': 'ó', 'u': 'ú', 'ı': 'í'},
        '`': {'e': 'è', 'a': 'à', 'i': 'ì', 'o': 'ò', 'u': 'ù'},
        "'": {'e': 'é', 'a': 'á', 'i': 'í', 'o': 'ó', 'u': 'ú'},
        '^': {'a': 'â', 'i': 'î', 'o': 'ô', 'u': 'û'},
        '~': {'a': 'ã', 'o': 'õ', 'n': 'ñ'},
        '¨': {'a': 'ä', 'e': 'ë', 'i': 'ï', 'o': 'ö', 'u': 'ü'},
        ',': {'c': 'ç'}
    }

    text = re.sub(
        rf'({"|".join(map(re.escape, diacritic_symbols))})([a-zA-Zı])',
        lambda match: replacements.get(match.group(1), {}).get(match.group(2).lower(), match.group(0)),
        text
    )
    return text

def extract_authors(text):
    """Use NER to extract person names as authors, filtering out non-name entities."""
    ner_results = ner_pipeline(text)
    authors = [entity['word'] for entity in ner_results if entity['entity_group'] == 'PER']
    return list(set(authors))

def clean_title(title):
    """Remove non-letter characters from the title, except hyphens and colons."""
    return re.sub(r'[^A-Za-z\s\-\:]', '', title).strip()


# ## 3. Extracting Data from PDFs
# Function to extract titles and authors from PDFs, with special handling for specific files.

In [10]:
def extract_text_with_layout(pdf_path):
    """Extract text from the first page of a PDF using PyMuPDF, preserving block structure."""
    title_lines = []
    author_lines = ""

    with fitz.open(pdf_path) as doc:
        first_page = doc[0]  # Access the first page of the PDF
        blocks = first_page.get_text("blocks")  # Extract text as blocks to preserve layout
        
        if len(blocks) > 0:
            # Extract the title from the first block
            if pdf_path == 'data/ICDAR2024_proceedings_pdfs\\0162.pdf' or pdf_path == 'data/ICDAR2024_proceedings_pdfs\\0123.pdf' or pdf_path == 'data/ICDAR2024_proceedings_pdfs\\0271.pdf':
                title_lines = blocks[0][4] + blocks[1][4]
                title_lines = title_lines.splitlines()
            else:
                title_lines = blocks[0][4].splitlines()  # Get the text content of the first block as the title

            # Extract authors from the second block
            if len(blocks) > 1:
                if pdf_path == 'data/ICDAR2024_proceedings_pdfs\\0162.pdf' or pdf_path == 'data/ICDAR2024_proceedings_pdfs\\0123.pdf':
                    author_lines = clean_author_lines(blocks[2][4] + blocks[3][4] + blocks[4][4])
                elif pdf_path == 'data/ICDAR2024_proceedings_pdfs\\0271.pdf':
                    author_lines = clean_author_lines(blocks[2][4])
                else:
                    author_lines = clean_author_lines(blocks[1][4])  # Clean the text content of the second block for authors
    
    title = clean_title(" ".join(title_lines)) if title_lines else "Unknown Title"
    if pdf_path == 'data/ICDAR2024_proceedings_pdfs\\0271.pdf':
        authors = author_lines
        print(authors)
    else :
        authors = extract_authors(author_lines) if author_lines else ["Unknown Author"]
    return title, authors

# Function to extract abstracts and keywiords from PDFs

In [11]:
def extract_abstract_and_keywords(pdf_path):
    """Extract the abstract and keywords from the first page of a PDF."""
    abstract_text = None
    keywords_text = None

    with fitz.open(pdf_path) as doc:
        first_page = doc[0]
        text = first_page.get_text()

        # Extract Abstract
        abstract_match = re.search(r'Abstract\.\s*(.*?)(?=\n\n|\nKeywords:|$)', text, re.DOTALL | re.IGNORECASE)
        if abstract_match:
            abstract_text = abstract_match.group(1).strip()

        # Extract Keywords
        keywords_match = re.search(r'Keywords:\s*(.*?)(?=\n1 Introduction|\n\d|$)', text, re.DOTALL | re.IGNORECASE)
        if keywords_match:
            keywords_text = keywords_match.group(1).strip()

    return abstract_text, keywords_text

# ## 4. Processing PDF Files
# Logic to process all PDFs in the specified folder and extract titles and authors.

In [12]:
# Directory containing the PDF files
pdf_folder_path = 'data/ICDAR2024_proceedings_pdfs'

# Process all PDF files in the folder and store results directly into a list
results = []
pdf_files = sorted([f for f in os.listdir(pdf_folder_path) if f.endswith('.pdf')])

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder_path, pdf_file)
    title, authors = extract_text_with_layout(pdf_path)
    abstract, keywords = extract_abstract_and_keywords(pdf_path)
    pdf_id = pdf_file.split('.')[0].lstrip('0')

    # Handle authors list and format it properly
    if authors:
        formatted_authors = authors  
    else:
        formatted_authors = ["Unknown Author"]

    # Store the results in the specified format
    results.append({
        "ID": pdf_id,
        "Authors": formatted_authors,
        "Title": title,
        "Abstract": abstract,
        "Keywords": keywords
    })

# Convert the results to a DataFrame with the desired column order
results_df = pd.DataFrame(results, columns=["ID", "Authors", "Title"])



Ting-Wei LIAO and Hsiang-An WANG


# ## 5. Saving the Results
# Code to save the extracted results into both CSV and JSON formats.


In [13]:
# Save the DataFrame directly to a CSV file in the required format
csv_output_path = 'outputs/extracted_information.csv'
results_df.to_csv(csv_output_path, index=False)

print(f"Results have been saved to CSV file at {csv_output_path}")

json_data = {
    "tables": [{"ID": result["ID"], "title": result["Title"], "authors": result["Authors"]} for result in results],
    "classification": [],
    "keyInformationExtraction": [],
    "opticalCharacterRecognition": [],
    "datasets": [],
    "layoutUnderstanding": [],
    "others": []
}

json_output_path = 'outputs/extracted_information.json'
with open(json_output_path, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)

print(f"Results have been saved to JSON file at {json_output_path}")

Results have been saved to CSV file at outputs/extracted_information.csv
Results have been saved to JSON file at outputs/extracted_information.json
