In [4]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     --------- ------------------------------ 10.2/42.0 kB ? eta -:--:--
     --------------------------- ---------- 30.7/42.0 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 407.1 kB/s eta 0:00:00
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
     ---------------------------------------- 0.0/48.5 kB ? eta -:--:--
     ---------------------------------------- 48.5/48.5 kB 2.6 MB/s eta 0:00:00
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
   ---------------------------------------- 0.0/59.2 kB ? eta -:--:--
   ---------------------------------------- 59.2/59.2 kB 3.3 MB/s eta 0:00:00
Downloading

In [13]:
import json
import re

# Function to clean the text
def cleanse_text(text):
    unwanted_sections = ["references"]
    for section in unwanted_sections:
        pattern = r'(?i)\b{}\b.*?(?=\n\n|\Z)'.format(section)  # Case-insensitive section removal
        text = re.sub(pattern, '', text, flags=re.DOTALL)

    # Remove references to figures, tables, and page numbers
    text = re.sub(r'Figure\s*\d+', '', text)  # Remove "Figure X" references
    text = re.sub(r'Page\s*\d+\s*(of\s*\d+)?', '', text)  # Remove "Page X of Y" references
    text = re.sub(r'Table\s*\d+', '', text)  # Remove "Table X" references
    
    # You can add additional patterns to remove any other sections you don't want
    text = re.sub(r'Gates Open Research\s*\d{4},.*?Last updated:.*?\n', '', text)  # Remove repeated journal info
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = remove_leading_zeros(text)
    text = text.strip()
    text = re.sub(r'\. \.', '.', text)  # Fix any erroneous spaces between periods

    # Ensure paragraphs are maintained
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Ensure paragraph breaks are preserved

    return text


# Function to remove leading zeros from numbers
def remove_leading_zeros(text):
    pattern = r'\b0+(\d+(\.\d+)?)\b'
    result = re.sub(pattern, r'\1', text)
    return result

In [14]:
# Function to fix encoding issues
def fix_encoding_issues(text):
    try:
        text = text.encode('latin1').decode('utf-8')
    except UnicodeEncodeError:
        pass  # If encoding fails, just return the original text
    return text

In [15]:
# Function to extract the title if it's the first sentence before author names

def extract_title(text):
    # This regex will capture all text before the "[version" part
    title_match = re.search(r'^(.+?)\s*\[version.*$', text, re.DOTALL)
    if title_match:
        return title_match.group(1).strip()
    return None


# Function to extract DOI
def extract_doi(text):
    # Fix encoding issues
    text = fix_encoding_issues(text)
    
    # Regex pattern to match a DOI
    match = re.search(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', text, re.IGNORECASE)
    
    if match:
        doi = match.group(0).strip()
        
        # Clean up any trailing or leading non-DOI characters
        doi = re.sub(r'[^a-zA-Z0-9./:-]+$', '', doi)
        
        return doi
    else:
        return ""

# Function to extract authors
def extract_authors(text):
    text = fix_encoding_issues(text)
    # This regex looks for a list of names separated by commas
    match = re.search(r'(?m)^\s*[A-Z][a-z]+(?: [A-Z]\.)?(?:, [A-Z][a-z]+(?: [A-Z]\.)?)*', text)
    return match.group(0).strip() if match else ""

# Function to extract FullTextURL
def extract_fulltexturl(text):
    text = fix_encoding_issues(text)  # Fix any encoding issues first
    
    # Use regex to search for a line that contains 'FulltextUrl:' followed by a valid URL
    match = re.search(r'https?://[^\s]+', text)
    
    if match:
        url = match.group(0).strip()
        
        # Remove any trailing non-URL characters (e.g., 'List')
        url = re.sub(r'[^\w:/?=&.-]+$', '', url)  # Remove trailing non-URL characters
        
        return url
    else:
        return ""

In [16]:
import pdfplumber

# Load the PDF file
# pdf_path = 'climate.pdf'
pdf_path = 'sample_eu.pdf'
content = ""

# Extract text from each page in the PDF
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        content += page.extract_text() + " "

# Extract the required fields using the provided functions
title = extract_title(content)
doi = extract_doi(content)
full_text_url = extract_fulltexturl(content)
cleaned_content = cleanse_text(content)

# Display the extracted information
extracted_data = {
    "Title": title,
    "DOI": doi,
    "FullTextURL": full_text_url,
    "FullTextContent": cleaned_content
}

# Printing the results
print("Title:", extracted_data["Title"])
print("DOI:", extracted_data["DOI"])
print("FullTextURL:", extracted_data["FullTextURL"])
print("\nFullTextContent (first 1000 characters):\n", extracted_data["FullTextContent"][:10000])


Title: Open Research Europe
Open Research Europe 2024, 4:133 Last updated: 05 JUL 2024
RESEARCH ARTICLE
Enriching Earth observation datasets through semantics for
climate change applications: The EIFFEL ontology
DOI: 10.12688/openreseurope.17992.1
FullTextURL: https://doi.org/10.12688/openreseurope.17992.1

FullTextContent (first 1000 characters):
 Open Research Europe Open Research Europe 2024, 4:133 Last updated: 5 JUL 2024 RESEARCH ARTICLE Enriching Earth observation datasets through semantics for climate change applications: The EIFFEL ontology[version 1; peer review: awaiting peer review] Benjamin Molina 1, Carlos E. Palau1, Jaime Calvo-Gallego 2 1Communication Department, Universitat Politecnica de Valencia, Cam  de Vera, s/n, Valencia, 46022, Spain 2Computing and Automatics Department, Campus Viriato,scuela Polit cnica Superior de Zamora, Avenida de Requejo, 33,, Universidad de Salamanca, Zamora, 49022, Spain v1 First published: 2 Jul 2024, 4:133 Open Peer Review https://doi.org

In [17]:
from fpdf import FPDF

# Initialize PDF
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()

# Set title, if available
pdf.set_font("Arial", 'B', 16)
title = extracted_data.get("Title") or "No Title Available"
pdf.multi_cell(0, 10, title, align='C')
pdf.ln(10)  # Add space after title

# Set DOI and FullTextURL, if available
pdf.set_font("Arial", 'B', 12)
doi = extracted_data.get("DOI") or "DOI not available"
full_text_url = extracted_data.get("FullTextURL") or "Full Text URL not available"
pdf.cell(0, 10, f"DOI: {doi}", ln=True)
pdf.cell(0, 10, f"FullTextURL: {full_text_url}", ln=True)
pdf.ln(10)  # Add space before content

# Set the content with normal font and handle None type
pdf.set_font("Arial", '', 12)
full_text_content = extracted_data.get("FullTextContent") or "No content available"
pdf.multi_cell(0, 10, full_text_content)

# Save the PDF file
pdf_file_path = "extracted_data.pdf"
pdf.output(pdf_file_path)
print(f"PDF saved successfully to: {pdf_file_path}")


PDF saved successfully to: extracted_data.pdf
