# 📄 PDF Processor: Extract, Clean, and Analyze PDF Content

## 🧠 Purpose
This script processes PDF files to extract, clean, score, and summarize text and metadata.

In [None]:
import PyPDF2
import re
import pandas as pd
import os
import random

### 🧼 remove_existing_file

In [None]:
def remove_existing_file(file_path):
    """
    Removes a file if it exists to prevent overwrite issues, and reports what happened.

    Args:
    file_path (str): The path of the file to be checked and removed.
    """
    # Attempt the removal directly instead of checking first: this avoids the
    # window in which the file could disappear between the check and the delete.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f"No existing file to remove: {file_path}")
    else:
        print(f"Removed existing file: {file_path}")

### 📖 extract_text

In [None]:
def extract_text(pdf_path):
    """
    Extracts text from a given PDF file using PyPDF2.

    Read errors reported by PyPDF2 and PDFs with zero pages are logged to
    stdout and turned into a None result instead of propagating.

    Args:
    pdf_path (str): The file path of the PDF from which to extract text.
    Returns:
    str: The concatenated, stripped text of all pages. This is an empty
         string when the pages yield no extractable text.
    None: If the PDF has no pages or PyPDF2 cannot read it.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if len(reader.pages) == 0:
                raise ValueError("PDF file contains no text pages.")
            # Pages must be read while the file is still open. A page whose
            # extract_text() returns a falsy value contributes an empty string.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        return text.strip()
    except (PyPDF2.errors.PdfReadError, ValueError) as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

### 🏷️ extract_metadata

In [None]:
def extract_metadata(pdf_path):
    """
    Extracts the document information (metadata) from a PDF file using PyPDF2.

    Errors raised while opening or parsing the file are not caught here and
    propagate to the caller.

    Args:
    pdf_path (str): The file path of the PDF from which to extract metadata.
    Returns:
    The value of PyPDF2's ``PdfReader.metadata`` for the file: a dict-like
    object keyed by PDF info names such as '/Title'. PyPDF2 documents that
    this may be None when the PDF carries no metadata, so callers should
    check before calling ``.get``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        metadata = reader.metadata
    return metadata

### 🧹 clean_text

In [None]:
def clean_text(text):
    """
    Cleans extracted text by blanking out everything that is not printable ASCII
    and normalizing whitespace.

    Note that any character outside the range space (0x20) to tilde (0x7E) is
    replaced, which includes tabs, newlines, and non-ASCII letters such as 'é'.

    Args:
    text (str): The text to be cleaned.
    Returns:
    str: The cleaned text, with runs of whitespace collapsed to single spaces
         and no leading or trailing whitespace.
    """
    # Replace with a space rather than deleting, so words separated only by a
    # newline or other stripped character do not get fused together.
    printable_only = re.sub(r'[^ -~]', ' ', text)
    return re.sub(r'\s+', ' ', printable_only).strip()

### 🧪 assess_cleanliness

In [None]:
def assess_cleanliness(text):
    """
    Calculates the cleanliness score as the share of ASCII letters among all characters.

    Args:
    text (str): The text to assess.
    Returns:
    float: The percentage (0-100, rounded to two decimals) of characters in
           ``text`` that match [a-zA-Z]. Empty text scores 0.0.
    """
    # Guard against division by zero on empty input.
    if not text:
        return 0.0
    alpha_count = len(re.findall(r'[a-zA-Z]', text))
    return round((alpha_count / len(text)) * 100, 2)

### 🎲 process_random_pdfs

In [None]:
def process_random_pdfs(directory, output_file="batch_summary.xlsx", sample_size=10):
    """
    Processes a random selection of PDFs from a directory and compiles the results into an Excel file.

    For each selected PDF the text is extracted, cleaned, and scored. Files
    whose text cannot be extracted are still listed, marked as
    "N/A due to file error".

    Args:
    directory (str): The directory containing the PDF files.
    output_file (str): The filename for the output Excel spreadsheet.
    sample_size (int): Maximum number of PDFs to sample (non-negative).
        Defaults to 10. If the directory holds fewer PDFs, all are processed.
    """
    # Match the extension case-insensitively so files named "*.PDF" are included.
    pdf_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]
    selected_files = random.sample(pdf_files, min(sample_size, len(pdf_files)))
    results = []

    for pdf in selected_files:
        pdf_path = os.path.join(directory, pdf)
        print(f"Processing: {pdf_path}")

        # Extract text first: extract_text handles unreadable PDFs itself and
        # returns None, so a corrupt file is recorded as an error row instead
        # of aborting the whole batch.
        extracted_text = extract_text(pdf_path)
        if not extracted_text:
            results.append({
                "File": pdf,
                "Metadata Title": "N/A due to file error",
                "Cleanliness Score": "N/A due to file error"
            })
            continue

        # extract_metadata does not catch read errors, so guard it here.
        try:
            metadata = extract_metadata(pdf_path)
        except PyPDF2.errors.PdfReadError as e:
            print(f"Error reading metadata for {pdf_path}: {e}")
            metadata = None

        cleaned_text = clean_text(extracted_text)
        results.append({
            "File": pdf,
            # PyPDF2 may report no metadata at all (None); treat that like a
            # missing title.
            "Metadata Title": (metadata or {}).get('/Title', 'No title in metadata'),
            "Cleanliness Score": assess_cleanliness(cleaned_text)
        })

    df = pd.DataFrame(results)
    df.to_excel(output_file, index=False)
    print(f"Data successfully exported to {output_file}")

# Usage

### 📍 Script Usage

In [None]:
# Directory scanned for PDFs. This is a hard-coded absolute path; change it to
# the folder that holds your own documents before running.
directory_path = "/Users/jm/NIST_SP_DOCS/"
# Delete the report left by a previous run. The file name is the same as the
# default output_file of process_random_pdfs, which is what the call below uses.
remove_existing_file("batch_summary.xlsx")
# Sample PDFs from the directory and write the summary spreadsheet.
process_random_pdfs(directory_path)