# PDF OCR Text Conversion

## Set-up

In [21]:
# import libraries and packages
import pytesseract
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import logging
from nltk.corpus import words

In [22]:
# Setup
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
english_vocab = set(words.words())

In [23]:
# Paths
folder_path = "/Users/alexchen/Downloads/Projects/temp/Batch1"
output_readable_dir = "/Users/alexchen/Downloads/Projects/temp/Batch1_text_readable"
output_unreadable_dir = "/Users/alexchen/Downloads/Projects/temp/Batch1_text_unreadable"

# Create directories
os.makedirs(output_readable_dir, exist_ok=True)
os.makedirs(output_unreadable_dir, exist_ok=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

## Preprocessing Helper Function

In [24]:
# Preprocess pdfs for better OCR results
def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Preprocess an image for better OCR results.
    Enhances contrast and converts the image to grayscale.
    """
    image = image.convert("L")  # Convert to grayscale
    return image

## Extracting text from PDF Helper Functions

In [25]:
def get_tesseract_confidence(image: Image.Image) -> float:
    """
    Returns the average confidence score from Tesseract OCR for the given image.
    """
    # Get the OCR output with word-level data
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    confidences = [int(conf) for conf in ocr_data['conf'] if conf != '-1']
    
    # Return the average confidence (or 0 if no valid confidences found)
    if confidences:
        return sum(confidences) / len(confidences)
    else:
        return 0.0

In [26]:
# Helper function to extract text from a PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF using pytesseract and pdf2image.
    Includes an autocorrection step to fix common OCR mistakes.
    """
    try:
        images = convert_from_path(pdf_path, dpi=300)
        text = ""
        for image in images:
            processed_image = preprocess_image(image)
            text += pytesseract.image_to_string(processed_image, lang="eng")
        
        return text
    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        return ""

## Readability Classificaation Helper Functions

In [27]:
def is_readable_text(image: Image.Image, text: str, threshold: float = 0.6) -> bool:
    """
    Determines if the extracted text is readable by combining Tesseract confidence
    and the percentage of valid English words (without autocorrection).
    """
    # Get Tesseract confidence score
    tesseract_confidence = get_tesseract_confidence(image)

    # Calculate percentage of valid English words
    words_in_text = re.findall(r'\b\w+\b', text.lower())
    if not words_in_text:
        return False
    valid_words = [word for word in words_in_text if word in english_vocab]
    valid_percentage = len(valid_words) / len(words_in_text)

    # Composite score: weighted sum of confidence and valid word percentage
    composite_score = (0.7 * tesseract_confidence / 100) + (0.3 * valid_percentage)

    # Classify based on composite score
    return composite_score >= threshold

## Execute PDF Processing Helper Functions

In [28]:
metrics_list = []

def process_pdf(pdf_path: str, output_readable_dir: str, output_unreadable_dir: str, threshold: float = 0.6, export_text=False) -> dict:
    """
    Processes a single PDF, calculates metrics, and saves the text to the appropriate directory.
    Returns metrics for the processed PDF.
    """
    logging.info(f"Processing {pdf_path}...")
    file_name = os.path.basename(pdf_path)
    metrics = {"file_name": file_name}

    try:
        # Extract text and images
        images = convert_from_path(pdf_path, dpi=300)
        text = ""
        confidences = []

        for image in images:
            processed_image = preprocess_image(image)
            text += pytesseract.image_to_string(processed_image, lang="eng")
            confidences.append(get_tesseract_confidence(processed_image))

        # Calculate metrics
        num_chars = len(text)
        num_words = len(re.findall(r'\b\w+\b', text))
        avg_confidence = np.mean(confidences) if confidences else 0.0

        # Determine readability
        words_in_text = re.findall(r'\b\w+\b', text.lower())
        valid_words = [word for word in words_in_text if word in english_vocab]
        valid_percentage = len(valid_words) / len(words_in_text) if words_in_text else 0.0

        composite_score = (0.7 * avg_confidence / 100) + (0.3 * valid_percentage)
        quality = "readable" if composite_score >= threshold else "unreadable"

        # Save text to the appropriate directory
        output_dir = output_readable_dir if quality == "readable" else output_unreadable_dir
        output_path = os.path.join(output_dir, os.path.splitext(file_name)[0] + ".txt")
        if export_text:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
            logging.info(f"Saved text to {output_path}")

        # Populate metrics dictionary
        metrics.update({
            "number_of_characters": num_chars,
            "number_of_words": num_words,
            "confidence_level": avg_confidence,
            "composite_score": composite_score,
            "quality": quality,
        })

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        metrics.update({
            "number_of_characters": 0,
            "number_of_words": 0,
            "confidence_level": 0.0,
            "composite_score": 0.0,
            "quality": "unreadable",
        })

    return metrics

In [29]:
def process_all_pdfs(folder_path: str, output_readable_dir: str, output_unreadable_dir: str, threshold: float = 0.6, limit=None, export_text=False) -> pd.DataFrame:
    """
    Processes all PDFs in a folder using multithreading for efficiency.
    Returns a DataFrame containing metrics for all processed PDFs.
    """
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".pdf")]

    if limit:
        pdf_files = pdf_files[:limit]  # Apply the limit if specified

    metrics_list = []
    with ThreadPoolExecutor() as executor:
        results = executor.map(
            lambda pdf: process_pdf(pdf, output_readable_dir, output_unreadable_dir, threshold, export_text),
            pdf_files
        )
        metrics_list.extend(results)

    # Convert the metrics list to a DataFrame
    metrics_df = pd.DataFrame(metrics_list)
    return metrics_df

## Execute Functions on Folder

In [30]:
# Run the pipeline
# results_df = process_all_pdfs(folder_path=folder_path, output_readable_dir=output_readable_dir, output_unreadable_dir=output_unreadable_dir, limit=None, export_text=False)
# results_df = process_all_pdfs(folder_path=folder_path, output_readable_dir=output_readable_dir, output_unreadable_dir=output_unreadable_dir, limit=None, export_text=True)

results_df = process_all_pdfs(
    folder_path=folder_path,
    output_readable_dir=output_readable_dir,
    output_unreadable_dir=output_unreadable_dir,
    threshold=0.5,
    limit=None,
    export_text=True
)

results_df

2024-12-11 15:29:44,544 - Processing /Users/alexchen/Downloads/Projects/temp/Batch1/16_2008-03-03_Certificates of Incorporation.pdf...
2024-12-11 15:29:44,544 - Processing /Users/alexchen/Downloads/Projects/temp/Batch1/27_2009-05-15_Certificates of Incorporation.pdf...
2024-12-11 15:29:44,544 - Processing /Users/alexchen/Downloads/Projects/temp/Batch1/16_2009-01-20_Certificates of Incorporation.pdf...
2024-12-11 15:29:44,545 - Processing /Users/alexchen/Downloads/Projects/temp/Batch1/16_2004-07-14_Certificates of Incorporation.pdf...
2024-12-11 15:29:44,545 - Processing /Users/alexchen/Downloads/Projects/temp/Batch1/27_2002-09-23_Certificates of Incorporation.pdf...
2024-12-11 15:29:44,545 - Processing /Users/alexchen/Downloads/Projects/temp/Batch1/24_2016-04-04_Certificates of Incorporation.pdf...
2024-12-11 15:29:44,545 - Processing /Users/alexchen/Downloads/Projects/temp/Batch1/92_2010-02-23_Certificates of Incorporation.pdf...
2024-12-11 15:29:44,553 - Processing /Users/alexchen/Do

Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
0,16_2008-03-03_Certificates of Incorporation.pdf,2473,391,73.204728,0.747216,readable
1,27_2009-05-15_Certificates of Incorporation.pdf,4337,744,75.885735,0.769910,readable
2,16_2009-01-20_Certificates of Incorporation.pdf,55638,9112,83.692853,0.843774,readable
3,16_2004-07-14_Certificates of Incorporation.pdf,46327,7572,83.490633,0.841685,readable
4,27_2002-09-23_Certificates of Incorporation.pdf,2498,403,76.769806,0.775602,readable
...,...,...,...,...,...,...
87,59_2006-05-01_Certificates of Incorporation.pdf,48676,8121,82.979610,0.839631,readable
88,24_2009-06-12_Certificates of Incorporation.pdf,2157,343,69.393056,0.713157,readable
89,81_2011-12-22_Certificates of Incorporation.pdf,128487,21449,85.609514,0.863685,readable
90,16_2007-05-16_Certificates of Incorporation.pdf,53604,8777,83.400815,0.840875,readable


## Display and Analyze Metrics from PDF Reading

In [32]:
# change displays for pdf_results
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
#pd.reset_option('^display.', silent=True)

results_df

Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
0,16_2008-03-03_Certificates of Incorporation.pdf,2473,391,73.204728,0.747216,readable
1,27_2009-05-15_Certificates of Incorporation.pdf,4337,744,75.885735,0.76991,readable
2,16_2009-01-20_Certificates of Incorporation.pdf,55638,9112,83.692853,0.843774,readable
3,16_2004-07-14_Certificates of Incorporation.pdf,46327,7572,83.490633,0.841685,readable
4,27_2002-09-23_Certificates of Incorporation.pdf,2498,403,76.769806,0.775602,readable
5,24_2016-04-04_Certificates of Incorporation.pdf,107743,17803,84.482088,0.855347,readable
6,92_2010-02-23_Certificates of Incorporation.pdf,53431,8819,82.425599,0.83575,readable
7,92_2004-11-23_Certificates of Incorporation.pdf,46879,7765,82.874414,0.836309,readable
8,59_2007-08-15_Certificates of Incorporation.pdf,54151,8999,84.279008,0.849915,readable
9,28_2009-12-07_Certificates of Incorporation.pdf,2882,472,82.341748,0.798214,readable


In [33]:
# only include pdfs that pass the confidence threshold
filtered_pdf_results = results_df[results_df['quality'] == 'readable']
filtered_pdf_results.sort_values(by="composite_score", ascending= False)
#filtered_pdf_results.reset_index()

Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
24,49_2008-06-12_Certificates of Incorporation.pdf,109852,18145,86.084837,0.866518,readable
89,81_2011-12-22_Certificates of Incorporation.pdf,128487,21449,85.609514,0.863685,readable
74,24_2018-04-03_Certificates of Incorporation.pdf,112067,18495,85.499236,0.86289,readable
55,24_2014-08-27_Certificates of Incorporation.pdf,104258,17194,85.258637,0.861199,readable
78,81_2009-12-03_Certificates of Incorporation.pdf,102255,17058,85.186218,0.860408,readable
46,27_2010-10-10_Certificates of Incorporation.pdf,71376,11882,85.197592,0.858763,readable
43,27_2009-08-04_Certificates of Incorporation.pdf,66530,11046,85.158319,0.857352,readable
16,27_2013-09-26_Certificates of Incorporation.pdf,79271,13273,84.97987,0.856661,readable
85,28_2012-03-16_Certificates of Incorporation.pdf,49340,8061,85.193501,0.855528,readable
21,24_2011-05-05_Certificates of Incorporation.pdf,89124,14609,84.577498,0.855366,readable


In [34]:
excluded_pdf_results = results_df[results_df['quality'] == 'unreadable']
excluded_pdf_results.sort_values(by="composite_score", ascending=False)

Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
60,10_2006-09-13_Certificates of Incorporation.pdf,86,13,43.291667,0.441503,unreadable
14,27_2008-11-13_Certificates of Incorporation.pdf,553,113,27.992829,0.339313,unreadable


In [35]:
## calculate readability perecentage
batch1_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch1"
batch1_readable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
batch1_unreadable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_unreadable"

batch2_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2"
batch2_readable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2_text_readable"
batch2_unreadable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2_text_unreadable"

batch1_readable_files = len([f for f in os.listdir(batch1_readable_dir) if os.path.isfile(os.path.join(batch1_readable_dir, f))])
batch1_unreadable_files = len([f for f in os.listdir(batch1_unreadable_dir) if os.path.isfile(os.path.join(batch1_unreadable_dir, f))])
batch1_total = len([f for f in os.listdir(batch1_dir) if os.path.isfile(os.path.join(batch1_dir, f))])

batch2_readable_files = len([f for f in os.listdir(batch2_readable_dir) if os.path.isfile(os.path.join(batch2_readable_dir, f))])
batch2_unreadable_files = len([f for f in os.listdir(batch2_unreadable_dir) if os.path.isfile(os.path.join(batch2_unreadable_dir, f))])
batch2_total = len([f for f in os.listdir(batch2_dir) if os.path.isfile(os.path.join(batch2_dir, f))])

print(f"Number of readable files in Batch1: {batch1_readable_files}")
print(f"Number of unreadable files in Batch1: {batch1_unreadable_files}")
print(f"Number of total files in Bacth1: {batch1_total}")
print(f"Proportion of readable files among Batch1: {batch1_readable_files/batch1_total}")

print(f"Number of readable files in Batch2: {batch2_readable_files}")
print(f"Number of unreadable files in Batch2: {batch2_unreadable_files}")
print(f"Number of total files in Batch2: {batch2_total}")
print(f"Proportion of readable files among Batch2: {batch2_readable_files/batch2_total}")

print(f"Proportion of unreadable files among Batch1 and Batch2: {(batch1_unreadable_files + batch2_unreadable_files)/(batch1_total + batch2_total)}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/alexchen/Downloads/Projects/vc-research/Batch1'