# PDF OCR Text Conversion

## Set-up

In [1]:
# import libraries and packages
import pytesseract
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import logging
from nltk.corpus import words

In [2]:
# Setup
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
english_vocab = set(words.words())

In [3]:
# Paths
folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch2"
output_readable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2_text_readable"
output_unreadable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2_text_unreadable"

# Create directories
os.makedirs(output_readable_dir, exist_ok=True)
os.makedirs(output_unreadable_dir, exist_ok=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

## Preprocessing Helper Function

In [4]:
# Preprocess pdfs for better OCR results
def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Preprocess an image for better OCR results.
    Enhances contrast and converts the image to grayscale.
    """
    image = image.convert("L")  # Convert to grayscale
    return image

## Extracting text from PDF Helper Functions

In [5]:
def get_tesseract_confidence(image: Image.Image) -> float:
    """
    Returns the average confidence score from Tesseract OCR for the given image.
    """
    # Get the OCR output with word-level data
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    confidences = [int(conf) for conf in ocr_data['conf'] if conf != '-1']
    
    # Return the average confidence (or 0 if no valid confidences found)
    if confidences:
        return sum(confidences) / len(confidences)
    else:
        return 0.0

In [6]:
# Helper function to extract text from a PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF using pytesseract and pdf2image.
    Includes an autocorrection step to fix common OCR mistakes.
    """
    try:
        images = convert_from_path(pdf_path, dpi=300)
        text = ""
        for image in images:
            processed_image = preprocess_image(image)
            text += pytesseract.image_to_string(processed_image, lang="eng")
        
        return text
    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        return ""

## Readability Classificaation Helper Functions

In [7]:
def is_readable_text(image: Image.Image, text: str, threshold: float = 0.6) -> bool:
    """
    Determines if the extracted text is readable by combining Tesseract confidence
    and the percentage of valid English words (without autocorrection).
    """
    # Get Tesseract confidence score
    tesseract_confidence = get_tesseract_confidence(image)

    # Calculate percentage of valid English words
    words_in_text = re.findall(r'\b\w+\b', text.lower())
    if not words_in_text:
        return False
    valid_words = [word for word in words_in_text if word in english_vocab]
    valid_percentage = len(valid_words) / len(words_in_text)

    # Composite score: weighted sum of confidence and valid word percentage
    composite_score = (0.7 * tesseract_confidence / 100) + (0.3 * valid_percentage)

    # Classify based on composite score
    return composite_score >= threshold

## Execute PDF Processing Helper Functions

In [8]:
metrics_list = []

def process_pdf(pdf_path: str, output_readable_dir: str, output_unreadable_dir: str, threshold: float = 0.6, export_text=False) -> dict:
    """
    Processes a single PDF, calculates metrics, and saves the text to the appropriate directory.
    Returns metrics for the processed PDF.
    """
    logging.info(f"Processing {pdf_path}...")
    file_name = os.path.basename(pdf_path)
    metrics = {"file_name": file_name}

    try:
        # Extract text and images
        images = convert_from_path(pdf_path, dpi=300)
        text = ""
        confidences = []

        for image in images:
            processed_image = preprocess_image(image)
            text += pytesseract.image_to_string(processed_image, lang="eng")
            confidences.append(get_tesseract_confidence(processed_image))

        # Calculate metrics
        num_chars = len(text)
        num_words = len(re.findall(r'\b\w+\b', text))
        avg_confidence = np.mean(confidences) if confidences else 0.0

        # Determine readability
        words_in_text = re.findall(r'\b\w+\b', text.lower())
        valid_words = [word for word in words_in_text if word in english_vocab]
        valid_percentage = len(valid_words) / len(words_in_text) if words_in_text else 0.0

        composite_score = (0.7 * avg_confidence / 100) + (0.3 * valid_percentage)
        quality = "readable" if composite_score >= threshold else "unreadable"

        # Save text to the appropriate directory
        output_dir = output_readable_dir if quality == "readable" else output_unreadable_dir
        output_path = os.path.join(output_dir, os.path.splitext(file_name)[0] + ".txt")
        if export_text:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
            logging.info(f"Saved text to {output_path}")

        # Populate metrics dictionary
        metrics.update({
            "number_of_characters": num_chars,
            "number_of_words": num_words,
            "confidence_level": avg_confidence,
            "composite_score": composite_score,
            "quality": quality,
        })

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        metrics.update({
            "number_of_characters": 0,
            "number_of_words": 0,
            "confidence_level": 0.0,
            "composite_score": 0.0,
            "quality": "unreadable",
        })

    return metrics

In [9]:
def process_all_pdfs(folder_path: str, output_readable_dir: str, output_unreadable_dir: str, threshold: float = 0.6, limit=None, export_text=False) -> pd.DataFrame:
    """
    Processes all PDFs in a folder using multithreading for efficiency.
    Returns a DataFrame containing metrics for all processed PDFs.
    """
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".pdf")]

    if limit:
        pdf_files = pdf_files[:limit]  # Apply the limit if specified

    metrics_list = []
    with ThreadPoolExecutor() as executor:
        results = executor.map(
            lambda pdf: process_pdf(pdf, output_readable_dir, output_unreadable_dir, threshold, export_text),
            pdf_files
        )
        metrics_list.extend(results)

    # Convert the metrics list to a DataFrame
    metrics_df = pd.DataFrame(metrics_list)
    return metrics_df

## Execute Functions on Folder

In [10]:
# Run the pipeline
# results_df = process_all_pdfs(folder_path=folder_path, output_readable_dir=output_readable_dir, output_unreadable_dir=output_unreadable_dir, limit=None, export_text=False)
# results_df = process_all_pdfs(folder_path=folder_path, output_readable_dir=output_readable_dir, output_unreadable_dir=output_unreadable_dir, limit=None, export_text=True)

results_df = process_all_pdfs(
    folder_path=folder_path,
    output_readable_dir=output_readable_dir,
    output_unreadable_dir=output_unreadable_dir,
    threshold=0.5,
    limit=None,
    export_text=True
)

results_df

2024-12-09 13:41:20,249 - Processing /Users/alexchen/Downloads/Projects/vc-research/Batch2/560_2000-03-13_Certificates of Incorporation.pdf...
2024-12-09 13:41:20,249 - Processing /Users/alexchen/Downloads/Projects/vc-research/Batch2/957_2011-08-25_Certificates of Incorporation.pdf...
2024-12-09 13:41:20,249 - Processing /Users/alexchen/Downloads/Projects/vc-research/Batch2/832_2014-09-29_Certificates of Incorporation.pdf...
2024-12-09 13:41:20,250 - Processing /Users/alexchen/Downloads/Projects/vc-research/Batch2/155_2011-05-13_Certificates of Incorporation.pdf...
2024-12-09 13:41:20,250 - Processing /Users/alexchen/Downloads/Projects/vc-research/Batch2/618_2011-04-15_Certificates of Incorporation.pdf...
2024-12-09 13:41:20,250 - Processing /Users/alexchen/Downloads/Projects/vc-research/Batch2/237_2009-03-18_Certificates of Incorporation.pdf...
2024-12-09 13:41:20,250 - Processing /Users/alexchen/Downloads/Projects/vc-research/Batch2/832_2014-06-30_Certificates of Incorporation.pdf...

Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
0,560_2000-03-13_Certificates of Incorporation.pdf,2341,393,73.447797,0.751539,readable
1,957_2011-08-25_Certificates of Incorporation.pdf,63218,10468,84.503034,0.853061,readable
2,832_2014-09-29_Certificates of Incorporation.pdf,6392,1060,69.675305,0.724614,readable
3,155_2011-05-13_Certificates of Incorporation.pdf,4467,720,82.733838,0.817887,readable
4,618_2011-04-15_Certificates of Incorporation.pdf,79890,13213,84.138942,0.844675,readable
...,...,...,...,...,...,...
945,861_2006-06-14_Certificates of Incorporation.pdf,830,142,62.854922,0.611111,readable
946,825_2005-12-02_Certificates of Incorporation.pdf,1127,181,59.488000,0.592107,readable
947,900_2007-08-17_Certificates of Incorporation.pdf,73643,12130,82.518521,0.834027,readable
948,317_2007-04-05_Certificates of Incorporation.pdf,55651,9078,79.144994,0.804577,readable


## Display and Analyze Metrics from PDF Reading

In [15]:
# change displays for pdf_results
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
#pd.reset_option('^display.', silent=True)

results_df

Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
0,560_2000-03-13_Certificates of Incorporation.pdf,2341,393,73.447797,0.751539,readable
1,957_2011-08-25_Certificates of Incorporation.pdf,63218,10468,84.503034,0.853061,readable
2,832_2014-09-29_Certificates of Incorporation.pdf,6392,1060,69.675305,0.724614,readable
3,155_2011-05-13_Certificates of Incorporation.pdf,4467,720,82.733838,0.817887,readable
4,618_2011-04-15_Certificates of Incorporation.pdf,79890,13213,84.138942,0.844675,readable
5,237_2009-03-18_Certificates of Incorporation.pdf,53048,8719,84.58771,0.848795,readable
6,832_2014-06-30_Certificates of Incorporation.pdf,6370,1054,69.747328,0.723336,readable
7,587_2007-06-20_Certificates of Incorporation.pdf,3236,526,77.928907,0.789609,readable
8,903_2009-11-13_Certificates of Incorporation.pdf,56750,9272,83.934669,0.845028,readable
9,135_2006-11-21_Certificates of Incorporation.pdf,54438,9095,84.432836,0.846467,readable


In [12]:
# only include pdfs that pass the confidence threshold
filtered_pdf_results = results_df[results_df['quality'] == 'readable']
filtered_pdf_results.sort_values(by="composite_score", ascending= False)
#filtered_pdf_results.reset_index()


Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
87,359_2004-02-12_Certificates of Incorporation.pdf,62503,10374,87.415175,0.874659,readable
439,786_2012-04-24_Certificates of Incorporation.pdf,60339,10142,87.300773,0.868303,readable
112,244_2010-03-29_Certificates of Incorporation.pdf,94941,15960,85.504882,0.865188,readable
113,952_2007-06-12_Certificates of Incorporation.pdf,68428,11357,85.661716,0.865028,readable
476,786_2011-05-24_Certificates of Incorporation.pdf,56791,9503,86.471038,0.864921,readable
592,376_2007-02-06_Certificates of Incorporation.pdf,154059,25370,85.569665,0.864873,readable
389,303_2010-06-23_Certificates of Incorporation.pdf,97472,16131,86.125483,0.864865,readable
451,192_2007-02-23_Certificates of Incorporation.pdf,81389,13526,85.958595,0.864382,readable
668,939_2012-12-07_Certificates of Incorporation.pdf,50613,8324,86.615644,0.863962,readable
591,587_2012-11-15_Certificates of Incorporation.pdf,65946,11051,86.889831,0.862323,readable


In [13]:
excluded_pdf_results = results_df[results_df['quality'] == 'unreadable']
excluded_pdf_results.sort_values(by="composite_score", ascending=False)

Unnamed: 0,file_name,number_of_characters,number_of_words,confidence_level,composite_score,quality
449,304_2007-08-17_Certificates of Incorporation.pdf,1456,248,40.462644,0.433239,unreadable
327,693_2006-03-01_Certificates of Incorporation.pdf,52872,9201,36.018504,0.322361,unreadable


In [18]:
## calculate readability perecentage
batch1_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch1"
batch1_readable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"
batch1_unreadable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_unreadable"

batch2_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2"
batch2_readable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2_text_readable"
batch2_unreadable_dir = "/Users/alexchen/Downloads/Projects/vc-research/Batch2_text_unreadable"

batch1_readable_files = len([f for f in os.listdir(batch1_readable_dir) if os.path.isfile(os.path.join(batch1_readable_dir, f))])
batch1_unreadable_files = len([f for f in os.listdir(batch1_unreadable_dir) if os.path.isfile(os.path.join(batch1_unreadable_dir, f))])
batch1_total = len([f for f in os.listdir(batch1_dir) if os.path.isfile(os.path.join(batch1_dir, f))])

batch2_readable_files = len([f for f in os.listdir(batch2_readable_dir) if os.path.isfile(os.path.join(batch2_readable_dir, f))])
batch2_unreadable_files = len([f for f in os.listdir(batch2_unreadable_dir) if os.path.isfile(os.path.join(batch2_unreadable_dir, f))])
batch2_total = len([f for f in os.listdir(batch2_dir) if os.path.isfile(os.path.join(batch2_dir, f))])

print(f"Number of readable files in Batch1: {batch1_readable_files}")
print(f"Number of unreadable files in Batch1: {batch1_unreadable_files}")
print(f"Number of total files in Bacth1: {batch1_total}")
print(f"Proportion of readable files among Batch1: {batch1_readable_files/batch1_total}")

print(f"Number of readable files in Batch2: {batch2_readable_files}")
print(f"Number of unreadable files in Batch2: {batch2_unreadable_files}")
print(f"Number of total files in Batch2: {batch2_total}")
print(f"Proportion of readable files among Batch2: {batch2_readable_files/batch2_total}")

print(f"Proportion of unreadable files among Batch1 and Batch2: {(batch1_unreadable_files + batch2_unreadable_files)/(batch1_total + batch2_total)}")

Number of readable files in Batch1: 90
Number of unreadable files in Batch1: 2
Number of total files in Bacth1: 93
Proportion of readable files among Batch1: 0.967741935483871
Number of readable files in Batch2: 948
Number of unreadable files in Batch2: 2
Number of total files in Batch2: 950
Proportion of readable files among Batch2: 0.9978947368421053
Proportion of unreadable files among Batch1 and Batch2: 0.003835091083413231
