In [9]:
import os
import Levenshtein
import re

# Function to read the contents of a file
def read_file(file_path):
    """Return the entire text of the file at ``file_path``.

    The file is opened in text mode and decoded as UTF-8; it is closed
    before the function returns.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Compare every OCR output file in a folder against one ground truth file
# and print Levenshtein-based metrics for each of them.

# Path to the ground truth file and OCR output folder
ground_truth_path = 'truth.txt'  # Path to your reference ground truth file
ocr_output_folder = 'output'  # Path to your folder containing OCR output files

# The ground truth is the same for every comparison, so read it once.
ground_truth_content = read_file(ground_truth_path)

# Collect the '.txt' files in the OCR output folder. os.listdir() returns
# entries in arbitrary order, so sort them to make the report reproducible.
ocr_output_files = sorted(
    f for f in os.listdir(ocr_output_folder) if f.endswith('.txt')
)

# Process each OCR output file
for ocr_output_file in ocr_output_files:
    ocr_output_path = os.path.join(ocr_output_folder, ocr_output_file)
    ocr_output_content = read_file(ocr_output_path)

    # Edit distance between the ground truth text and the OCR output text
    levenshtein_distance = Levenshtein.distance(ground_truth_content, ocr_output_content)

    # Normalise by the longer of the two texts so the ratio stays in [0, 1].
    max_length = max(len(ground_truth_content), len(ocr_output_content))

    # Error rate = Levenshtein distance / max_length. When both texts are
    # empty there is nothing to get wrong, so report a zero error rate
    # instead of dividing by zero.
    error_rate = levenshtein_distance / max_length if max_length else 0.0

    # Accuracy is the complement of the error rate; the similarity
    # percentage is the same value expressed on a 0-100 scale.
    accuracy = 1 - error_rate
    similarity_percentage = accuracy * 100

    # Print the evaluation metrics for the current OCR output file
    print(f'File: {ocr_output_file}')
    print(f'  Levenshtein Distance: {levenshtein_distance}')
    print(f'  Accuracy: {accuracy:.2f}')
    print(f'  Similarity Percentage: {similarity_percentage:.2f}%')
    print(f'  Error Rate: {error_rate:.2f}')
    print('-' * 40)


File: 1.txt
  Levenshtein Distance: 219
  Accuracy: 0.83
  Similarity Percentage: 82.93%
  Error Rate: 0.17
----------------------------------------
File: 10.txt
  Levenshtein Distance: 35
  Accuracy: 0.97
  Similarity Percentage: 97.28%
  Error Rate: 0.03
----------------------------------------
File: 11.txt
  Levenshtein Distance: 24
  Accuracy: 0.98
  Similarity Percentage: 98.12%
  Error Rate: 0.02
----------------------------------------
File: 2.txt
  Levenshtein Distance: 112
  Accuracy: 0.91
  Similarity Percentage: 91.41%
  Error Rate: 0.09
----------------------------------------
File: 3.txt
  Levenshtein Distance: 205
  Accuracy: 0.84
  Similarity Percentage: 84.05%
  Error Rate: 0.16
----------------------------------------
File: 4.txt
  Levenshtein Distance: 486
  Accuracy: 0.63
  Similarity Percentage: 62.64%
  Error Rate: 0.37
----------------------------------------
File: 5.txt
  Levenshtein Distance: 35
  Accuracy: 0.97
  Similarity Percentage: 97.26%
  Error Rate: 0.03