In [72]:
import os 
import pandas as pd
import pytesseract
from PIL import Image
from easyocr import Reader
import time

In [73]:
# Import and preprocess data
# Turn every annotation row into its own UTF-8 text file under data/labels.

# Make sure the target directory exists before anything is written to it
output_dir = os.path.join('data', 'labels')
os.makedirs(output_dir, exist_ok=True)

df = pd.read_csv('data/annotations.csv')

# Columns are addressed by position: 0 -> image name, 1 -> annotated text
for record in df.itertuples(index=False):
    name = record[0]
    raw_label = record[1]

    # Drop every double quote, then trim surrounding whitespace
    label = raw_label.replace('"', '').strip()

    # One label file per row, named after the image
    label_path = os.path.join(output_dir, f"{name}.txt")
    with open(label_path, 'w', encoding='utf-8') as label_file:
        label_file.write(label)

print(f"Complete! Processed {len(df)} files.")

Complete! Processed 30 files.


In [74]:
# Text Detection
image_path = "data/images/12_test.jpg"
image = Image.open(image_path)

In [75]:
# Tesseract
# Run Tesseract with the English language pack on the opened sample image and
# print the string it returns unchanged (line breaks included).
text_tesseract = pytesseract.image_to_string(image, lang="eng")
print(text_tesseract)

Everyone has three lives:
a public life, a private life,

and a secret life.




In [76]:
# EasyOCR
# Build an English reader and run it on the sample image, addressed by path.
reader = Reader(['en'])
results = reader.readtext(image_path)
# Keep only element 1 of every detection, which holds the recognised string
text_ocr = [detection[1] for detection in results]
print(text_ocr)

['Everyone has three lives:', 'life_', 'private life,', 'and', 'secret lile.', "'public"]


In [77]:
def calculate_similarity(str1: str, str2: str) -> float:
    """Return the Jaccard index of the lowercase character sets of two strings.

    Both strings are lowercased and reduced to the set of distinct characters
    they contain, so character order and repetition do not influence the
    score.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Number of shared distinct characters divided by the number of distinct
        characters found in either string, or 0.0 when both strings are empty.
    """
    chars_a = set(str1.lower())
    chars_b = set(str2.lower())

    combined = chars_a | chars_b
    # Two empty strings have nothing to compare; avoid dividing by zero
    if not combined:
        return 0.0

    shared = chars_a & chars_b
    return len(shared) / len(combined)

In [78]:
# Benchmark both OCR engines over every image in data/images
def main(verbose: bool = False):
    """Compare Tesseract and EasyOCR on every file in ``data/images``.

    For each image the matching label ``data/labels/<stem>.txt`` is read as
    the ground truth, where ``<stem>`` is the file name up to its first dot.
    Both engines are run and timed, and each output is scored against the
    ground truth with ``calculate_similarity``. The average score and average
    run time per engine are printed at the end.

    Args:
        verbose: When True, also print the ground truth, both OCR outputs and
            both scores for every image. Defaults to False.

    Returns:
        None. Results are reported via ``print``.

    Raises:
        FileNotFoundError: If an entry in ``data/images`` has no matching
            label file.
    """
    # The import cell only binds `Reader` (`from easyocr import Reader`); the
    # module name `easyocr` itself is not defined in a fresh kernel.
    reader = Reader(['en'])
    tesseract_scores = []
    easyocr_scores = []
    tesseract_times = []
    easyocr_times = []

    for image_name in os.listdir('data/images'):
        image_path = os.path.join('data/images', image_name)
        label_path = os.path.join('data/labels', f"{image_name.split('.')[0]}.txt")

        # Labels are written as UTF-8 by the preprocessing cell, so read them
        # back with the same encoding; `with` closes the handle promptly.
        with open(label_path, 'r', encoding='utf-8') as label_file:
            ground_truth = label_file.read()

        # Tesseract (works on the opened PIL image)
        with Image.open(image_path) as image:
            start_time = time.perf_counter()
            text_tesseract = pytesseract.image_to_string(image, lang="eng")
            tesseract_times.append(time.perf_counter() - start_time)
        text_tesseract = text_tesseract.replace('\n', ' ')

        # EasyOCR (reads the image from its path)
        start_time = time.perf_counter()
        results = reader.readtext(image_path)
        easyocr_times.append(time.perf_counter() - start_time)
        # Element 1 of every detection holds the recognised string
        text_ocr = ' '.join(detection[1] for detection in results)

        # Calculate similarities
        tesseract_similarity = calculate_similarity(ground_truth, text_tesseract)
        easyocr_similarity = calculate_similarity(ground_truth, text_ocr)

        tesseract_scores.append(tesseract_similarity)
        easyocr_scores.append(easyocr_similarity)

        if verbose:
            print(f"Image: {image_name}")
            print(f"Ground Truth: {ground_truth}")
            print(f"Tesseract: {text_tesseract}")
            print(f"EasyOCR: {text_ocr}")
            print(f"Tesseract Similarity: {tesseract_similarity:.3f}")
            print(f"EasyOCR Similarity: {easyocr_similarity:.3f}")
            print("-" * 50)

    # An empty folder would otherwise end in a ZeroDivisionError below
    if not tesseract_scores:
        print("No images found in data/images; nothing to evaluate.")
        return

    image_count = len(tesseract_scores)
    tesseract_avg_score = sum(tesseract_scores) / image_count
    easyocr_avg_score = sum(easyocr_scores) / image_count
    tesseract_avg_time = sum(tesseract_times) / image_count
    easyocr_avg_time = sum(easyocr_times) / image_count

    print(f"\nAverage Tesseract Similarity: {tesseract_avg_score:.3f}")
    print(f"Average EasyOCR Similarity: {easyocr_avg_score:.3f}")

    print(f"Tesseract - Time: {tesseract_avg_time:.2f}s, Similarity: {tesseract_avg_score:.3f}")
    print(f"EasyOCR - Time: {easyocr_avg_time:.2f}s, Similarity: {easyocr_avg_score:.3f}")

    
# Run the comparison; it prints the average similarity and average time per engine.
main()
    


Average Tesseract Similarity: 0.887
Average EasyOCR Similarity: 0.909
Tesseract - Time: 0.15s, Similarity: 0.887
EasyOCR - Time: 0.18s, Similarity: 0.909
