In [None]:
import pandas as pd

# Ground truth: labels transcribed by hand, one row per image file.
manual_results = pd.read_csv(filepath_or_buffer='manual_results.csv')

# OCR outputs to be scored against the ground truth above.
google_ocr_results = pd.read_csv(filepath_or_buffer='google_ocr_results.csv')
pytesseract_results = pd.read_csv(filepath_or_buffer='pytesseract_ocr_results.csv')

In [None]:
def calculate_characters_accuracy(correct_label, detected_label):
    """Return the fraction of characters in ``correct_label`` that OCR got right.

    Characters are compared position by position: a character counts as
    correct when ``detected_label`` holds the same character at the same
    index. Extra trailing characters in ``detected_label`` are ignored, so
    the score is always relative to the length of ``correct_label``.

    Args:
        correct_label: Ground-truth label text.
        detected_label: Label text produced by OCR. ``None`` or a float NaN
            (what pandas yields for an empty CSV cell) is treated as an
            empty string instead of raising ``TypeError``.

    Returns:
        A float in ``[0.0, 1.0]``. When ``correct_label`` is empty the result
        is ``1.0`` if nothing was detected either and ``0.0`` otherwise
        (previously this case raised ``ZeroDivisionError``).
    """
    def _as_text(label):
        # NaN is the only float that is not equal to itself.
        if label is None or (isinstance(label, float) and label != label):
            return ''
        return str(label)

    expected = _as_text(correct_label)
    detected = _as_text(detected_label)

    if not expected:
        return 1.0 if not detected else 0.0

    # zip stops at the shorter string, which matches the original
    # "index must exist in detected_label" bound check.
    matching_chars = sum(1 for want, got in zip(expected, detected) if want == got)
    return matching_chars / len(expected)


def calculate_accuracy(ocr_results, ocr_method):
    """Print label-level and character-level accuracy for one OCR engine.

    Every row of the module-level ``manual_results`` frame is matched to the
    row of ``ocr_results`` with the same ``filename``, and the ``left_label``
    and ``right_label`` columns of the two rows are compared.

    A file with no matching row in ``ocr_results``, or whose detected label
    is missing (NaN), is reported as misidentified and contributes zero to
    the character-level score instead of raising.

    Args:
        ocr_results: DataFrame with ``filename``, ``left_label`` and
            ``right_label`` columns holding the OCR output.
        ocr_method: Display name of the OCR engine used in the report.

    Returns:
        None. The report is written to stdout.
    """
    total_files = len(manual_results)
    if total_files == 0:
        # Nothing to score; the percentages below would divide by zero.
        print(f"{ocr_method} OCR: no manually labelled files to evaluate.")
        return

    sides = ('left', 'right')
    misidentified_by_side = {side: [] for side in sides}
    correct_labels_count = {side: 0 for side in sides}
    chars_accuracy_sum = {side: 0.0 for side in sides}

    for _, row in manual_results.iterrows():
        matches = ocr_results.loc[ocr_results['filename'] == row['filename']]
        # The first match wins when a filename occurs more than once.
        ocr_row = None if matches.empty else matches.iloc[0]

        for side in sides:
            column = f'{side}_label'
            expected = row[column]
            detected = None if ocr_row is None else ocr_row[column]

            if detected is None or pd.isna(detected):
                # No OCR output for this label: a miss with no
                # character-level credit.
                misidentified_by_side[side].append(
                    {'filename': row['filename'], 'expected': expected, 'actual': detected}
                )
                continue

            if expected == detected:
                correct_labels_count[side] += 1
            else:
                misidentified_by_side[side].append(
                    {'filename': row['filename'], 'expected': expected, 'actual': detected}
                )

            chars_accuracy_sum[side] += calculate_characters_accuracy(expected, detected)

    left_accuracy = (correct_labels_count['left'] / total_files) * 100
    right_accuracy = (correct_labels_count['right'] / total_files) * 100
    left_chars_accuracy_avg = (chars_accuracy_sum['left'] / total_files) * 100
    right_chars_accuracy_avg = (chars_accuracy_sum['right'] / total_files) * 100

    print(f"{ocr_method} OCR has {left_accuracy:.2f}% accuracy on left label and {right_accuracy:.2f}% accuracy for right label.")
    print(f"{ocr_method} OCR has {left_chars_accuracy_avg:.2f}% character-level accuracy on left label and {right_chars_accuracy_avg:.2f}% character-level accuracy for right label.\n")

    for heading, side in (("Left labels misidentified:", 'left'), ("Right labels misidentified:", 'right')):
        print(heading)
        for misidentified in misidentified_by_side[side]:
            print(f"Filename: {misidentified['filename']}\nExpected value: {misidentified['expected']}\nActual value: {misidentified['actual']}\n")

### Google OCR evaluation

In [None]:
# Score the Google OCR output against the manual labels and print the report.
calculate_accuracy(ocr_results=google_ocr_results, ocr_method="Google")

### Tesseract OCR evaluation

In [None]:
# Score the Pytesseract OCR output against the manual labels and print the report.
calculate_accuracy(ocr_results=pytesseract_results, ocr_method="Pytesseract")