In [1]:
import os
import pandas as pd
import chardet
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

endpoint = ""
key = ""

data_dir = "D:\ocr-ner\evaluation\handwritten dataset"

def extract_text_from_image(file_path):
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(file_path, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-read", document=f
        )
        result = poller.result()

    extracted_text = []
    for page in result.pages:
        for line in page.lines:
            extracted_text.append(line.content)
    
    return " ".join(extracted_text)

def read_text_file(txt_path):
    with open(txt_path, "rb") as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']

    with open(txt_path, "r", encoding=encoding) as f:
        return f.read().strip()

def create_dataframe():
    data = []
    
    for filename in os.listdir(data_dir):
        if filename.endswith(".png") or filename.endswith(".jpg"):  
            image_path = os.path.join(data_dir, filename)
            txt_path = os.path.splitext(image_path)[0] + ".txt"

            if not os.path.exists(txt_path):
                print(f"Warning: No matching text file for {image_path}")
                continue

            extracted_text = extract_text_from_image(image_path)

            ground_truth_text = read_text_file(txt_path)

            data.append({
                "Image Name": filename,
                "Extracted Text": extracted_text,
                "Text from TXT File": ground_truth_text
            })

    df = pd.DataFrame(data)
    return df

if __name__ == "__main__":
    dataframe = create_dataframe()
    output_csv_path = "extracted_texts.csv"
    dataframe.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"DataFrame saved to {output_csv_path}")


DataFrame saved to extracted_texts.csv


In [38]:
import pandas as pd
import string

def preprocess(text):
    return set(text.lower().translate(str.maketrans('', '', string.punctuation)).split())

def calculate_metrics(extracted_text, reference_text):
    extracted_set = preprocess(extracted_text)
    reference_set = preprocess(reference_text)
    
    true_positives = len(extracted_set & reference_set)
    false_positives = len(extracted_set - reference_set)
    false_negatives = len(reference_set - extracted_set)
    true_negatives = 0  

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    total_possible_positives = len(reference_set)
    accuracy = true_positives / total_possible_positives if total_possible_positives > 0 else 0

    return precision, recall, f1_score, accuracy

df = pd.read_csv('extracted_texts.csv')

df[['Precision', 'Recall', 'F1 Score', 'Accuracy']] = df.apply(
    lambda row: pd.Series(calculate_metrics(row['Extracted Text'], row['Text from TXT File'])), axis=1
)

average_precision = df['Precision'].mean()
average_recall = df['Recall'].mean()
average_f1_score = df['F1 Score'].mean()
average_accuracy = df['Accuracy'].mean()

print(df)

# Print the average metrics
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1_score}")
print(f"Average Accuracy: {average_accuracy}")


    Image Name                                     Extracted Text  \
0     0160.jpg  Aristotle has was born in Macedonia which now ...   
1     0161.jpg  Charles Bukowski was a drunk writer , who was ...   
2     0162.jpg  William Shakespeare was an actor, Playwright, ...   
3     0163.jpg  Charles Lamb was born on 1775. His father name...   
4     0164.jpg  Charles lamb was born in a poor family - He wa...   
..         ...                                                ...   
103   0280.jpg  The Physical state of water on Earth. Water on...   
104   0281.jpg  In addition to greenhouse gases, other manmade...   
105   0282.jpg  · Have you ever filled glass of water to the V...   
106   0283.jpg  Hydrogen Bondss- Due to water's Polarity, each...   
107   0284.jpg  A river forms from water moving from higher to...   

                                    Text from TXT File  Precision    Recall  \
0    Aristotle was born in Macedonia which\nhas now...   0.904762  0.982759   
1    Charles 

In [39]:
df.head()

Unnamed: 0,Image Name,Extracted Text,Text from TXT File,Precision,Recall,F1 Score,Accuracy
0,0160.jpg,Aristotle has was born in Macedonia which now ...,Aristotle was born in Macedonia which\nhas now...,0.904762,0.982759,0.942149,0.982759
1,0161.jpg,"Charles Bukowski was a drunk writer , who was ...","Charles Bukowuski was a #\nwriter , who was al...",0.935065,0.935065,0.935065,0.935065
2,0162.jpg,"William Shakespeare was an actor, Playwright, ...","William Shakespeare was an actor , playwright ...",0.849315,0.873239,0.861111,0.873239
3,0163.jpg,Charles Lamb was born on 1775. His father name...,charles Lamb was born on 1775 .\nHis father na...,0.912281,0.945455,0.928571,0.945455
4,0164.jpg,Charles lamb was born in a poor family - He wa...,Charles lamb was born in a\npoor family . He w...,0.954545,0.984375,0.969231,0.984375
