In [6]:
import argparse
import re
from collections import defaultdict
from typing import List
import numpy as np
import nltk
from nltk import edit_distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import json
from PyPDF2 import PdfReader  # Use PdfReader instead of PdfFileReader

# Download NLTK WordNet resource (if not already downloaded)
try:
    nltk.data.find('corpora/wordnet.zip')
except LookupError:
    print("Downloading NLTK WordNet resource...")
    nltk.download('wordnet')

def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using PyPDF2.
    """
    text = ""
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

def compute_metrics(pred, gt, minlen=4):
    metrics = {}
    if len(pred) < minlen or len(gt) < minlen:
        return metrics
    
    # Split predicted and ground truth text into words
    pred_words = set(pred.split())
    gt_words = set(gt.split())
    
    # Calculate true positives, false positives, true negatives, false negatives
    true_positives = len(pred_words.intersection(gt_words))
    false_positives = len(pred_words - gt_words)
    false_negatives = len(gt_words - pred_words)
    true_negatives = len(set(nltk.corpus.words.words()).difference(pred_words.union(gt_words)))
    
    # Calculate precision, recall, and F-measure
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f_measure = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    metrics["true_positives"] = true_positives
    metrics["false_positives"] = false_positives
    metrics["false_negatives"] = false_negatives
    metrics["true_negatives"] = true_negatives
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["f_measure"] = f_measure
    
    return metrics

def get_input():
    predicted_text = input("Enter path to predicted PDF file: ")
    ground_truth_text = input("Enter path to ground truth PDF file: ")
    
    # Extract text from PDF files
    with open(predicted_text, 'rb') as pred_pdf_file, open(ground_truth_text, 'rb') as gt_pdf_file:
        predicted_text = extract_text_from_pdf(pred_pdf_file)
        ground_truth_text = extract_text_from_pdf(gt_pdf_file)
    
    return predicted_text, ground_truth_text

if __name__ == "__main__":
    predicted_text, ground_truth_text = get_input()
    
    # Calculate metrics
    metrics = compute_metrics(predicted_text, ground_truth_text)
    
    # Display results
    print("Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    
    # Organize data into a JSON structure
    data = {
        "predictions": [predicted_text],
        "ground_truths": [ground_truth_text]
    }


Enter path to predicted PDF file: C:\Users\Acer\Desktop\test_pdf\yua_4.4.pdf
Enter path to ground truth PDF file: C:\Users\Acer\Desktop\test_pdf\yua.pdf


LookupError: 
**********************************************************************
  Resource [93mwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('words')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/words[0m

  Searched in:
    - 'C:\\Users\\Acer/nltk_data'
    - 'C:\\Users\\Acer\\anaconda3\\nltk_data'
    - 'C:\\Users\\Acer\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Acer\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Acer\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [7]:
import nltk

In [8]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [9]:
import argparse
import re
from collections import defaultdict
from typing import List
import numpy as np
import nltk
from nltk import edit_distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import json
from PyPDF2 import PdfReader  # Use PdfReader instead of PdfFileReader

# Download NLTK WordNet resource (if not already downloaded)
try:
    nltk.data.find('corpora/wordnet.zip')
except LookupError:
    print("Downloading NLTK WordNet resource...")
    nltk.download('wordnet')

def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using PyPDF2.
    """
    text = ""
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

def compute_metrics(pred, gt, minlen=4):
    metrics = {}
    if len(pred) < minlen or len(gt) < minlen:
        return metrics
    
    # Split predicted and ground truth text into words
    pred_words = set(pred.split())
    gt_words = set(gt.split())
    
    # Calculate true positives, false positives, true negatives, false negatives
    true_positives = len(pred_words.intersection(gt_words))
    false_positives = len(pred_words - gt_words)
    false_negatives = len(gt_words - pred_words)
    true_negatives = len(set(nltk.corpus.words.words()).difference(pred_words.union(gt_words)))
    
    # Calculate precision, recall, and F-measure
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f_measure = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    metrics["true_positives"] = true_positives
    metrics["false_positives"] = false_positives
    metrics["false_negatives"] = false_negatives
    metrics["true_negatives"] = true_negatives
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["f_measure"] = f_measure
    
    return metrics

def get_input():
    predicted_text = input("Enter path to predicted PDF file: ")
    ground_truth_text = input("Enter path to ground truth PDF file: ")
    
    # Extract text from PDF files
    with open(predicted_text, 'rb') as pred_pdf_file, open(ground_truth_text, 'rb') as gt_pdf_file:
        predicted_text = extract_text_from_pdf(pred_pdf_file)
        ground_truth_text = extract_text_from_pdf(gt_pdf_file)
    
    return predicted_text, ground_truth_text

if __name__ == "__main__":
    predicted_text, ground_truth_text = get_input()
    
    # Calculate metrics
    metrics = compute_metrics(predicted_text, ground_truth_text)
    
    # Display results
    print("Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    
    # Organize data into a JSON structure
    data = {
        "predictions": [predicted_text],
        "ground_truths": [ground_truth_text]
    }


Enter path to predicted PDF file: C:\Users\Acer\Desktop\test_pdf\yua_4.4.pdf
Enter path to ground truth PDF file: C:\Users\Acer\Desktop\test_pdf\yua.pdf
Metrics:
true_positives: 165
false_positives: 92
false_negatives: 281
true_negatives: 235705
precision: 0.642023346303502
recall: 0.36995515695067266
f_measure: 0.46941678520625896
