In [4]:
import pytesseract
from PIL import Image

# If Tesseract isn't in PATH, specify it:
# pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

img = Image.open("scanned_page_bw.png")
text = pytesseract.image_to_string(img)

with open("extracted_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("âœ… Text extracted and saved in extracted_text.txt")


âœ… Text extracted and saved in extracted_text.txt


In [3]:
import pytesseract
from PIL import Image
import re
from spellchecker import SpellChecker
from textblob import TextBlob

# If Tesseract isn't in PATH, specify it:
# pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

def fix_ocr_errors(text):
    """
    Fix common OCR errors using NLP techniques:
    1. Replace common OCR character mistakes (0 -> O, 1 -> I, etc.)
    2. Fix spelling errors
    3. Correct grammar
    4. Normalize capitalization
    """
    # Step 1: Fix common OCR character substitutions
    ocr_corrections = {
        r'\b0': 'O',  # 0 at word start -> O
        r'0\b': 'o',  # 0 at word end -> o
        r'1': 'l',    # 1 -> l (in some contexts)
        r'5': 'S',    # 5 -> S
        r'8': 'B',    # 8 -> B
    }
    
    corrected_text = text
    for pattern, replacement in ocr_corrections.items():
        corrected_text = re.sub(pattern, replacement, corrected_text)
    
    # Step 2: Spell checking and correction
    spell = SpellChecker()
    words = corrected_text.split()
    corrected_words = []
    
    for word in words:
        # Keep punctuation
        if word and word[-1] in '.,!?;:':
            punct = word[-1]
            clean_word = word[:-1]
            corrected = spell.correction(clean_word)
            if corrected:
                corrected_words.append(corrected + punct)
            else:
                corrected_words.append(word)
        else:
            corrected = spell.correction(word)
            if corrected:
                corrected_words.append(corrected)
            else:
                corrected_words.append(word)
    
    corrected_text = ' '.join(corrected_words)
    
    # Step 3: Grammar correction using TextBlob
    blob = TextBlob(corrected_text)
    corrected_text = str(blob.correct())
    
    # Step 4: Normalize capitalization (capitalize first letter of sentences)
    sentences = re.split(r'([.!?]\s+)', corrected_text)
    normalized_sentences = []
    
    for i, sentence in enumerate(sentences):
        if i % 2 == 0 and sentence:  # Actual sentence, not delimiter
            normalized_sentences.append(sentence[0].upper() + sentence[1:])
        else:
            normalized_sentences.append(sentence)
    
    corrected_text = ''.join(normalized_sentences)
    
    return corrected_text


# Read image and extract text
img = Image.open("scanned_page_bw.png")
raw_text = pytesseract.image_to_string(img)

print("ðŸ“„ Raw OCR Output:")
print(raw_text)
print("\n" + "="*50 + "\n")

# Apply NLP corrections
corrected_text = fix_ocr_errors(raw_text)

print("âœ¨ Corrected Text:")
print(corrected_text)

# Save both versions
with open("extracted_text_raw.txt", "w", encoding="utf-8") as f:
    f.write(raw_text)

with open("extracted_text_corrected.txt", "w", encoding="utf-8") as f:
    f.write(corrected_text)

print("\nâœ… Raw text saved in extracted_text_raw.txt")
print("âœ… Corrected text saved in extracted_text_corrected.txt")


ðŸ“„ Raw OCR Output:
BILSTM-GCN Based Stuttering Detection with Corrective
Feedback Leveraging SEP-28k.doc




âœ¨ Corrected Text:
BILSTM-GCN Based Stuttering Detection with Corrective Feedback Averaging SEP-Ask.do

âœ… Raw text saved in extracted_text_raw.txt
âœ… Corrected text saved in extracted_text_corrected.txt
