In [1]:
import os

# Tell Tesseract where its language-data files live.
# The fallback is the Homebrew location on Apple-silicon macOS. A value that is
# already present in the environment is kept, so the notebook also runs on
# machines where tessdata is installed somewhere else.
if not os.environ.get('TESSDATA_PREFIX'):
    os.environ['TESSDATA_PREFIX'] = '/opt/homebrew/share/tessdata'

In [2]:
# âœ… Install dependencies (run once per environment)
%pip install pytesseract opencv-python pdf2image Pillow

# --- Imports ---
import pytesseract
from pdf2image import convert_from_path
import cv2
from PIL import Image
import os
from tqdm import tqdm  # optional, for progress bar

# --- Paths ---
pdf_path = "../PDFs/tbmm17004078.pdf"    # input PDF, relative to the notebook's working directory
output_dir = "outputocr"                 # folder that receives page images and text files
os.makedirs(output_dir, exist_ok=True)

# Fail early with a readable message rather than an error raised from inside
# the PDF conversion backend when the relative path does not resolve.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"Input PDF not found: {pdf_path}")

# --- Convert PDF pages to high-resolution images ---
# Every page is rendered up front at 300 dpi and the whole list is kept in
# memory, so memory use grows with the page count of the document.
print("Converting PDF to images...")
pages = convert_from_path(pdf_path, dpi=300)

# --- OCR Loop ---
def _preprocess_for_ocr(bgr_image):
    """Return a cleaned single-channel version of a page image for Tesseract.

    Steps, in order: grayscale conversion, 3x3 Gaussian blur, adaptive
    Gaussian thresholding (block size 31, constant 2), and non-local-means
    denoising (h=30, template window 7, search window 21).
    """
    cleaned = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)  # grayscale
    cleaned = cv2.GaussianBlur(cleaned, (3, 3), 0)         # smooth noise
    cleaned = cv2.adaptiveThreshold(
        cleaned, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31, 2
    )                                                      # binarize & enhance contrast
    # NOTE(review): denoising runs after binarization and with a strong filter
    # strength; confirm this order and strength are intended for these scans.
    return cv2.fastNlMeansDenoising(cleaned, None, 30, 7, 21)


# Per-page chunks are collected in a list and joined once after the loop,
# instead of repeatedly re-building one ever-growing string.
page_texts = []

for i, page in enumerate(tqdm(pages, desc="Processing pages")):
    page_number = i + 1  # files and headers are numbered from 1

    # Save the rendered page, then read it back as an OpenCV array.
    image_path = os.path.join(output_dir, f"page_{page_number}.png")
    page.save(image_path, "PNG")

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise RuntimeError(f"Could not read page image: {image_path}")

    # Preprocessing steps for old documents
    gray = _preprocess_for_ocr(img)

    # --- Save preprocessed image (optional, for debugging) ---
    preprocessed_path = os.path.join(output_dir, f"page_{page_number}_clean.png")
    cv2.imwrite(preprocessed_path, gray)

    # --- Run Tesseract OCR ---
    # lang="tur": Turkish model; --oem 1: LSTM engine; --psm 6: treat the page
    # as a single uniform block of text.
    text = pytesseract.image_to_string(
        gray,
        lang="tur",
        config="--oem 1 --psm 6"
    )

    # --- Save individual text file ---
    txt_path = os.path.join(output_dir, f"page_{page_number}.txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)

    # --- Add to combined output ---
    page_texts.append(f"\n\n===== PAGE {page_number} =====\n\n" + text)

# Full document text, each page preceded by its "===== PAGE n =====" header.
all_text = "".join(page_texts)

# --- Save combined text file ---
# Write the concatenated text of all pages to a single UTF-8 file next to the
# per-page outputs.
combined_path = os.path.join(output_dir, "combined_output.txt")
with open(combined_path, "w", encoding="utf-8") as f:
    f.write(all_text)

# Status messages; the leading character is a check mark (previously stored
# as mis-encoded bytes).
print("✅ OCR complete! All pages processed.")
print(f"Combined text file saved at: {combined_path}")




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Converting PDF to images...


Processing pages: 100%|██████████| 52/52 [11:23<00:00, 13.14s/it]

✅ OCR complete! All pages processed.
Combined text file saved at: outputocr/combined_output.txt



