#### This notebook takes an input directory containing scanned PDF files and converts them to .txt format.

## Conversion of PDF to text

In [1]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import os

def pdf_to_text(pdf_path):
    """Extract the text of a PDF by running OCR on each of its pages.

    The PDF is handed to ``convert_from_path`` to obtain one image per page,
    and every image is passed through ``pytesseract.image_to_string``.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        A single string in which each page contributes a
        ``--- Page N ---`` header line (N counted from 1) followed by that
        page's text with surrounding whitespace removed. Pages are separated
        by one newline. An empty string is returned when no page images are
        produced.
    """
    page_images = convert_from_path(pdf_path)
    # Number pages from 1 so the headers match what a human reader expects.
    page_sections = [
        f"--- Page {page_no} ---\n{pytesseract.image_to_string(page).strip()}"
        for page_no, page in enumerate(page_images, start=1)
    ]
    return "\n".join(page_sections)


In [2]:
def process_pdfs_in_directory(input_dir, output_dir):
    """Convert every PDF directly inside ``input_dir`` to a ``.txt`` file.

    Each entry of ``input_dir`` whose name ends in ``.pdf`` (case-insensitive)
    is passed to ``pdf_to_text`` and the result is written, UTF-8 encoded, to
    ``output_dir`` under the same base name with a ``.txt`` extension.
    Sub-directories are not walked. Progress and failures are reported with
    ``print``; a failure on one file does not stop the remaining files.

    Args:
        input_dir: Directory to look for PDF files in.
        output_dir: Directory that receives the text files; created if it
            does not exist yet.

    Returns:
        None.
    """
    # Create the destination up front; harmless when it is already there.
    os.makedirs(output_dir, exist_ok=True)

    for entry in os.listdir(input_dir):
        # Anything that is not named like a PDF is ignored.
        if not entry.lower().endswith(".pdf"):
            continue

        source_path = os.path.join(input_dir, entry)
        print(f"Processing: {source_path}")
        try:
            extracted = pdf_to_text(source_path)

            # Keep the original base name and swap the extension for ".txt".
            stem, _extension = os.path.splitext(entry)
            target_path = os.path.join(output_dir, f"{stem}.txt")

            with open(target_path, "w", encoding="utf-8") as out_file:
                out_file.write(extracted)
            print(f"Saved text to: {target_path}")
        except Exception as err:
            # Best effort: report the problem and carry on with the next file.
            print(f"Failed to process {entry}: {err}")

#### Paths of the input and output directories

In [3]:
# Folder holding the scanned PDFs, and the folder that receives the .txt files.
# Both are relative paths, so they resolve against the kernel's working directory.
input_directory = "../Data"
output_directory = "../Data/Review"

# Convert every PDF found directly inside the input folder.
process_pdfs_in_directory(input_dir=input_directory, output_dir=output_directory)

Processing: ../Data\press_release_ugc_net_june_2025.pdf
Saved text to: ../Data/Review\press_release_ugc_net_june_2025.txt
Processing: ../Data\public-notice-issuance-certificate-june-2025.pdf
Saved text to: ../Data/Review\public-notice-issuance-certificate-june-2025.txt
