<a href="https://colab.research.google.com/github/balajimurugesan2016/Algorithms/blob/master/AIQuestions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install required packages (run these in separate cells if using Jupyter)
# !pip install easyocr
# !pip install pdf2image
# !apt-get install poppler-utils

# Import necessary libraries
import easyocr
from pdf2image import convert_from_path
import numpy as np
import sys

# Define the path to your PDF file
pdf_path = '/content/GS English Set-B.pdf'  # Replace with your PDF file path

# Convert PDF to images (one image per page).
# first_page=2 skips the first page, so pages[0] holds page 2 of the PDF.
try:
    # Probe the file before handing it to pdf2image. pdf2image passes the path
    # to poppler, and a missing file is then reported through pdf2image's own
    # exception types rather than FileNotFoundError (NOTE(review): confirm
    # against the installed pdf2image version). Opening the file here makes a
    # bad path reach the dedicated handler below instead of the generic one.
    with open(pdf_path, "rb"):
        pass

    print("Converting PDF to images...")
    pages = convert_from_path(pdf_path, first_page=2)
    print(f"Successfully converted {len(pages)} pages")
except FileNotFoundError:
    print(f"Error: PDF file not found at {pdf_path}")
    print("Please check the file path and make sure the file exists.")
    sys.exit(1)
except Exception as e:
    print(f"Error converting PDF to images: {e}")
    print("Possible solutions:")
    print("1. Make sure poppler-utils is installed: !apt-get install poppler-utils")
    print("2. Check if the PDF file exists and is not corrupted")
    print("3. Try with a different PDF file")
    sys.exit(1)

# Initialize the OCR reader (you can specify languages here).
# Done after the conversion so that a wrong path or a broken poppler install
# is reported immediately, before paying for the reader's start-up.
reader = easyocr.Reader(['en'])  # Example for English

# Process each page.
# text_content collects one string per page, in page order; a page that fails
# OCR contributes an error placeholder instead of being dropped.
text_content = []
total_pages = len(pages)
# The conversion step starts at PDF page 2, so numbering from 2 keeps the log
# messages aligned with the page numbers of the PDF itself.
for page_num, page in enumerate(pages, start=2):
    # +1 because the skipped first page is not part of `pages`
    print(f"Processing page {page_num}/{total_pages + 1}")

    # Convert PIL Image to numpy array
    page_array = np.array(page)

    # Axis 1 is the width for both RGB (H, W, C) and grayscale (H, W) arrays
    width = page_array.shape[1]

    # Define the split point for two columns (the horizontal middle)
    mid_point = width // 2

    try:
        # OCR each column on its own so text is not read across both columns.
        # detail=0 asks the reader for plain text only.
        left_half = page_array[:, :mid_point]
        left_result = reader.readtext(left_half, detail=0)

        right_half = page_array[:, mid_point:]
        right_result = reader.readtext(right_half, detail=0)

        # Reading order: the whole left column, then the whole right column.
        # You might need more sophisticated logic here depending on the layout
        page_blocks = left_result + right_result
        page_text = " ".join(page_blocks)

        text_content.append(page_text)

        print(f"Extracted {len(page_blocks)} text blocks from page {page_num}")

    except Exception as e:
        # Best effort: report the failure, keep a placeholder so the output
        # still has one entry per page, and continue with the next page.
        print(f"Error processing page {page_num}: {e}")
        text_content.append(f"--- Page {page_num} ---\nError processing this page: {e}")

# Write the collected page texts to a UTF-8 text file, one blank line between
# consecutive pages; a failure is reported on stdout rather than raised.
output_filename = "extracted_text.txt"
try:
    with open(output_filename, mode="w", encoding="utf-8") as out_file:
        out_file.write("\n\n".join(text_content))
    print(
        f"[INFO] OCR complete, output saved to {output_filename}",
        f"[INFO] Processed {len(text_content)} pages total",
        sep="\n",
    )
except Exception as err:
    print(f"Error saving output file: {err}")

Converting PDF to images...
Successfully converted 21 pages
Processing page 2/22
1
2
3
5
6
4
5
6
2
0
1
0
5
Extracted 159 text blocks from page 2
Processing page 3/22
7
1
1
1
8
1
9
0
5
6
5
1
0
Extracted 196 text blocks from page 3
Processing page 4/22
1
1
5
1
2
2
8
2
8
2
8
5
6
8
1
3
8
Extracted 207 text blocks from page 4
Processing page 5/22
1
4
6
6
6
1
5
5
6
1
6
1
1
7
Extracted 221 text blocks from page 5
Processing page 6/22
1
8
2
0
5
0
2
1
1
9
5
6
2
2
0
5
0
1
Extracted 143 text blocks from page 6
Processing page 7/22
1
1
1
2
2
5
6
2
3
0
2
4
2
5
3
Extracted 170 text blocks from page 7
Processing page 8/22
2
6
2
7
6
0
0
3
2
2
5
6
2
8
1
9
0
1
2
9
3
0
Extracted 156 text blocks from page 8
Processing page 9/22


KeyboardInterrupt: 