<a href="https://colab.research.google.com/github/balajimurugesan2016/Algorithms/blob/master/AIQuestions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install required packages (run these in separate cells if using Jupyter)
!pip install easyocr
!pip install pdf2image
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.10).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [15]:
# Install required packages (run these in separate cells if using Jupyter)
#!pip install easyocr
#!pip install pdf2image
#!apt-get install poppler-utils

# Import necessary libraries
import os
import re
import sys

import easyocr
import numpy as np
from pdf2image import convert_from_path

# Initialize the OCR reader once; the same instance is reused for every page.
# The list selects the recognition languages (English only here).
reader = easyocr.Reader(['en'])

# Path of the PDF to OCR -- replace with your own file path.
pdf_path = '/content/GS English Set-B.pdf'

# Fail early, with a clear message, when the file is missing. pdf2image hands
# the path to the poppler command-line tools, so a missing file typically
# surfaces as a pdf2image-specific error rather than FileNotFoundError.
if not os.path.isfile(pdf_path):
    print(f"Error: PDF file not found at {pdf_path}")
    print("Please check the file path and make sure the file exists.")
    # Raise instead of sys.exit(): inside a notebook this stops the cell (and
    # "Run All") with a normal traceback rather than a SystemExit warning.
    raise FileNotFoundError(pdf_path)

# Convert the PDF to images (one image per page).
# first_page=2 skips the first page of the document.
try:
    print("Converting PDF to images...")
    pages = convert_from_path(pdf_path, first_page=2)
    print(f"Successfully converted {len(pages)} pages")
except Exception as e:
    print(f"Error converting PDF to images: {e}")
    print("Possible solutions:")
    print("1. Make sure poppler-utils is installed: !apt-get install poppler-utils")
    print("2. Check if the PDF file exists and is not corrupted")
    print("3. Try with a different PDF file")
    # Re-raise so the cells that need `pages` do not run on stale or missing data.
    raise

# A question starts with a number from 1 to 100 followed by "." or "," and
# whitespace. The (?<!\d) guard keeps the trailing digits of a larger number
# (e.g. the "7, " at the end of "1947, ") from being read as a question number.
# NOTE: numbered statements inside a question ("1. ... 2. ...") still look like
# question starts; telling them apart needs layout information this regex lacks.
QUESTION_START = r'(?<!\d)(?:[1-9]|[1-9][0-9]|100)[.,]\s+'
QUESTION_PATTERN = re.compile(
    rf'({QUESTION_START}.*?)(?={QUESTION_START}|\Z)',
    re.DOTALL,
)


def extract_questions(page_text):
    """Split the OCR text of one page into numbered questions.

    Args:
        page_text: All text recognized on the page, joined into one string.

    Returns:
        A list of non-empty, whitespace-stripped strings. Each one starts at a
        question number and runs up to the next question number or the end of
        the text. Text before the first question number is not returned.
    """
    stripped = (match.strip() for match in QUESTION_PATTERN.findall(page_text))
    return [question for question in stripped if question]


# Process each page
text_content = []
total_pages = len(pages)
# The first PDF page was skipped during conversion, so numbering starts at 2.
for page_num, page in enumerate(pages, start=2):
    # +1 because the displayed total also counts the skipped first page.
    print(f"Processing page {page_num}/{total_pages + 1}")

    # Convert PIL Image to numpy array
    page_array = np.array(page)

    # Split point between the two text columns: the horizontal middle.
    # shape[1] is the width for both RGB and grayscale arrays.
    width = page_array.shape[1]
    mid_point = width // 2

    try:
        # OCR each column separately so lines from the two columns are not
        # interleaved; detail=0 returns plain text strings only.
        left_result = reader.readtext(page_array[:, :mid_point], detail=0)
        right_result = reader.readtext(page_array[:, mid_point:], detail=0)

        # Reading order: the whole left column, then the whole right column.
        text_blocks = left_result + right_result
        page_text = " ".join(text_blocks)
        text_content.extend(extract_questions(page_text))

        print(f"Extracted {len(text_blocks)} text blocks from page {page_num}")

    except Exception as e:
        # Keep going with the remaining pages, but leave a visible marker in
        # the output so the failed page is not silently missing.
        print(f"Error processing page {page_num}: {e}")
        text_content.append(f"--- Page {page_num} ---\nError processing this page: {e}")

# Save output: write every collected entry to a UTF-8 text file, with a blank
# line between consecutive entries.
output_filename = "extracted_text.txt"
try:
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write("\n\n".join(text_content))
    print(f"[INFO] OCR complete, output saved to {output_filename}")
    # text_content holds one entry per extracted question (plus one marker per
    # page that failed), so the count is reported as entries, not pages.
    print(f"[INFO] Saved {len(text_content)} text entries total")
except Exception as e:
    # Best-effort save: report the failure instead of aborting the notebook.
    print(f"Error saving output file: {e}")

Converting PDF to images...
Successfully converted 21 pages
Processing page 2/22
Extracted 159 text blocks from page 2
Processing page 3/22
Extracted 196 text blocks from page 3
Processing page 4/22
Extracted 207 text blocks from page 4
Processing page 5/22
Extracted 221 text blocks from page 5
Processing page 6/22
Extracted 143 text blocks from page 6
Processing page 7/22
Extracted 170 text blocks from page 7
Processing page 8/22
Extracted 156 text blocks from page 8
Processing page 9/22
Extracted 285 text blocks from page 9
Processing page 10/22
Extracted 210 text blocks from page 10
Processing page 11/22
Extracted 231 text blocks from page 11
Processing page 12/22
Extracted 237 text blocks from page 12
Processing page 13/22
Extracted 307 text blocks from page 13
Processing page 14/22
Extracted 256 text blocks from page 14
Processing page 15/22
Extracted 236 text blocks from page 15
Processing page 16/22
Extracted 229 text blocks from page 16
Processing page 17/22
Extracted 249 text 