In [4]:
# !pip install google-generativeai
# !pip install pypdf
# !pip install pdf2image
# !pip install pillow
# !pip install python-dotenv
# !sudo apt-get update
# !sudo apt-get install -y poppler-utils

In [6]:
import os
import json
import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from pathlib import Path
import google.generativeai as genai
from dotenv import load_dotenv
from PIL import Image
import base64
import io
from pdf2image import convert_from_path
import re
from datetime import datetime

In [None]:
# Read the Gemini API key from the environment (or a local .env file) instead of
# hardcoding it: a literal key leaks when the notebook is shared, and assigning a
# placeholder here would overwrite a real key that is already exported.
load_dotenv()  # does not override variables that are already set
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment or add it to a .env file."
    )
genai.configure(api_key=_api_key)

# Load Gemini model (shared by the OCR and JSON-structuring helpers below)
model = genai.GenerativeModel(
    "gemini-2.5-flash-preview-04-17",
)

In [28]:
def convert_pdf_to_images(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a JPEG file.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives the page images; it is created
            (including missing parents) when it does not exist.
        dpi: Rendering resolution passed to ``convert_from_path``.

    Returns:
        List of written image paths in page order, named ``page_1.jpg``,
        ``page_2.jpg``, ... inside ``output_folder``.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_folder, exist_ok=True)

    # All pages are rendered up front; the list is then written out one by one.
    pages = convert_from_path(pdf_path, dpi=dpi)

    image_paths = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(output_folder, f'page_{page_number}.jpg')
        page.save(image_path, 'JPEG')
        image_paths.append(image_path)

    return image_paths

In [75]:
# Hardcoded absolute locations of the input PDF and of the folder that receives
# the rendered page images; both are passed to process_large_pdf() further down.
pdf_path = '/content/Machine Learning Fundamentals.pdf'
output_folder = '/content/images'
# Standalone conversion is left disabled: process_large_pdf() calls
# convert_pdf_to_images() itself.
# image_paths = convert_pdf_to_images(pdf_path, output_folder)

In [72]:
!rm -r /content/images

rm: cannot remove '/content/images': No such file or directory


In [73]:
def batch_images(image_paths, batch_size=10):
    """Group images into batches for processing.

    Args:
        image_paths: Sequence of image paths (anything supporting ``len`` and slicing).
        batch_size: Maximum number of paths per batch; must be at least 1.

    Returns:
        A generator yielding consecutive slices of ``image_paths``; the last
        slice may be shorter than ``batch_size``. An empty input yields nothing.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative size would
            otherwise silently produce no batches and drop every image.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    return (
        image_paths[start:start + batch_size]
        for start in range(0, len(image_paths), batch_size)
    )

In [76]:
def ocr_with_gemini(image_paths, instruction):
    """Send page images plus an OCR prompt to the Gemini model and return its text.

    Args:
        image_paths: Paths of the images to include in a single request.
        instruction: Task-specific instructions appended to the generic OCR prompt.

    Returns:
        The ``text`` attribute of the model response.
    """
    prompt = f"""

        You are an expert document analysis AI with exceptional OCR capabilities.
    Your task is to extract ALL textual content from the provided document images with perfect accuracy.

    CRITICAL REQUIREMENTS:
    1. ACCURACY: Every word, number, symbol, and punctuation mark must be captured exactly as shown
    2. STRUCTURE: Maintain the original document structure, hierarchy, and formatting
    3. COMPLETENESS: Do not skip any content, including headers, footers, page numbers, footnotes, watermarks, or marginalia
    4. CONTEXT: Understand the document context to resolve ambiguous characters

    {instruction}

    """

    opened_images = []
    try:
        for path in image_paths:
            opened_images.append(Image.open(path))
        response = model.generate_content([prompt, *opened_images])
        return response.text
    finally:
        # Release the file handles once the request is done (also on failure);
        # otherwise every processed batch leaves its files open.
        for image in opened_images:
            image.close()

In [77]:
def ocr_complex_document(image_paths):
    """Run OCR with a detailed instruction set aimed at complex page layouts.

    A fixed instruction block (tables, multi-column text, charts/graphs,
    special elements, quality-assurance checks) is handed, together with
    ``image_paths``, to ``ocr_with_gemini``; that call's result is returned
    unchanged.
    """
    return ocr_with_gemini(
        image_paths,
        """
    Extract ALL text content from these document pages.
    For Tables:
    - Use markdown table format with proper alignment
    - Include all headers, subheaders, and merged cells
    - Preserve numerical precision and units
    - Note any table notes or footnotes

    For Multi-column Text:
    - Process columns in natural reading order (left to right, top to bottom)
    - Clearly separate column content with appropriate breaks
    - Maintain column-specific formatting

    For Charts/Graphs:
    - Describe chart type and purpose
    - Extract all axis labels, legends, and data points
    - Capture titles, captions, and source information
    - Note any trends or key insights visible in the visual

    For Special Elements:
    - Preserve bullet points, numbered lists, and indentation
    - Maintain emphasis (bold, italic, underline) using markdown
    - Capture all hyperlinks and cross-references
    - Include page numbers and section breaks

    QUALITY ASSURANCE:
    - Double-check all numerical data for accuracy
    - Verify proper names, technical terms, and specialized vocabulary
    - Ensure logical flow and coherence in extracted text
    - Flag any unclear or potentially misread content with [UNCERTAIN: text]
    Preserve all headers, footers, page numbers, and footnotes.
    """,
    )

In [78]:
def process_large_pdf(
    pdf_path,
    output_folder,
    batch_size=30,
    instruction="Extract all text, maintaining document structure",
):
    """Render a PDF to page images and OCR them batch by batch.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory that receives the rendered page images.
        batch_size: Number of page images sent per model request (default 30,
            the value that used to be hard-coded).
        instruction: Instruction text handed to ``ocr_with_gemini`` for every batch.

    Returns:
        One string with each batch's text preceded by a
        ``--- BATCH n ---`` separator, in batch order; empty if there are no pages.
    """
    # Convert PDF to images
    image_paths = convert_pdf_to_images(pdf_path, output_folder)

    # Fixed-size batches in page order (not aligned to chapters or sections).
    batches = batch_images(image_paths, batch_size)

    # Collect the pieces and join once instead of growing a string in the loop.
    sections = []
    for batch_number, batch in enumerate(batches, start=1):
        print(f"Processing batch {batch_number}...")
        batch_text = ocr_with_gemini(batch, instruction)
        sections.append(f"\n\n--- BATCH {batch_number} ---\n\n{batch_text}")

    return "".join(sections)

In [79]:
# Run the pipeline on the configured PDF: pages are rendered to images, OCR'd in
# batches, and the concatenated batch texts are kept in `text`.
text = process_large_pdf(pdf_path, output_folder)

Processing batch 1...


In [82]:
def text_to_json_with_gemini(text_content):
    """Converts text content to JSON format using the Gemini model.

    Args:
        text_content: Extracted document text; it is inserted verbatim into the
            prompt (braces inside it are not interpreted by ``str.format``).

    Returns:
        The model's reply as a string. A Markdown code fence wrapped around the
        whole reply (```json ... ```) is removed so the result can be passed to
        ``json.loads``; a reply without such a fence is returned stripped of
        surrounding whitespace. The JSON itself is not validated here.
    """

    # Literal braces of the JSON template are doubled because the prompt goes
    # through str.format; the single "{}" is the slot for text_content.
    prompt = """
    You are a document structuring expert. Convert the provided text into a comprehensive, well-organized JSON structure.

    REQUIREMENTS:
    1. Create a logical, hierarchical JSON structure that reflects the document's organization
    2. Preserve all content while organizing it meaningfully
    3. Include metadata about the document structure and content

  Convert the following text content into a structured JSON object.
  Identify key sections, headings, paragraphs, tables, and other elements
  and represent them appropriately in the JSON structure.


  Text content to convert:

  {}

    JSON STRUCTURE TEMPLATE:
    {{
        "document_metadata": {{
            "title": "Document title if available",
            "document_type": "academic|technical|report|manual|other",
            "total_pages": "number of pages processed",
            "language": "primary language detected",
            "has_tables": true/false,
            "has_charts": true/false,
            "has_images": true/false
        }},
        "document_structure": {{
            "sections": [
                {{
                    "section_number": "1",
                    "title": "Section Title",
                    "subsections": [
                        {{
                            "subsection_number": "1.1",
                            "title": "Subsection Title",
                            "content": "Full text content",
                            "paragraphs": ["paragraph 1", "paragraph 2"],
                            "lists": ["list items if any"],
                            "tables": [
                                {{
                                    "table_number": "Table 1",
                                    "caption": "Table caption",
                                    "headers": ["Column 1", "Column 2"],
                                    "rows": [["Data 1", "Data 2"]]
                                }}
                            ],
                            "figures": [
                                {{
                                    "figure_number": "Figure 1",
                                    "caption": "Figure caption",
                                    "description": "Description of visual content"
                                }}
                            ]
                        }}
                    ]
                }}
            ]
        }}
    }}

    ADAPTATION RULES:
    - If document doesn't have clear sections, organize by pages or logical breaks
    - For tables without clear structure, preserve as formatted text
    - For charts/graphs, include detailed descriptions in figures array
    - Adapt structure to match document type (academic papers, reports, manuals, etc.)
    - Ensure all content is preserved even if structure is unclear

    OUTPUT: Valid JSON only, no additional text or explanations.
  """.format(text_content)

    response = model.generate_content([prompt])

    # The model tends to wrap its answer in a ```json fence despite the
    # "Valid JSON only" instruction; unwrap it when the fence spans the reply.
    reply = response.text.strip()
    fenced = re.fullmatch(r"```[\w-]*[ \t]*\r?\n(.*?)\s*```", reply, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()
    return reply

In [83]:
# Convert the extracted text to JSON.
# `json_output` is the string returned by text_to_json_with_gemini(); it is
# printed as-is here and is not parsed with the json module.
json_output = text_to_json_with_gemini(text)
print(json_output)

```json
{
  "document_metadata": {
    "title": "Machine Learning Fundamentals",
    "document_type": "technical",
    "total_pages": "unknown",
    "language": "English",
    "has_tables": false,
    "has_charts": false,
    "has_images": false
  },
  "document_structure": {
    "table_of_contents": [
      "Chapter 1: Foundations of Machine Learning",
      "1.1 Introduction to Machine Learning",
      "1.1.1 What is Machine Learning?",
      "1.1.1.1 Definition and Scope",
      "1.1.1.2 Historical Context",
      "1.1.1.3 Modern Applications",
      "1.1.2 Types of Machine Learning",
      "1.1.2.1 Supervised Learning",
      "1.1.2.2 Unsupervised Learning",
      "1.1.2.3 Reinforcement Learning",
      "1.2 Mathematical Foundations",
      "1.2.1 Linear Algebra",
      "1.2.1.1 Vectors and Matrices",
      "1.2.1.2 Eigenvalues and Eigenvectors",
      "1.2.2 Statistics and Probability",
      "1.2.2.1 Probability Distributions",
      "1.2.2.2 Bayes' Theorem",
      "1.3 Data Proc