# Batch Convert PDF Tables to Excel

This notebook scans the current directory for all PDF files, renders each PDF page to an image, extracts tables from each page with a Google Gemini model (via the `google-genai` SDK), and saves the tables from each PDF to a separate Excel file with the '_extracted.xlsx' suffix (one sheet per page).

## Setup and Configuration

1. Install required packages if not already installed
2. Set your Google API key
3. Configure the prompt for table extraction

In [1]:
# Install required packages
#!pip install google-genai pdf2image pillow pandas openpyxl tqdm

## API Key Configuration

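**IMPORTANT:** Provide your own API key (create one at https://aistudio.google.com/apikey) in the cell below. Never commit or share a notebook that contains a real key; if a key has been exposed, revoke it and generate a new one.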

In [2]:
import os

# Read the API key from the GOOGLE_API_KEY environment variable instead of
# hardcoding it, so the secret is never stored in the notebook file or in
# version control. Set it before starting Jupyter, e.g.:
#     export GOOGLE_API_KEY="your-key"
API_KEY = os.environ.get('GOOGLE_API_KEY', '').strip()
if not API_KEY:
    print("WARNING: GOOGLE_API_KEY is not set. "
          "Create a key at https://aistudio.google.com/apikey and export it "
          "before running the cells below.")

## Import Libraries and Setup

In [3]:
from google import genai
from google.genai import types
from pydantic import BaseModel
from tqdm.notebook import tqdm
from pdf2image import convert_from_path
from PIL import Image

import io
import os
import json
import pandas as pd
import glob
from typing import List, Any

# Create the google-genai client from API_KEY; extract_table_from_image()
# sends its generate_content requests through this module-level object.
client = genai.Client(api_key=API_KEY)

## Define Data Models and Prompts

In [4]:
# Define the data model for table rows
# One extracted table row; TableData.rows holds a list of these.
# (Documented with comments rather than a class docstring so the schema
# generated from this model is left unchanged.)
class TableRow(BaseModel):
    row_data: List[str]  # Each cell in the row as a string

# Structured result for one page; passed as `response_schema` to
# generate_content in extract_table_from_image().
class TableData(BaseModel):
    headers: List[str]  # Column headers
    rows: List[TableRow]  # Table rows
    page_info: str  # Any additional page information

In [5]:
# Prompt for table extraction
# Sent alongside each page image in extract_table_from_image(). The field
# names listed here (headers, rows, page_info) mirror the TableData model.
TABLE_EXTRACTION_PROMPT = '''
Analyze this image and extract all table data found on this page.

Please:
1. Identify all tables on the page
2. Extract column headers (if any)
3. Extract all row data, preserving the structure
4. If there are multiple tables, combine them or note their separation
5. Include any relevant page information (title, date, etc.)

Return the data in the specified JSON format with:
- headers: list of column headers
- rows: list of table rows, where each row contains a list of cell values
- page_info: any additional context about the page

If no table is found, return empty headers and rows arrays.
'''

## PDF Processing Functions

In [6]:
def get_pdf_files_in_directory(directory="."):
    """
    Get the names of all PDF files in the specified directory

    The extension match is case-insensitive ('.pdf', '.PDF', ...), only
    regular files are returned, and the result is sorted so that batch runs
    process files in a deterministic order.

    Args:
        directory: Directory to search for PDF files (default current directory)

    Returns:
        Sorted list of PDF file names (base names, without the directory part)
    """
    # Escape the directory so glob metacharacters in its name ('[', '*', '?')
    # are treated literally; only the trailing '*' acts as a wildcard.
    pattern = os.path.join(glob.escape(directory), "*")
    return sorted(
        os.path.basename(path)
        for path in glob.glob(pattern)
        if path.lower().endswith(".pdf") and os.path.isfile(path)
    )

In [7]:
def get_output_filename(pdf_filename):
    """
    Build the Excel output name that corresponds to a PDF file name

    Args:
        pdf_filename: Original PDF filename (e.g., 'HSP.pdf')

    Returns:
        The name with its last extension replaced by '_extracted.xlsx'
        (e.g., 'HSP_extracted.xlsx')
    """
    stem, _extension = os.path.splitext(pdf_filename)
    return "{}_extracted.xlsx".format(stem)

In [8]:
def convert_pdf_to_images(pdf_path, dpi=200):
    """
    Render every page of a PDF to an image

    Args:
        pdf_path: Path to the PDF file
        dpi: Rendering resolution passed to convert_from_path

    Returns:
        List with one image per page, or an empty list if conversion raised
    """
    try:
        print(f"  Converting PDF to images with DPI: {dpi}")
        pages = convert_from_path(pdf_path, dpi=dpi)
        page_count = len(pages)
        print(f"  Successfully converted {page_count} pages")
    except Exception as exc:
        # Any failure is reported and mapped to "no pages"
        print(f"  Error converting PDF: {exc}")
        pages = []
    return pages

In [9]:
def extract_table_from_image(image, page_num, pdf_name):
    """
    Extract table data from a single page image using a Gemini model

    The page is encoded as PNG and sent together with TABLE_EXTRACTION_PROMPT
    through the module-level google-genai `client`, requesting a JSON response
    that follows the TableData schema.

    Args:
        image: PIL Image object
        page_num: Page number, used in progress and error messages
        pdf_name: PDF filename for reference (currently not used in messages)

    Returns:
        TableData object, or None if the request fails or the response cannot
        be parsed into TableData
    """
    try:
        # Serialize the PIL image to PNG bytes for upload
        with io.BytesIO() as buffer:
            image.save(buffer, format='PNG')
            png_bytes = buffer.getvalue()

        # Ask the model for structured output that follows TableData
        response = client.models.generate_content(
            model='gemini-2.0-flash-lite',
            config=types.GenerateContentConfig(
                temperature=0.2,
                response_mime_type='application/json',
                response_schema=TableData
            ),
            contents=[
                types.Part.from_bytes(
                    data=png_bytes,
                    mime_type='image/png'
                ),
                TABLE_EXTRACTION_PROMPT
            ]
        )

        table_data = response.parsed
        if table_data is None:
            # `parsed` is empty when the reply does not match the schema;
            # report that explicitly instead of failing on `.rows` below.
            print(f"    Page {page_num}: Model response could not be parsed as table data")
            return None

        print(f"    Page {page_num}: Extracted {len(table_data.rows)} rows")
        return table_data

    except Exception as e:
        # Best-effort per page: report the failure and let the caller continue
        print(f"    Error extracting table from page {page_num}: {e}")
        return None

In [None]:
def table_data_to_dataframe(table_data, page_num, pdf_name):
    """
    Convert TableData to a pandas DataFrame, tolerating ragged rows/headers

    Rows shorter than the widest row are padded with empty strings. Headers
    are truncated or extended with 'Column_<index>' names so that they match
    the widest row.

    Args:
        table_data: TableData object (may be None)
        page_num: Page number, stored in the 'page_number' column
        pdf_name: PDF filename, stored in the 'source_pdf' column

    Returns:
        pandas DataFrame, or None if there is no table data or conversion fails
    """
    try:
        if not table_data or not table_data.rows:
            print(f"    Page {page_num}: No table data found")
            return None

        # Convert rows to list of lists (copied so padding never mutates input)
        rows_data = [list(row.row_data) for row in table_data.rows]

        # The widest row defines the number of columns
        max_columns = max(len(row) for row in rows_data)

        # Pad shorter rows with empty strings so every row has the same length
        rows_data = [row + [''] * (max_columns - len(row)) for row in rows_data]

        # Fit the headers to the column count: drop extras, name missing ones
        headers = list(table_data.headers or [])[:max_columns]
        headers += [f'Column_{i}' for i in range(len(headers), max_columns)]

        # Create DataFrame
        df = pd.DataFrame(rows_data, columns=headers)

        # Add metadata
        df['page_number'] = page_num
        df['source_pdf'] = pdf_name
        if table_data.page_info:
            df['page_info'] = table_data.page_info

        return df

    except Exception as e:
        print(f"    Error converting table data to DataFrame for page {page_num}: {e}")
        return None


In [None]:
def old_table_data_to_dataframe(table_data, page_num, pdf_name):
    """
    Legacy TableData -> DataFrame conversion (no column-length handling)

    Args:
        table_data: TableData object
        page_num: Page number, stored in the 'page_number' column
        pdf_name: PDF filename, stored in the 'source_pdf' column

    Returns:
        pandas DataFrame, or None when there are no rows or pandas rejects
        the data (for example when header and row lengths differ)
    """
    try:
        if not (table_data and table_data.rows):
            print(f"    Page {page_num}: No table data found")
            return None

        # One inner list per extracted row
        cell_matrix = [entry.row_data for entry in table_data.rows]

        # Without headers pandas falls back to its default integer labels
        column_names = table_data.headers if table_data.headers else None
        frame = pd.DataFrame(cell_matrix, columns=column_names)

        # Provenance columns
        frame['page_number'] = page_num
        frame['source_pdf'] = pdf_name
        info = table_data.page_info
        if info:
            frame['page_info'] = info

        return frame

    except Exception as exc:
        print(f"    Error converting table data to DataFrame for page {page_num}: {exc}")
        return None

## Single PDF Processing Function

In [11]:
def process_single_pdf_to_excel(pdf_path, output_path, dpi=200):
    """
    Process a single PDF and create Excel file with separate sheets for each page

    Pages for which no table could be extracted are skipped. If writing the
    workbook fails, any partially written output file is removed so that a
    later batch run does not mistake it for a finished result.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path for the output Excel file
        dpi: Resolution for PDF to image conversion

    Returns:
        True if successful, False otherwise
    """
    pdf_name = os.path.basename(pdf_path)
    print(f"\nProcessing: {pdf_name}")

    # Convert PDF to images
    images = convert_pdf_to_images(pdf_path, dpi)
    if not images:
        print(f"  Failed to convert {pdf_name} to images")
        return False

    # Process each page; keys are sheet names, values are DataFrames
    all_dataframes = {}

    for page_num, image in enumerate(tqdm(images, desc=f"  Processing {pdf_name}"), start=1):
        # Extract table data
        table_data = extract_table_from_image(image, page_num, pdf_name)
        if not table_data:
            print(f"    Page {page_num}: Failed to extract table data")
            continue

        # Convert to DataFrame
        df = table_data_to_dataframe(table_data, page_num, pdf_name)
        if df is None or df.empty:
            print(f"    Page {page_num}: No valid data extracted")
            continue

        sheet_name = f"Page_{page_num}"
        all_dataframes[sheet_name] = df
        print(f"    Page {page_num}: Added {len(df)} rows to sheet '{sheet_name}'")

    if not all_dataframes:
        print(f"  No data extracted from {pdf_name}")
        return False

    # Save to Excel
    print(f"  Saving {len(all_dataframes)} sheets to {output_path}")
    try:
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            for sheet_name, df in all_dataframes.items():
                df.to_excel(writer, sheet_name=sheet_name, index=False)
    except Exception as e:
        print(f"  Error saving Excel file: {e}")
        # A failed write can leave an empty or partial workbook behind;
        # batch_process_pdfs() skips PDFs whose output already exists, so
        # remove the leftover to allow a clean retry.
        try:
            if os.path.exists(output_path):
                os.remove(output_path)
        except OSError as cleanup_error:
            print(f"  Could not remove incomplete file {output_path}: {cleanup_error}")
        return False

    print(f"  Successfully saved: {output_path}")
    return True

## Batch Processing Function

In [12]:
def batch_process_pdfs(directory=".", dpi=200):
    """
    Process all PDF files in the directory

    Each PDF is read from `directory` and its Excel output is written next to
    it. PDFs whose output file already exists are skipped and reported with
    status 'skipped'.

    Args:
        directory: Directory to search for PDF files
        dpi: Resolution for PDF to image conversion

    Returns:
        Dictionary with keys 'successful', 'failed' (lists of per-file dicts)
        and 'total', or an empty dict when no PDF files were found
    """
    # Find all PDF files
    pdf_files = get_pdf_files_in_directory(directory)

    if not pdf_files:
        print(f"No PDF files found in directory: {directory}")
        return {}

    print(f"Found {len(pdf_files)} PDF files:")
    for pdf in pdf_files:
        print(f"  - {pdf}")

    # Process each PDF
    results = {
        'successful': [],
        'failed': [],
        'total': len(pdf_files)
    }

    print(f"\n{'='*50}")
    print("STARTING BATCH PROCESSING")
    print(f"{'='*50}")

    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] Processing: {pdf_file}")

        # get_pdf_files_in_directory() returns bare file names, so resolve
        # them against `directory`. normpath keeps the default "." case
        # identical to the plain file name.
        pdf_path = os.path.normpath(os.path.join(directory, pdf_file))

        # Generate output filename (placed alongside the source PDF)
        output_file = get_output_filename(pdf_path)

        # Check if output already exists
        if os.path.exists(output_file):
            print(f"  Output file {output_file} already exists. Skipping...")
            results['successful'].append({'pdf': pdf_file, 'output': output_file, 'status': 'skipped'})
            continue

        # Process the PDF
        success = process_single_pdf_to_excel(pdf_path, output_file, dpi)

        if success:
            results['successful'].append({'pdf': pdf_file, 'output': output_file, 'status': 'processed'})
        else:
            results['failed'].append({'pdf': pdf_file, 'output': output_file, 'error': 'processing_failed'})

    return results

## Execute Batch Processing

In [None]:
# Kick off the batch run and report a summary
print(
    "Starting batch PDF processing...",
    "This will process all PDF files in the current directory.",
    "Each PDF will be converted to an Excel file with '_extracted.xlsx' suffix.\n",
    sep="\n",
)

# Run batch processing
results = batch_process_pdfs(directory=".", dpi=200)

# Print summary (an empty dict means no PDFs were found)
if not results:
    print("No PDF files found to process.")
else:
    print(f"\n{'='*50}")
    print("BATCH PROCESSING COMPLETE")
    print(f"{'='*50}")
    print(f"Total files processed: {results['total']}")
    print(f"Successful: {len(results['successful'])}")
    print(f"Failed: {len(results['failed'])}")

    # Processed and skipped files are both listed under 'successful'
    if results['successful']:
        print("\nSuccessfully processed:")
        for item in results['successful']:
            status = item['status']
            print(f"  ✓ {item['pdf']} → {item['output']} ({status})")

    if results['failed']:
        print("\nFailed to process:")
        for item in results['failed']:
            print(f"  ✗ {item['pdf']} (Error: {item['error']})")

Starting batch PDF processing...
This will process all PDF files in the current directory.
Each PDF will be converted to an Excel file with '_extracted.xlsx' suffix.

Found 1 PDF files:
  - HSP.pdf

STARTING BATCH PROCESSING

[1/1] Processing: HSP.pdf

Processing: HSP.pdf
  Converting PDF to images with DPI: 200
  Successfully converted 38 pages


  Processing HSP.pdf:   0%|          | 0/38 [00:00<?, ?it/s]

    Page 1: Extracted 0 rows
    Page 1: No table data found
    Page 1: No valid data extracted
    Page 2: Extracted 0 rows
    Page 2: No table data found
    Page 2: No valid data extracted
    Page 3: Extracted 1 rows
    Page 3: Added 1 rows to sheet 'Page_3'
    Page 4: Extracted 15 rows
    Page 4: Added 15 rows to sheet 'Page_4'
    Page 5: Extracted 79 rows
    Page 5: Added 79 rows to sheet 'Page_5'
    Page 6: Extracted 86 rows
    Page 6: Added 86 rows to sheet 'Page_6'
    Page 7: Extracted 84 rows
    Page 7: Added 84 rows to sheet 'Page_7'
    Page 8: Extracted 83 rows
    Page 8: Added 83 rows to sheet 'Page_8'
    Page 9: Extracted 68 rows
    Page 9: Added 68 rows to sheet 'Page_9'
    Page 10: Extracted 84 rows
    Page 10: Added 84 rows to sheet 'Page_10'
    Page 11: Extracted 42 rows
    Page 11: Added 42 rows to sheet 'Page_11'
    Page 12: Extracted 52 rows
    Page 12: Added 52 rows to sheet 'Page_12'
    Page 13: Extracted 79 rows
    Page 13: Added 79 rows t

## Optional: Preview Results

In [None]:
# Optional: Preview all generated Excel files
# Sorted so the preview order is the same on every run
excel_files = sorted(glob.glob("*_extracted.xlsx"))

if excel_files:
    print(f"Found {len(excel_files)} generated Excel files:\n")

    for excel_file in excel_files:
        print(f"{'='*60}")
        print(f"File: {excel_file}")
        print(f"{'='*60}")

        try:
            # Open the workbook once and close it when done; parse() reads
            # each sheet from the already-open file instead of re-opening it.
            with pd.ExcelFile(excel_file) as excel_data:
                sheet_names = excel_data.sheet_names
                print(f"Contains {len(sheet_names)} sheets:")

                for sheet_name in sheet_names[:5]:  # Show first 5 sheets
                    df = excel_data.parse(sheet_name)
                    print(f"\n  {sheet_name}: {len(df)} rows, {len(df.columns)} columns")
                    if len(df.columns) > 0:
                        print(f"  Columns: {list(df.columns)[:5]}...")  # Show first 5 columns

                    # Show first row if data exists
                    if len(df) > 0:
                        print(f"  Sample data: {df.iloc[0].tolist()[:3]}...")  # First 3 cells

                if len(sheet_names) > 5:
                    print(f"\n  ... and {len(sheet_names) - 5} more sheets")

        except Exception as e:
            print(f"Error reading {excel_file}: {e}")

        print("\n")
else:
    print("No Excel files with '_extracted.xlsx' suffix found.")
    print("Please run the batch processing cell above first.")