In [1]:
!pip install pandas tqdm pdf2image pytesseract PyPDF2 langchain_groq

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.2.4-py3-none-any.whl.metadata (3.0 kB)
Collecting groq<1,>=0.4.1 (from langchain_groq)
  Downloading groq-0.16.0-py3-none-any.whl.metadata (14 kB)
Collecting langchain-core<0.4.0,>=0.3.33 (from langchain_groq)
  Downloading langchain_core-0.3.33-py3-none-any.whl.metadata (6.3 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_groq-0.2.4-py3-none-any.whl (14 kB)
Downloading groq-0.16.0-py3-none-any.whl (109 kB)
[2K   [9

In [6]:
!unzip "/content/test.zip" -d "/content/extracted"

Archive:  /content/test.zip
  inflating: /content/extracted/Test Data/Failed Formats of PDF/Sri bhavani plastics/INV-806.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/100 CUBES/Tax Invoice (6).pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/SRI HARI ENTERPRISE/EDISON INVOICE 39.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/XCELLENT XEROX AND ONLINE SERVICES/GST Sales July 24.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/Sri bhavani plastics/INV-809.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/Campos Technologies/15.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/Campos Technologies/22.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/Campos Technologies/20.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/100 CUBES/IN3-100C24022601_signed.pdf  
  inflating: /content/extracted/Test Data/Failed Formats of PDF/

In [7]:
import re
import os
import json
import time
import pandas as pd
from tqdm import tqdm
from pdf2image import convert_from_path
import pytesseract
from pytesseract import Output
from PyPDF2 import PdfReader
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage

In [8]:
MIN_TEXT_LENGTH = 50       # Minimum text length before going to OCR
MAX_CHUNK_SIZE = 4000      # Maximum characters per chunk
RETRY_DELAY = 20           # Seconds to wait between attempts (if a chunk cannot be extracted the first time)
MAX_RETRIES = 3            # Total number of attempts per chunk
# Secrets must never be stored in the notebook: read the Groq key from the
# environment instead (e.g. set GROQ_API_KEY before starting the kernel).
# The key that was previously hardcoded here has been exposed and should be
# revoked. API_KEY is None when the variable is not set.
API_KEY = os.environ.get("GROQ_API_KEY")
BASE_DIR = "/content/extracted"  # Directory containing PDF files

In [9]:
def extract_text_pypdf2(pdf_path):
    """Return the text layer of the PDF at ``pdf_path`` as read by PyPDF2.

    The text of every page is followed by a newline. A page for which
    PyPDF2 returns nothing contributes only that newline.
    """
    with open(pdf_path, "rb") as pdf_file:
        # Extraction has to happen while the file handle is still open.
        page_texts = [
            (pdf_page.extract_text() or "") + "\n"
            for pdf_page in PdfReader(pdf_file).pages
        ]
    return "".join(page_texts)

In [None]:
def extract_text_ocr(pdf_path, dpi=300):
    """Rasterise the PDF at ``pdf_path`` and return the text recognised by Tesseract.

    Each page is rendered at ``dpi`` and run through ``image_to_data``; the
    non-blank entries of its ``text`` list are joined with single spaces and
    terminated with a newline, one line per page.
    """
    page_lines = []
    for page_image in convert_from_path(pdf_path, dpi=dpi):
        tokens = pytesseract.image_to_data(page_image, output_type=Output.DICT)['text']
        # Drop the empty/whitespace-only entries Tesseract emits.
        kept = [token for token in tokens if token.strip()]
        page_lines.append(" ".join(kept) + "\n")
    return "".join(page_lines)

In [10]:
def chunk_text(text, max_length=MAX_CHUNK_SIZE):
    """Split text into chunks of at most ``max_length`` characters.

    Chunks end on sentence boundaries (``.``, ``!`` or ``?`` followed by
    whitespace) whenever possible. A single segment that is longer than
    ``max_length`` -- common for invoice/OCR text that has no sentence
    punctuation -- is cut at the last whitespace inside the limit, or at
    exactly ``max_length`` characters when it contains no whitespace.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of stripped, non-empty chunks, in document order.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    chunks = []

    def _flush(candidate):
        # Whitespace-only chunks carry no information and would only waste a model call.
        candidate = candidate.strip()
        if candidate:
            chunks.append(candidate)

    current_chunk = ""
    # The capturing group keeps each terminator as its own list item, so
    # concatenating the pieces reproduces the original text.
    for piece in re.split(r'([.!?]\s+)', text):
        if len(current_chunk) + len(piece) <= max_length:
            current_chunk += piece
            continue

        _flush(current_chunk)

        # The piece alone exceeds the limit: hard-split it, preferring a
        # whitespace cut so tokens such as invoice numbers stay intact.
        while len(piece) > max_length:
            window = piece[:max_length]
            cut = max(window.rfind(" "), window.rfind("\n"), window.rfind("\t"))
            if cut <= 0:
                cut = max_length
            _flush(piece[:cut])
            piece = piece[cut:]

        current_chunk = piece

    _flush(current_chunk)
    return chunks

In [11]:
# Prompt sent to the model once per text chunk. `{text_chunk}` is the
# placeholder that process_chunk substitutes with the chunk's text. The model
# is asked to answer with a single JSON dictionary containing the 16 keys
# listed below, using "" for anything it cannot find in the chunk.
PROMPT_TEMPLATE = """
Extract invoice data from this text chunk. Map to these columns:
taxable_value, sgst_amount, cgst_amount, igst_amount, sgst_rate, cgst_rate, igst_rate,
tax_amount, tax_rate, final_amount, invoice_number, invoice_date, place_of_supply,
place_of_origin, gstin_supplier, gstin_recipient

Text chunk:
{text_chunk}

Return only a JSON dictionary with these exact keys. Use an empty string for missing values.
"""

In [12]:
def process_chunk(chunk, model):
    """Ask the model to extract invoice fields from one text chunk.

    Args:
        chunk: Text of the chunk to analyse.
        model: Chat model used for the request (a ``ChatGroq`` instance in
            this notebook).

    Returns:
        The dictionary parsed from the model's reply, or ``{}`` when the
        reply contains no parseable JSON dictionary.
    """
    prompt = PROMPT_TEMPLATE.format(text_chunk=chunk)
    messages = [HumanMessage(content=prompt)]
    # `invoke` replaces the deprecated `model(messages, ...)` call form.
    response = model.invoke(messages, max_tokens=1024)
    # Models often wrap the JSON in prose or code fences: take everything
    # from the first "{" to the last "}".
    match = re.search(r'\{.*\}', response.content, re.DOTALL)
    if not match:
        return {}
    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        # Malformed JSON counts as "no result" so the caller can retry
        # instead of the whole run aborting.
        return {}
    return parsed if isinstance(parsed, dict) else {}

In [13]:
def process_chunk_with_retries(chunk, model):
    """Run ``process_chunk`` up to ``MAX_RETRIES`` times until it yields a result.

    An attempt fails when ``process_chunk`` returns an empty dictionary or
    raises (for example on a rate-limit or network error). Failed attempts
    are separated by ``RETRY_DELAY`` seconds; there is no wait after the
    last one.

    Returns:
        The first non-empty result, or ``{"error": ...}`` when every attempt
        failed. The message includes the last exception, if any.
    """
    last_error = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            result = process_chunk(chunk, model)
        except Exception as exc:  # deliberate best-effort: one bad call must not abort the batch
            last_error = exc
            result = {}
        if result:
            return result
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_DELAY)

    message = f"Failed after {MAX_RETRIES} attempts"
    if last_error is not None:
        message += f": {last_error}"
    return {"error": message}

In [14]:
def merge_results(results):
    """Merge the per-chunk dictionaries into a single record.

    For every key that appears in any chunk result (in first-seen order)
    the first truthy value wins. When no chunk has a truthy value, the first
    numeric zero/False reported is kept, because ``0`` is a legitimate
    amount or rate; otherwise the value is ``""``.

    Args:
        results: List of dictionaries, one per chunk.

    Returns:
        The merged dictionary; ``{}`` when ``results`` is empty.
    """
    merged = {}
    if not results:
        return merged

    # Ordered union of keys: a field that only a later chunk reported must
    # not be dropped just because the first chunk's reply lacked it.
    all_keys = dict.fromkeys(key for res in results for key in res)

    for key in all_keys:
        fallback = ""
        has_fallback = False
        for res in results:
            value = res.get(key)
            if value:
                merged[key] = value
                break
            # Remember the first falsy-but-meaningful number (0, 0.0, False).
            if not has_fallback and isinstance(value, (int, float)):
                fallback = value
                has_fallback = True
        else:
            merged[key] = fallback
    return merged


In [15]:
def process_invoice(pdf_path, model):
    """Extract the invoice fields from a single PDF.

    The text layer is read with PyPDF2 first; if it holds fewer than
    ``MIN_TEXT_LENGTH`` non-whitespace-trimmed characters the PDF is treated
    as scanned and OCR is used instead. The text is chunked, each chunk is
    sent to the model, and the per-chunk results are merged.

    Args:
        pdf_path: Path of the PDF file.
        model: Chat model passed through to the chunk processor.

    Returns:
        The merged field dictionary, or ``{"error": "No valid results"}``
        when no chunk produced a usable result.
    """
    text = extract_text_pypdf2(pdf_path)
    # Compare on stripped text: the extractor appends a newline per page, so
    # a long scanned PDF would otherwise pass the threshold on newlines alone.
    if len(text.strip()) < MIN_TEXT_LENGTH:
        text = extract_text_ocr(pdf_path)
    chunks = chunk_text(text)
    results = []
    for chunk in chunks:
        result = process_chunk_with_retries(chunk, model)
        if "error" not in result:
            results.append(result)
        # Short pause between requests to stay clear of the API rate limit.
        time.sleep(2)
    return merge_results(results) if results else {"error": "No valid results"}

In [16]:
def main():
    """Process every PDF under ``BASE_DIR`` and write ``extracted_invoices.csv``.

    Each PDF becomes one CSV row holding the extracted fields plus
    ``file_name`` (base name) and ``file_path`` (path relative to
    ``BASE_DIR``, which disambiguates equal file names in different
    folders). A PDF that fails is recorded with an ``error`` value instead
    of aborting the run.
    """
    # NOTE(review): temperature=0.75 is high for structured extraction, but
    # lowering it also changes how much a retry can differ -- confirm intent.
    model = ChatGroq(model_name="gemma2-9b-it", temperature=0.75, api_key=API_KEY)
    invoice_results = []
    # os.walk yields entries in arbitrary order; sort for a reproducible CSV.
    pdf_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(BASE_DIR)
        for file in files if file.lower().endswith('.pdf')
    )

    for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            result = process_invoice(pdf_path, model)
        except Exception as exc:  # keep going: one unreadable PDF must not discard the whole batch
            result = {"error": f"{type(exc).__name__}: {exc}"}
        result["file_name"] = os.path.basename(pdf_path)
        result["file_path"] = os.path.relpath(pdf_path, BASE_DIR)
        invoice_results.append(result)

    if invoice_results:
        pd.DataFrame(invoice_results).to_csv("extracted_invoices.csv", index=False)
        print("Extraction complete. Results saved to extracted_invoices.csv.")
    else:
        print(f"No PDF files found under {BASE_DIR}.")

# Entry point: runs the full extraction. The guard is true both when the code
# is executed as a script and when this cell is run in the notebook.
if __name__ == "__main__":
    main()

  response = model(messages, max_tokens=1024)
Processing PDFs: 100%|██████████| 76/76 [04:42<00:00,  3.72s/it]

Extraction complete. Results saved to extracted_invoices.csv.



