In [8]:
!pip install PyPDF2 requests pillow pytesseract opencv-python-headless numpy

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [9]:
# Final script: works with both images and PDFs
import getpass
import json
import os
from io import BytesIO
from typing import Any, Dict, List

import cv2
import numpy as np
import PyPDF2
import pytesseract
import requests
from google.colab import files
from PIL import Image

# Install necessary libraries
!apt-get install -y tesseract-ocr
!pip install pytesseract opencv-python-headless

def upload_files():
    """Prompt for invoice files through the Colab upload widget.

    Returns:
        Whatever ``files.upload()`` returns; the rest of this script iterates
        it as a mapping of filename to file contents.
    """
    print("Please upload your PDF invoices and/or image files:")
    return files.upload()

def extract_text_from_pdf(pdf_file: BytesIO) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: File-like object holding the PDF data.

    Returns:
        The per-page ``extract_text()`` results joined with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return "".join(page_texts)

def extract_text_from_image(image_file: BytesIO) -> str:
    """Run OCR on an image and return the recognised text.

    The image is reduced to 8-bit grayscale, binarised with Otsu's method and
    then handed to Tesseract.

    Args:
        image_file: File-like object holding the encoded image.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    image = Image.open(image_file)

    # Let Pillow do the grayscale conversion. Pillow arrays are RGB (not BGR),
    # and palette / 1-bit / already-gray images come out as 2-D arrays that
    # cv2.cvtColor rejects; convert("L") handles every input mode uniformly.
    gray = np.array(image.convert("L"))

    # With THRESH_OTSU the threshold value (0 here) is ignored and computed
    # from the image histogram instead.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    return pytesseract.image_to_string(binary)

def extract_invoice_details(text: str) -> Dict[str, Any]:
    """Ask the LLM to pull structured invoice fields out of raw invoice text.

    Args:
        text: Text extracted from an invoice (PDF text layer or OCR output).

    Returns:
        The JSON document from the model's reply, parsed into Python objects.

    Raises:
        json.JSONDecodeError: If no parseable JSON is found in the reply.
    """
    prompt = f"""
    Extract the following details from the invoice text:
    1. Customer details (name, address, phone, email)
    2. Products (including quantity, price, and HSN code if available)
    3. Total Amount
    4. GSTIN (if available)
    5. Invoice Number
    6. Invoice Date

    Invoice text:
    {text}

    Respond in JSON format.
    """

    response = call_llm_api(prompt)

    try:
        return json.loads(response)
    except json.JSONDecodeError:
        # Chat models frequently wrap the JSON in a Markdown code fence or add
        # a sentence around it. Retry on the outermost {...} span; if there is
        # none, re-raise the original parse error unchanged.
        start = response.find("{")
        end = response.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(response[start:end + 1])

def call_llm_api(prompt: str, timeout: float = 60.0) -> str:
    """Send a single-message chat completion request and return the reply text.

    The API key is read from the module-level ``openai_api_key`` variable at
    call time, so that variable must be set before this function is called.

    Args:
        prompt: Content of the single user message.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` of the first choice in the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    api_url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {openai_api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(api_url, headers=headers, json=data, timeout=timeout)
    # Surface auth / quota / rate-limit failures as an HTTP error instead of
    # an opaque KeyError on the missing 'choices' key below.
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']

def process_file(file_content: BytesIO, filename: str) -> Dict[str, Any]:
    """Extract invoice details from one uploaded file.

    The extractor is chosen from the filename extension (case-insensitive):
    ``.pdf`` uses the PDF text extractor; common image extensions use OCR.

    Args:
        file_content: File-like object holding the file's bytes.
        filename: Original filename, used for dispatch and echoed in the result.

    Returns:
        ``{"filename", "invoice_data"}`` on success, or ``{"filename", "error"}``
        when the format is unsupported or no text could be extracted.
    """
    lowered = filename.lower()
    if lowered.endswith('.pdf'):
        text = extract_text_from_pdf(file_content)
    elif lowered.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
        text = extract_text_from_image(file_content)
    else:
        return {"filename": filename, "error": "Unsupported file format"}

    # Nothing to analyse (e.g. a PDF with no text layer): report it rather
    # than spending an API call on an empty prompt.
    if not text or not text.strip():
        return {"filename": filename, "error": "No text could be extracted from the file"}

    invoice_details = extract_invoice_details(text)
    return {
        "filename": filename,
        "invoice_data": invoice_details
    }

def process_all_files(uploaded_files: Dict[str, bytes]) -> List[Dict[str, Any]]:
    """Run ``process_file`` on every uploaded file.

    Args:
        uploaded_files: Mapping of filename to the file's raw bytes. Each
            value is wrapped in a fresh ``BytesIO`` before processing.

    Returns:
        One result dict per file, in the mapping's iteration order.
    """
    return [
        process_file(BytesIO(file_content), filename)
        for filename, file_content in uploaded_files.items()
    ]

# Main execution
uploaded_files = upload_files()

# Get OpenAI API key
# getpass hides the typed value, so the key is not echoed into the saved cell
# output. call_llm_api reads this module-level variable when it is called.
openai_api_key = getpass.getpass("Enter your OpenAI API key: ").strip()

all_file_data = process_all_files(uploaded_files)

# Print the results
for file_data in all_file_data:
    print(f"File: {file_data['filename']}")
    if 'error' in file_data:
        print(f"Error: {file_data['error']}")
    else:
        print(json.dumps(file_data['invoice_data'], indent=2))
    print("\n" + "="*50 + "\n")

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (3,191 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123598 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

Saving Sample Invoice image.png to Sample Invoice image.png
Enter your OpenAI API key: [REDACTED]
File: Sample Invoice image.png
{
  "Customer": {
    "name": "TATA MOTORS LIMITED",
    "address": "TATA MOTORS LIMITED Nigadi Bhosari Road, PIMPRI Pune, MAHARASHTRA, 411018",
    "phone": "9108239284",
    "email": "test@gmail.com"
  },
  "Products": [
    {
      "name": "WASTE AND SCRAP OF STAINLESS STEEL",
      "rate": 95.0,
      "quantity": "6,790 KGS",
      "total_amount": "6,45,050.00",
      "HSN_code": "72042190"
    }
  ],
  "Total Amount": "7,68,771.00",
  "GSTIN": "27AAACT2727Q1ZW",
  "Invoice Number": "inva",
  "Invoice Date": "18. Jul 2024"
}


