In [5]:
# System packages: poppler-utils provides the PDF rasteriser used by pdf2image,
# tesseract-ocr provides the OCR engine used by pytesseract.
# -y keeps apt-get non-interactive so the cell cannot stall on a confirmation prompt.
!apt-get install -y poppler-utils tesseract-ocr
# Python packages, each listed exactly once.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install opencv-python
%pip install pytesseract
%pip install pdf2image
%pip install "transformers[torch]"
%pip install -U accelerate
%pip install datasets
%pip install onnx
%pip install onnxruntime
%pip install onnxruntime-tools

Collecting opencv-python
  Downloading opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (62.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: opencv-python
Successfully installed opencv-python-4.10.0.84
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting filelock
  Downloading filelock-3.15.4-py3-none-any.whl (16 kB)
Collecting regex!=2019.12.17
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Part 1: Model Training
### 1. Environment Setup
#### Install Python and necessary libraries


In [None]:
# NOTE(review): everything in this cell is shell syntax, not Python. Run it in a
# terminal before starting Jupyter; executing the cell in a Python kernel as-is
# will not work for the sudo/python3.8/source lines.

# Install Python 3.8, its venv module, and pip from the system package manager
sudo apt-get update
sudo apt-get install python3.8 python3.8-venv python3-pip

# Set up a virtual environment named invoice_env in the current directory
python3.8 -m venv invoice_env
# Activation only affects the shell that runs this line
source invoice_env/bin/activate

# Install necessary libraries (into the activated virtual environment)
pip install numpy pandas scikit-learn torch transformers onnx onnxruntime pytorch-lightning pdf2image pytesseract

### 2. Data Collection
#### Collect a diverse dataset of invoices in PDF format and extract their text with OCR

In [None]:
from pdf2image import convert_from_path
import pytesseract
import re
import os

def pdf_to_text(pdf_path):
    """Return the OCR text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``convert_from_path``.

    Returns:
        The ``pytesseract.image_to_string`` results for all page images joined
        together with no separator inserted between pages.
    """
    page_images = convert_from_path(pdf_path)
    return ''.join(pytesseract.image_to_string(page) for page in page_images)

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is collapsed too, not stripped, so a
    padded string keeps one space at each end.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each whitespace run replaced by one space character.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def process_invoices(data_dir, output_dir):
    """OCR every PDF in ``data_dir`` and write its cleaned text into ``output_dir``.

    For each file whose name ends in ``.pdf`` (case-insensitive), the text is
    extracted with ``pdf_to_text``, normalised with ``clean_text`` and written
    to ``<output_dir>/<name without the .pdf suffix>.txt``. Other files are
    ignored. ``output_dir`` is created if it does not exist.

    Args:
        data_dir: Directory that is scanned (non-recursively) for PDF files.
        output_dir: Directory that receives one ``.txt`` file per PDF.

    Raises:
        OSError: If ``data_dir`` cannot be listed or an output file cannot be
            written.
    """
    pdf_suffix = ".pdf"
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(data_dir):
        if not filename.lower().endswith(pdf_suffix):
            continue
        pdf_path = os.path.join(data_dir, filename)
        # The result must not be bound to the name ``clean_text``: that would make
        # the name local to this function and the call itself would raise
        # UnboundLocalError.
        cleaned = clean_text(pdf_to_text(pdf_path))
        # Swap only the trailing suffix; str.replace would also rewrite any
        # ".pdf" occurring earlier in the file name.
        txt_name = filename[:-len(pdf_suffix)] + ".txt"
        output_path = os.path.join(output_dir, txt_name)
        # OCR output may contain non-ASCII characters, so pin the encoding
        # instead of relying on the platform default.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned)

# Example usage: OCR the sample invoices into the processed-invoices folder.
if __name__ == "__main__":
    data_dir, output_dir = "data/sample_invoices", "data/processed_invoices"
    process_invoices(data_dir, output_dir)


### 3. Data Preprocessing
#### Clean and preprocess the extracted text

In [None]:
import re

def clean_text(text):
    """Normalise whitespace: every run of whitespace becomes one space.

    Args:
        text: The raw string to clean.

    Returns:
        The string with each whitespace run (spaces, tabs, newlines) replaced
        by a single space; nothing is stripped from the ends.
    """
    collapsed = re.sub(pattern=r'\s+', repl=' ', string=text)
    return collapsed

### 4. Model Training
#### Use a pre-trained model and fine-tune it

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric

# Load a pre-trained model and tokenizer
# NOTE(review): confirm this checkpoint id exists on the model hub.
model_name = "dbmdz/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): `num_labels` is not defined anywhere in the visible notebook;
# it must be set (number of token classes) before this line or it raises NameError.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Prepare the dataset
# 'your_dataset' is a placeholder name and must be replaced with a real dataset.
# NOTE(review): no tokenization / label-alignment step is visible between loading
# the dataset and handing it to the Trainer -- confirm the dataset is already
# in the form the model expects.
dataset = load_dataset('your_dataset')

# Define training arguments
# Checkpoints and logs go to ./results; evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create a Trainer instance
# Uses the dataset's 'train' split for training and its 'test' split for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

## Part 2: Model Optimization
### 1. Convert to ONNX
#### Export the trained model to ONNX format

In [None]:
import torch
import onnx

# Export the model
# A token-classification transformer looks its first input up in an embedding
# table, so the tracing input must be integer token ids; random floats with
# requires_grad are rejected by the embedding lookup.
export_device = next(model.parameters()).device  # model may still be on the GPU after training
dummy_input = torch.randint(
    low=0,
    high=model.config.vocab_size,
    size=(1, 128),  # (batch, sequence length) -- adjust input size as needed
    dtype=torch.long,
    device=export_device,
)
torch.onnx.export(
    model,
    dummy_input,
    "invoice_model.onnx",
    # Stable tensor names let the client feed the graph by name.
    input_names=["input_ids"],
    output_names=["logits"],
    # Keep batch size and sequence length variable instead of baking in (1, 128).
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "logits": {0: "batch", 1: "sequence"},
    },
)

### 2. Optimize the Model
#### Use techniques like quantization

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamically quantize the exported graph, storing weights as unsigned 8-bit
# integers, and write the result next to the original model file.
quantized_model_path = "invoice_model_quantized.onnx"
quantize_dynamic(
    "invoice_model.onnx",
    quantized_model_path,
    weight_type=QuantType.QUInt8,
)

## Part 3: Model Deployment
### 1. Set Up Client Environment
#### Ensure the client machine has the necessary libraries installed

In [None]:
# Install ONNX Runtime into the environment of the running kernel.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install onnxruntime

### 2. Load and Run the Model
#### Write a script to load and run the optimized model on the client desktop

In [None]:
import onnxruntime as ort

def load_model(model_path):
    """Create an ONNX Runtime inference session for the model file at ``model_path``.

    Args:
        model_path: Path of the ``.onnx`` file to load.

    Returns:
        The ``ort.InferenceSession`` built from that file.
    """
    return ort.InferenceSession(model_path)

def run_model(session, input_data):
    """Feed ``input_data`` to the session's first declared input and run it.

    Args:
        session: Object exposing ``get_inputs()`` and ``run(output_names, feed)``,
            such as the session returned by ``load_model``.
        input_data: Value bound to the name of the session's first input.

    Returns:
        Whatever ``session.run`` returns when called with ``None`` as the
        output selection and a single-entry feed dictionary.
    """
    first_input = session.get_inputs()[0]
    feed = {first_input.name: input_data}
    return session.run(None, feed)

# numpy builds the example input below; nothing earlier in the notebook imports it.
import numpy as np

# Load the quantized model
session = load_model("invoice_model_quantized.onnx")

# Prepare input data (example)
# The model is a token classifier, so its input is a batch of integer token ids
# rather than float noise. Small ids are used so they stay inside the vocabulary.
input_data = np.random.randint(0, 1000, size=(1, 128), dtype=np.int64)  # Adjust input size as needed

# Run the model
outputs = run_model(session, input_data)
print(outputs)