In [1]:
import json
import pandas as pd
import torch
import accelerate
from datasets import Dataset
from transformers import LayoutLMTokenizer, LayoutLMForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

print("Imports successful.")

Imports successful.


In [2]:
# Dataset
dataset_path = '/Users/almonsubba/Desktop/pdf_app/dataset.json'

with open(dataset_path, 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)
print("Dataset loaded successfully.")
df.head()

Dataset loaded successfully.


Unnamed: 0,image_path,words,bbox,ner_tags
0,pdf_images/sept23Alchemy_page_1.png,"[organization_name, organization_name, invoice...","[[0.04941176470588235, 0.03772727272727273, 0....","[organization_name, organization_name, invoice..."
1,pdf_images/sep23cloudflare_page_1.png,"[invoice_number, invoice_date, organization_na...","[[0.22294117647058823, 0.09227272727272727, 0....","[invoice_number, invoice_date, organization_na..."
2,pdf_images/sep23cloudflare_page_2.png,"[amount, organization_name, amount, amount, in...","[[0.5670588235294117, 0.11136363636363636, 0.6...","[amount, organization_name, amount, amount, in..."
3,pdf_images/sept23figma_page_1.png,"[invoice_number, invoice_date, organization_na...","[[0.16588235294117648, 0.08681818181818182, 0....","[invoice_number, invoice_date, organization_na..."


In [3]:
# Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df)
print("Hugging Face Dataset created successfully.")
hf_dataset

Hugging Face Dataset created successfully.


Dataset({
    features: ['image_path', 'words', 'bbox', 'ner_tags'],
    num_rows: 4
})

In [4]:
# Initialize Tokenizer and Model
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=5)

print("Tokenizer and Model initialized.")

Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer and Model initialized.


In [5]:
# Preprocess Data 
label_map = {"organization_name": 0, "invoice_number": 1, "invoice_date": 2, "amount": 3, "invoice_currency": 4}

def preprocess_data(batch):
    words = batch['words']
    bbox = batch['bbox']
    
    encoding = tokenizer.batch_encode_plus(
        words, 
        boxes=bbox, 
        truncation=True, 
        padding=True, 
        return_tensors="pt", 
        is_split_into_words=True
    )
    
    labels = [[label_map[label] for label in label_list] for label_list in batch['ner_tags']]
    
    encoding['labels'] = labels
    
    return encoding

encoded_dataset = hf_dataset.map(preprocess_data, batched=True)
print("Data preprocessed successfully.")
encoded_dataset


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Keyword arguments {'boxes': [[[0.04941176470588235, 0.03772727272727273, 0.29, 0.09136363636363637], [0.29470588235294115, 0.045909090909090906, 0.35705882352941176, 0.06272727272727273], [0.8629411764705882, 0.08318181818181818, 0.9488235294117647, 0.10545454545454545], [0.8564705882352941, 0.10818181818181818, 0.9488235294117647, 0.12772727272727272], [0.8647058823529412, 0.3418181818181818, 0.9235294117647059, 0.36272727272727273], [0.8629411764705882, 0.4081818181818182, 0.9217647058823529, 0.42863636363636365], [0.8652941176470588, 0.34363636363636363, 0.8782352941176471, 0.36272727272727273], [0.8652941176470588, 0.4113636363636364, 0.8794117647058823, 0.42954545454545456], [0.17, 0.5081818181818182, 0.2023529411764706, 0.5222727272727272]], [[0.22294117647058823, 0.09227272727272727, 0.40058823529411763, 0.11863636363636364], [0.22058823529411764, 0.12318181818181818, 0.31, 0.13863636363636364], [0.11411764705882353, 0.17636363636363636, 0.1776470588235294, 0.18818181818181817],

Data preprocessed successfully.


Dataset({
    features: ['image_path', 'words', 'bbox', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [6]:
# Split Dataset
train_test_split = encoded_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

print("Dataset split into training and testing sets.")
train_dataset, test_dataset

Dataset split into training and testing sets.


(Dataset({
     features: ['image_path', 'words', 'bbox', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 3
 }),
 Dataset({
     features: ['image_path', 'words', 'bbox', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 1
 }))

In [7]:
# Training
training_args = TrainingArguments(
    output_dir="/Users/almonsubba/Desktop/pdf_app/pdf_training_folder",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
#     print("Trainer initialized successfully.")
# except Exception as e:
#     print(f"Error initializing Trainer: {e}")

TypeError: Accelerator.__init__() got an unexpected keyword argument 'use_seedable_sampler'