In [39]:
%pip install -q datasets jiwer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [22]:
import os
import torch
import pandas as pd
import requests
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
from tqdm.notebook import tqdm

# Load pre-trained model checkpoint `VisionEncoderåDecoderModel`
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
model = VisionEncoderDecoderModel.from_pretrained(
    'microsoft/trocr-base-printed')

# Set the decoder_start_token_id
model.config.decoder_start_token_id = processor.tokenizer.bos_token_id

# Ensure the pad_token_id is set correctly
model.config.pad_token_id = processor.tokenizer.pad_token_id

# Test specific example
url = 'https://fki.tic.heia-fr.ch/static/img/a01-122-02-00.jpg'
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True)[0]
print("Generated text:", generated_text)

# Path to the IAM image data
root_dir = '/Users/adamking/Documents/NaturalLanguageProcessing/nlp-final-project/IAM/image'

# Path to the gt_test.txt test information file
gt_file_path = os.path.join(root_dir, '../gt_test.txt')

# Character error rate for accuracy measurement
cer = load_metric("cer")

# Read gt_test.txt and create file_names and texts lists
file_names = []
texts = []

with open(gt_file_path, 'r') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            file_names.append(parts[0])
            texts.append(parts[1])

# Check if any files are missing texts
assert len(file_names) == len(
    texts), "File names and texts lists must be of the same length"

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generated text: INDLUS THE


In [23]:
# Define the dataset class
class IAMDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # get file name + text
        file_name = self.df['file_name'][idx]
        text = self.df['text'][idx]
        # prepare image (i.e. resize + normalize)
        image = Image.open(os.path.join(
            self.root_dir, file_name)).convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(
            text, padding="max_length", max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [label if label !=
                  self.processor.tokenizer.pad_token_id else -100 for label in labels]
        encoding = {"pixel_values": pixel_values.squeeze(),
                    "labels": torch.tensor(labels)}
        return encoding

In [24]:
# Create DataFrame
df = pd.DataFrame({'file_name': file_names, 'text': texts})

# Split DataFrame into train, test, and validation sets
train_df = df.sample(frac=0.8, random_state=42).reset_index(drop=True)
remaining = df.drop(train_df.index).reset_index(drop=True)
test_df = remaining.sample(frac=0.5, random_state=42).reset_index(drop=True)
valid_df = remaining.drop(test_df.index).reset_index(drop=True)

# Create datasets
train_dataset = IAMDataset(root_dir=root_dir, df=train_df, processor=processor)
test_dataset = IAMDataset(root_dir=root_dir, df=test_df, processor=processor)
eval_dataset = IAMDataset(root_dir=root_dir, df=valid_df, processor=processor)

# DataLoaders
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(eval_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [17]:
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

In [18]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0

In [44]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
    return total_loss / len(dataloader)


# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 7  # Increase this, as early stopping will prevent overfitting
early_stopping = EarlyStopping(patience=5, min_delta=0.01)

In [45]:
# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Training loss: {train_loss}")

    val_loss = evaluate(model, valid_loader, device)
    print(f"Validation loss: {val_loss}")

    early_stopping(val_loss)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Optional: Save the model if it's the best so far
    if early_stopping.best_loss == val_loss:
        torch.save(model.state_dict(), 'best_model.pth')

# Load the best model before final evaluation
model.load_state_dict(torch.load('best_model.pth'))

# Final evaluation on test set
test_loss = evaluate(model, test_loader, device)
print(f"Test loss: {test_loss}")

Epoch 1/10


Training:   0%|          | 0/292 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
# Load model
drive_model_path = 'PATH TO MODEL'
model = VisionEncoderDecoderModel.from_pretrained(
    'microsoft/trocr-base-printed')
model.load_state_dict(torch.load(drive_model_path, map_location=torch.device('cpu')))

# Define a function to preprocess and predict the text from an image


# def predict_text_from_image(image_path, model, processor, device):
#     image = Image.open(image_path).convert("RGB")
#     pixel_values = processor(
#         images=image, return_tensors="pt").pixel_values.to(device)
#     model.eval()
#     with torch.no_grad():
#         generated_ids = model.generate(pixel_values)
#     generated_text = processor.batch_decode(
#         generated_ids, skip_special_tokens=True)[0]
#     return generated_text


# # Paths to the new sample images
# new_image_paths = [
#     'PATH TO WRITING 1/adam_writing.jpeg',
#     'PATH TO WRITING/tarun_writing.jpeg'
# ]

# # Predict and print the text for each new sample image
# for image_path in new_image_paths:
#     predicted_text = predict_text_from_image(
#         image_path, model, processor, device)
#     print(f"Predicted text for {image_path}: {predicted_text}")

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [28]:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from datasets import load_metric
from tqdm.notebook import tqdm

# Function to calculate accuracy
def calculate_accuracy(pred_str, label_str):
    correct = 0
    total = 0
    for pred, label in zip(pred_str, label_str):
        if pred.strip() == label.strip():
            correct += 1
        total += 1
    accuracy = correct / total if total > 0 else 0
    return accuracy

# Function to evaluate the model and calculate accuracy and CER
def evaluate_model(model, dataloader, processor, device):
    model.eval()
    total_loss = 0
    all_pred_str = []
    all_label_str = []
    cer_metric = load_metric("cer")

    with torch.no_grad():
        for batch in tqdm(dataloader):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            generated_ids = model.generate(pixel_values)
            pred_str = processor.batch_decode(
                generated_ids, skip_special_tokens=True)
            labels[labels == -100] = processor.tokenizer.pad_token_id
            label_str = processor.batch_decode(
                labels, skip_special_tokens=True)

            all_pred_str.extend(pred_str)
            all_label_str.extend(label_str)
            cer_metric.add_batch(predictions=pred_str, references=label_str)

    avg_loss = total_loss / len(dataloader)
    accuracy = calculate_accuracy(all_pred_str, all_label_str)
    cer = cer_metric.compute()

    return avg_loss, accuracy, cer


# Load the best model before final evaluation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VisionEncoderDecoderModel.from_pretrained(
    'microsoft/trocr-base-printed')
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')

# Set the decoder_start_token_id and pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# Load the saved model state
model.load_state_dict(torch.load('PATH TO MODEL'), map_location=device))
model.to(device)

# Final evaluation on test set
test_loss, test_accuracy, test_cer = evaluate_model(
    model, test_loader, processor, device)
print(f"Test loss: {test_loss}")
print(f"Test accuracy: {test_accuracy * 100:.2f}%")
print(f"Character error rate on test set: {test_cer * 100:.2f}%")

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/37 [00:00<?, ?it/s]



Test loss: 0.5260870975215692
Test accuracy: 49.66%
Character error rate on test set: 10.57%
