Import `os` and set the required OS environment variables here.

In [None]:
# Warning: CUDA_LAUNCH_BLOCKING is a debugging aid only; it is not recommended in production.
# Enabling it will SLOW DOWN the training process.

import os

os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

Import the libraries used for training.

In [20]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch import nn, Tensor
from torch.nn import CrossEntropyLoss
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.models import resnet18, ResNet50_Weights, ResNet18_Weights
from transformers import PreTrainedTokenizerFast
import time
from PIL import Image
import os

Initialize Weights & Biases (wandb) to record the experiment.

In [None]:
import wandb

# Start the run that collects the metrics logged by the validation and test steps.
wandb.init(project="OCR_Recognition", name="ResNet18_LSTM_1")

# The logged configuration mirrors what this notebook actually builds:
# a ResNet18 backbone with IMAGENET1K_V1 weights, AdamW (lr=0.001, weight_decay=0.01),
# CosineAnnealingLR (T_max=10, eta_min=0) and CrossEntropyLoss.
wandb.config.update({"starting_learning_rate": 0.001, "epochs": 200, "batch_size": 32, "weight_decay": 0.01})
wandb.config.update({"cnn_backend": "ResNet18", "dataset": "OCR", "optimizer": "AdamW", "scheduler": "CosineAnnealingLR"})
wandb.config.update({"loss_function": "CrossEntropyLoss", "pretrained": True, "pretrained_weights": "IMAGENET1K_V1"})
wandb.config.update({"lr_scheduler": "CosineAnnealingLR", "lr_T_max": 10, "lr_eta_min": 0})

Define the Custom Tokenizer used for OCR Detection Task

In [2]:
# Build the custom tokenizer for the OCR task from its serialized definition file
tokenizer_path = 'C:/Users/ra78lof/occinference/byte-level-BPE.tokenizer.json'
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
#feature_extractor = AutoFeatureExtractor.from_pretrained('microsoft/swin-base-batch4-window7-224-in22k')

# Register a PAD token in the vocabulary; without it, padding raises an error
tokenizer.add_special_tokens(dict(pad_token="pad_token"))

# Optional debug checks (uncomment to use):
#   blank token, required for CTC loss      -> tokenizer.decode(62)
#   pad token id, required for padding      -> print(tokenizer.pad_token_id)
#   vocabulary size, needed to build model  -> print(len(tokenizer))

0

Define the CustomDataset Class

In [3]:
class CustomDataset(Dataset):
    """Dataset of (image, token-id label) pairs listed in an Excel sheet.

    The sheet is read with ``pd.read_excel`` and must contain an ``ImageName``
    column (image file name or path) and a ``Labels`` column (target text).

    Args:
        excel_file: Path of the Excel file listing the samples.
        img_dir: Directory holding the images. It is used to resolve
            ``ImageName`` entries that cannot be opened as given.
        tokenizer: Callable tokenizer applied to the label text. Defaults to
            the notebook-level ``tokenizer``.
        feature_extractor: Stored on the instance; not used by this class.
        transform: Optional callable applied to the grayscale PIL image.
        max_target_length: Fixed length every label sequence is padded or
            truncated to, so that samples can be stacked into a batch.
    """

    def __init__(self, excel_file, img_dir, tokenizer = tokenizer, feature_extractor = None, transform=None, max_target_length = 45):
        self.data = pd.read_excel(excel_file)
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.transform = transform
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows (samples) in the sheet."""
        return len(self.data)

    def _resolve_image_path(self, img_name):
        """Return the path to open for ``img_name``.

        A name that already points to an existing file is used unchanged, so
        sheets that store complete paths keep working. Otherwise the name is
        looked up inside ``img_dir``.
        """
        raw_path = str(img_name)
        if not self.img_dir or os.path.exists(raw_path):
            return raw_path
        return os.path.join(self.img_dir, raw_path)

    def __getitem__(self, idx):
        """Return ``(image, labels)`` for the sample at position ``idx``.

        ``image`` is the grayscale image after the optional transform and
        ``labels`` is a tensor holding ``max_target_length`` token ids.
        """
        # .iloc is positional, so this also works when the frame's index is not 0..n-1
        img_name = self.data['ImageName'].iloc[idx]
        text = self.data['Labels'].iloc[idx]

        # The context manager closes the file handle once the pixels are converted
        with Image.open(self._resolve_image_path(img_name)) as img:
            image = img.convert('L')

        if self.transform:
            image = self.transform(image)

        # The labels MUST all have the same length: pad short ones and truncate
        # long ones, otherwise samples cannot be stacked into one batch tensor
        labels = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_target_length,
        ).input_ids

        return image, torch.as_tensor(labels)

Define the model architecture. This CNN-LSTM architecture consists of the following parts: a modified ResNet18 with pretrained weights, several intermediate convolutional layers, and LSTM layers.

In [17]:
from torch import nn, Tensor

class ModifiedResNet(nn.Module):
    """
    CNN-LSTM recognizer built on top of a ResNet backbone for OCR.

    The backbone's last two children (avgpool and fc in a torchvision ResNet)
    are dropped, three extra convolution stages shrink the feature map, a 1x1
    convolution reduces the channels, and the flattened spatial positions are
    fed as a sequence to a bidirectional LSTM followed by a linear classifier.

    Attributes:
        features (nn.Sequential): All children of the original network except the last two.
            The first following layer expects 512 input channels.
        conv1 (nn.Conv2d): 1-channel stem convolution. It is NOT called in ``forward``;
            it is kept so that parameter names of previously saved state dicts still match.
        post_resnet1 / bn1 / relu1: First extra convolution stage (stride 2).
        post_resnet2 / bn2 / relu2: Second extra convolution stage (stride 1).
        post_resnet3 / bn3 / relu3: Third extra convolution stage (stride 2).
        dwv (nn.Conv2d): 1x1 (pointwise) convolution reducing 512 channels to 128.
        lstm1 (nn.LSTM): Two-layer bidirectional LSTM over the flattened positions (batch-first).
        linear1 (nn.Linear): Projects the 256 LSTM features to class scores.
    """

    def __init__(self, original_resnet: nn.Module, num_classes: int = 82):
        """
        Initializes the ModifiedResNet from an existing backbone.

        Args:
            original_resnet (nn.Module): Backbone whose children, minus the last two,
                form the feature extractor. Its modules are reused, not copied.
            num_classes (int): Size of the output vocabulary. Defaults to 82
                (number of classes including the PAD token).
        """
        super().__init__()
        self.features = nn.Sequential(*list(original_resnet.children())[:-2])  # Drop the last two children (avgpool and fc)
        # Unused in forward(); retained for state-dict compatibility with saved checkpoints
        self.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

        self.post_resnet1 = nn.Conv2d(512, 512, kernel_size=(2, 3), stride=(2, 2), padding=(1, 1), bias=False)
        self.bn1 = nn.BatchNorm2d(512)
        self.relu1 = nn.ReLU(inplace=True)

        self.post_resnet2 = nn.Conv2d(512, 512, kernel_size=(3, 4), stride=(1, 1), padding=(1, 1), bias=False)
        self.bn2 = nn.BatchNorm2d(512)
        self.relu2 = nn.ReLU(inplace=True)

        self.post_resnet3 = nn.Conv2d(512, 512, kernel_size=(2, 3), stride=(2, 2), padding=(1, 1), bias=False)
        self.bn3 = nn.BatchNorm2d(512)
        self.relu3 = nn.ReLU(inplace=True)

        self.dwv = nn.Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)  # Pointwise conv: 512 -> 128 channels
        # batch_first=True: forward() hands the LSTM a (batch, sequence, feature) tensor.
        # With the default layout the recurrence would run across the batch dimension
        # and mix information between unrelated samples.
        self.lstm1 = nn.LSTM(bidirectional=True, num_layers=2, input_size=128, hidden_size=128, dropout=0, batch_first=True)
        self.linear1 = nn.Linear(256, num_classes)  # 256 = 2 directions x 128 hidden units

    def forward(self, x: Tensor) -> Tensor:
        """
        Defines the forward pass of the ModifiedResNet.

        Args:
            x (Tensor): Image batch accepted by the backbone, laid out as (batch, channels, height, width).

        Returns:
            Tensor: Class scores of shape (batch, height * width of the final feature map, num_classes).
        """
        x = self.features(x)
        x = self.relu1(self.bn1(self.post_resnet1(x)))
        x = self.relu2(self.bn2(self.post_resnet2(x)))
        x = self.relu3(self.bn3(self.post_resnet3(x)))
        x = self.dwv(x)

        # (batch, channels, H, W) -> (batch, H * W, channels): every spatial position,
        # in row-major order, becomes one step of the sequence fed to the LSTM
        x = x.flatten(2).transpose(1, 2).contiguous()
        x, _ = self.lstm1(x)
        return self.linear1(x)


A simple debug test with a dummy input

In [5]:
# Smoke test: push one random batch through ModifiedResNet and print the output shape


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Random grayscale batch with the batch size and image size used elsewhere in this notebook
dummy_input = torch.randn(32, 1, 500, 1200).to(device)

# Pretrained ResNet18 whose first convolution is replaced by a 1-channel version
original_resnet = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
original_resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

model = ModifiedResNet(original_resnet).to(device)

# No gradients are needed for a shape check
with torch.no_grad():
    output = model(dummy_input)
    print(output.shape)


torch.Size([32, 45, 82])


Prepare the training, validation, and test datasets

In [6]:
# Preprocessing applied to every image before it reaches the model

preprocessing_steps = [
    transforms.Resize((500, 1200)),  # Bring every image to the fixed 500 x 1200 size
    transforms.ToTensor(),  # PIL image -> PyTorch tensor
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
# Build the training, validation and test datasets

# Every split uses the same tokenizer and image preprocessing
shared_kwargs = dict(tokenizer=tokenizer, transform=transform)

# NOTE(review): train_dataset reads the same Test_data files as test_dataset - confirm this is intended
train_dataset = CustomDataset(
    excel_file='C:/Users/ra78lof/occinference/Test_data.xlsx',
    img_dir='C:/Users/ra78lof/occinference/Test_data/',
    **shared_kwargs,
)

valid_dataset = CustomDataset(
    excel_file='C:/Users/LMMISTA-WAP265/OcciGen/data/dom_project/Val_data.xlsx',
    img_dir='C:/Users/LMMISTA-WAP265/OcciGen/data/dom_project/Val_data/',
    **shared_kwargs,
)

test_dataset = CustomDataset(
    excel_file='C:/Users/ra78lof/occinference/Test_data.xlsx',
    img_dir='C:/Users/ra78lof/occinference/Test_data/',
    **shared_kwargs,
)

In [None]:
# Define the DataLoaders

# Only the training loader shuffles; the evaluation loaders keep the dataset order
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
# The test loader must wrap test_dataset, not train_dataset
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Define the model, optimizer, scheduler and loss function

In [9]:
# Define the model, optimizer, scheduler and loss function

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained ResNet18 whose first convolution is replaced by a 1-channel (grayscale) version
original_resnet = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
original_resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
model = ModifiedResNet(original_resnet).to(device)

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# `verbose` is deprecated in recent PyTorch releases and `last_epoch=-1` is the default,
# so both are omitted; the schedule itself (T_max=10, eta_min=0) is unchanged
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)

# The following scheduler can be used during validation
# scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4, threshold=0.0001, min_lr=0.00001)
# ...valid_loss += loss.item()
#    scheduler.step(valid_loss)...

# The CTC loss function is returning blank predictions for some reason
# criterion = CTCLoss(blank=0, reduction='mean', zero_infinity=True)
criterion = nn.CrossEntropyLoss()

Define the train process

In [11]:
# Use Wandb to track the Epochs
# wandb.watch(model, log="all")
epochs = 200

def train(model, train_loader, optimizer, epochs=None, lr_scheduler=None):
    """Run one training epoch and return the average loss per batch.

    Args:
        model: Network to train. It must return scores shaped
            (batch, sequence, classes).
        train_loader: Iterable yielding ``(data, target)`` batches, where
            ``target`` holds one class id per sequence position.
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of the current epoch. It is only used in the progress
            message and may be omitted.
        lr_scheduler: Learning-rate scheduler stepped once at the end of the
            epoch. When omitted, the notebook-level ``scheduler`` is used.

    Returns:
        float: Sum of the per-batch mean losses divided by the number of
        batches (the same normalisation as the validation and test steps),
        or 0.0 when the loader yields no batch.
    """
    model.train()
    train_loss = 0
    start_time = time.time()

    criterion = CrossEntropyLoss()

    # Move each batch to wherever the model lives; fall back to the
    # notebook-level `device` only for a model without parameters
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device
    active_scheduler = lr_scheduler if lr_scheduler is not None else scheduler
    epoch_label = "?" if epochs is None else epochs
    num_batches = len(train_loader)

    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(run_device)
        target = target.to(run_device)

        optimizer.zero_grad()
        output = model(data)

        # Flatten (batch, sequence, classes) -> (batch * sequence, classes) so that
        # every sequence position is scored as an independent classification
        loss = criterion(output.reshape(-1, output.size(2)), target.reshape(-1))

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # wandb.log({'Train Loss': train_loss / (batch_idx + 1)})

        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tAverage loss: {:.6f}'.format(
                epoch_label, batch_idx * len(data), len(train_loader.dataset),
                       100. * batch_idx / num_batches,
                       train_loss / (batch_idx + 1)))

    # One scheduler step per epoch
    active_scheduler.step()
    end_time = time.time()
    print("Time taken for epoch: ", end_time - start_time)
    # Each batch loss is already a mean, so average over batches rather than samples
    return train_loss / num_batches if num_batches else 0.0

Define the validation process

In [None]:
def validation(model, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` and return the average batch loss.

    Uses the notebook-level ``criterion`` and ``device``. The result is also
    logged to wandb under ``'Validation Loss'`` and printed.
    """
    model.eval()
    running_loss = 0
    # Evaluation only: no gradients are tracked
    with torch.no_grad():
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device)
            logits = model(images)

            # Score every sequence position as an independent classification
            num_classes = logits.size(2)
            batch_loss = criterion(logits.view(-1, num_classes), labels.view(-1))

            running_loss += batch_loss.item()
            # scheduler.step(valid_loss)

    valid_loss = running_loss / len(valid_loader)
    wandb.log({'Validation Loss': valid_loss})
    print('\nValidation set: Average loss: {:.4f}\n'.format(valid_loss))
    return valid_loss

Define the test process

In [None]:
def test(model, test_loader, loss_fn=None, checkpoint_path='C:/Users/ra78lof/occinference/ocr_model.pt'):
    """Load a checkpoint into ``model`` and return its average batch loss on ``test_loader``.

    Args:
        model: Network that receives the checkpoint weights. The weights stay
            loaded in ``model`` after the call.
        test_loader: Iterable yielding ``(data, target)`` batches.
        loss_fn: Loss callable. When omitted, the notebook-level ``criterion``
            is used. Accepting it as third positional argument also supports
            callers that pass the criterion explicitly.
        checkpoint_path: State-dict file to evaluate.

    Returns:
        float: Sum of the batch losses divided by the number of batches. The
        value is also logged to wandb under ``'Test Loss'`` and printed.
    """
    if loss_fn is None:
        loss_fn = criterion

    # map_location lets a checkpoint saved on another device load on the current one
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)

            #tokenizer.batch_decode(output, skip_special_tokens=True)
            # Score every sequence position as an independent classification
            loss = loss_fn(output.view(-1, output.size(2)), target.view(-1))
            test_loss += loss.item()

    test_loss /= len(test_loader)
    wandb.log({'Test Loss': test_loss})
    print('\nTest set: Average loss: {:.4f}\n'.format(test_loss))
    return test_loss


Define the complete training, validation, and test process

In [None]:
def main(training_epochs, save_dir, model_save_name):
    """Train for ``training_epochs`` epochs, checkpoint the best model and plot the losses.

    Works on the notebook-level ``model``, ``optimizer`` and data loaders.

    Args:
        training_epochs: Number of epochs to run.
        save_dir: Directory for the checkpoint; it is created when missing.
        model_save_name: File name of the checkpoint inside ``save_dir``.
    """
    import copy  # local import: only needed for the weight snapshot below

    os.makedirs(save_dir, exist_ok=True)
    model_save_path = os.path.join(save_dir, model_save_name)

    best_loss = float('inf')
    # Lists to keep track of losses over epochs
    train_losses = []
    valid_losses = []
    test_losses = []

    for epoch in range(1, training_epochs + 1):
        # Capture train loss
        train_loss = train(model, train_loader, optimizer, epoch)
        train_losses.append(train_loss)

        # Capture validation loss
        valid_loss = validation(model, valid_loader)
        valid_losses.append(valid_loss)

        # Keep only the checkpoint with the lowest validation loss
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), model_save_path)
            print(f'Model saved at epoch {epoch} with validation loss: {valid_loss:.6f}')

        # Capture test loss. test() loads checkpoint weights into `model`, so the
        # weights being trained are snapshotted first and restored afterwards;
        # otherwise the next epoch would continue from the checkpoint.
        resume_state = copy.deepcopy(model.state_dict())
        test_loss = test(model, test_loader)
        model.load_state_dict(resume_state)
        test_losses.append(test_loss)

        print(f'Test Loss at epoch {epoch}: {test_loss:.6f}')
        print(f'Epoch {epoch}/{training_epochs}, Best Loss: {best_loss:.6f}\n')

    # Visualize the losses over epochs: one x value per recorded epoch
    epoch_axis = range(1, len(train_losses) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(epoch_axis, train_losses, label='Training Loss', marker='o')
    plt.plot(epoch_axis, valid_losses, label='Validation Loss', marker='o')
    plt.plot(epoch_axis, test_losses, label='Test Loss', marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

# Launch the run: 200 epochs, with a placeholder output directory and checkpoint name
main(training_epochs=200, save_dir='save_directory', model_save_name='model_name.pth')

Everything is set up, let's go!

In [23]:
# Set parameters
training_epochs = 200
save_dir = 'C:/Users/ra78lof/occinference'
model_save_name = 'ocr_model.pt'

# exist_ok=True creates the directory only when it is missing, without a separate
# check-then-create step that could race with another process
os.makedirs(save_dir, exist_ok=True)

In [18]:
model = ModifiedResNet(original_resnet).to(device)

# Load our trained model; map_location lets a checkpoint saved on another device load here
model.load_state_dict(torch.load('C:/Users/ra78lof/occinference/ocr_model_10.22_9.pt', map_location=device))

# `model` is a new object, so the optimizer and scheduler must be rebuilt on its
# parameters (same hyperparameters as the setup cell); otherwise the layers created
# by this instance would never be updated
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)

# Call main function with the run parameters configured above
main(training_epochs, save_dir, model_save_name)

<All keys matched successfully>

In [24]:
for epoch in range(1, 200):
    # train() takes the epoch number as its fourth argument (used in the progress message)
    train_loss = train(model, train_loader, optimizer, epoch)
    # Overwrite the checkpoint after every epoch
    torch.save(model.state_dict(), os.path.join(save_dir, 'ocr_model_10.22_nico.pt'))
    print('Model Saved')

Time taken for epoch:  177.0628731250763
Model Saved
Time taken for epoch:  176.05991744995117
Model Saved
Time taken for epoch:  176.69951248168945
Model Saved
Time taken for epoch:  176.2782781124115
Model Saved
Time taken for epoch:  176.85756039619446
Model Saved
Time taken for epoch:  176.91308784484863
Model Saved
Time taken for epoch:  177.16970205307007
Model Saved
Time taken for epoch:  176.50205731391907
Model Saved
Time taken for epoch:  188.41478276252747
Model Saved
Time taken for epoch:  211.09990048408508
Model Saved
Time taken for epoch:  210.6844358444214
Model Saved
Time taken for epoch:  205.44177412986755
Model Saved
Time taken for epoch:  210.15373754501343
Model Saved
Time taken for epoch:  210.32260370254517
Model Saved
Time taken for epoch:  210.1802523136139
Model Saved
Time taken for epoch:  210.14498233795166
Model Saved
Time taken for epoch:  210.13938093185425
Model Saved
Time taken for epoch:  207.47882270812988
Model Saved
Time taken for epoch:  175.71558

KeyboardInterrupt: 

Decode the Inference result

In [26]:
# Enumerate the test dataset to get the predictions
label_list = []
pred_list = []

model = ModifiedResNet(original_resnet).to(device)

# Load our trained model; map_location lets a checkpoint saved on another device load here
model.load_state_dict(torch.load('C:/Users/ra78lof/occinference/ocr_model_10.22_nico.pt', map_location=device))
# A freshly built module is in training mode; switch to eval so that BatchNorm
# uses its running statistics instead of the statistics of each batch
model.eval()

with torch.no_grad():
    # test_loader is not shuffled, so the exported rows keep the dataset order
    for data, target in test_loader:
        output = model(data.to(device))
        # Index of the highest-scoring class per position; softmax does not change
        # the argmax, so it is skipped
        pred = torch.argmax(output, dim=2)
        label_list += tokenizer.batch_decode(target, skip_special_tokens=True)
        pred_list += tokenizer.batch_decode(pred, skip_special_tokens=True)

In [28]:
# Debugging aid: decode a single batch and print the labels next to the predictions.
# The snippet is wrapped in a triple-quoted string, so it is not executed as code.
# Note that it takes its batch from train_loader.
'''
data, target = next(iter(train_loader))
data = data.to(device)
target = target.to(device)
with torch.no_grad():
    output = model(data)
pred = output.softmax(dim=2)
pred = torch.argmax(pred, dim=2)

# Decode the predictions and labels
label = tokenizer.batch_decode(target, skip_special_tokens=True)
pred = tokenizer.batch_decode(pred, skip_special_tokens=True)

# Print the predictions and labels
print(f'Label: {label}')
print(f'Prediction: {pred}')
'''

Label: ['SEQUA@SECA', 'BLANQUINEU@BLANQUINEL', 'SEMMANA@SEMANA', 'BARRIU@BARRIL', 'BATISME@BAPTISME', 'ANIC@AMIC', 'MIRAC@MIRAT', 'DESCRIEURE@DESCRIURE', 'EMBOCHAR@EMBOCAR', 'POYSANSA@POISANSA', 'BASTARDO@BASTARDA', 'BENDENHA@VENDEMIA', 'MINIEYRA@MENIERA', 'SORTY@SORTIR', 'MEYNADES@MAINADA', 'FOPELANDA@OPALANDA', 'MESTURE@MESTURA', 'RECEPTATION@RECEPTACION', 'DAROCAR@DEROCAR', 'MELHUIRAR@MELHORAR', 'MARTYRIAR@MARTIRIAR', 'CARAMELAR@CALAMELAR', 'FILLOL@FILHOL', 'ASO@AZON', 'FOGASSE@FOGASA', 'CAMBA@CAMBE', 'PATEYAR@PATIAR', 'APENDRE@APRENDRE', 'CABIROUS@CABRION', 'FORASTEYR@FORESTIER', 'FRAGURA@FRACHURA', 'MERSÉS@MERCE']
Prediction: ['SEQUA@SEA', 'BLANQUINEU@BLANQUINEL', 'SEMMANA@SEMANA', 'BARRIU@BARRIL', 'BATISME@BAPTISME', 'ANIC@AMIC', 'MIRAC@MIRAT', 'DESCRIEURE@DESCRIURE', 'EMBOCHAR@EMBOCAR', 'POYSANSA@POISANSA', 'BASTARDO@BASTARDA', 'BENDENHA@VENDEMIA', 'MINIEYRA@MENIERA', 'SORTY@SORTIR', 'MEYNADES@MAINADA', 'FOPELANSA@OPALANDA', 'MESTURE@MESTURA', 'RECEPTATION@RECEPTACION', 'DAROCAR

In [30]:
# Write the collected labels and predictions side by side to an Excel file

predictions_frame = pd.DataFrame({'label': label_list, 'pred': pred_list})
predictions_frame.to_excel('C:/Users/ra78lof/occinference/ocr_predictions_10.22_10.xlsx', index=False)