In [1]:
!pip install git+https://github.com/githubharald/CTCDecoder.git jiwer python-bidi arabic-reshaper

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting git+https://github.com/githubharald/CTCDecoder.git
  Cloning https://github.com/githubharald/CTCDecoder.git to /tmp/pip-req-build-sy3qrcf0
  Running command git clone -q https://github.com/githubharald/CTCDecoder.git /tmp/pip-req-build-sy3qrcf0
  Resolved https://github.com/githubharald/CTCDecoder.git to commit 6b5c3dd34944e5399a7308e241319b7f9c47e7c3


In [2]:
# Choose which GPU this notebook is allowed to see (here: the GPU with index 2).
# This cell runs before torch is imported so the setting is already in place
# when CUDA is first used.
import os

GPU_INDEX = '2'  # value written to CUDA_VISIBLE_DEVICES
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_INDEX

In [3]:
# Identifier of this experiment; later cells use it as the output directory name
# and as the prefix of every saved plot / checkpoint path.
model_name = "ctc-crnn"

In [4]:
# Reload pegon_utils each time this cell runs so that edits to the module are
# picked up without restarting the kernel, then pull in the character tables.
import importlib
import pegon_utils
importlib.reload(pegon_utils)
from pegon_utils import PEGON_CHARS, CHAR_MAP

In [5]:
# Every key of CHAR_MAP must be exactly one character long.
# Gather all offending keys first so that a single run reports every problem,
# and put them in the error message instead of relying on a separate print.
invalid_keys = [key for key in CHAR_MAP.keys() if len(key) != 1]
if invalid_keys:
    raise AssertionError(
        f'CHAR_MAP keys must be single characters, got: {invalid_keys!r}')

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils as utils
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import transforms

from PIL import Image

import json
import os
import glob
import re
import datetime
import shutil
import pickle
import unicodedata

from functools import partial

from tqdm import tqdm
import matplotlib.pyplot as plt

import numpy as np
import random

import matplotlib.pyplot as plt

# Seed every random number generator used in this notebook (torch, numpy and
# the stdlib `random` module) with the same value, in that order.
seed = 2023
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(seed)

In [7]:
# Create the output directory named after the model (no error if it already
# exists); later cells write their plots and checkpoint into it.
os.makedirs(model_name, exist_ok=True)

In [8]:
importlib.reload(pegon_utils)
from pegon_utils import OCRDataset, QuranAnnotatedDataset
from torch.utils.data import random_split, ConcatDataset

# Load the synthesized dataset from its metadata file and attach the shared
# character map to it.
pegon_synth_dataset = OCRDataset().load('/workspace/Dataset/Synthesized-split/metadata.json')
pegon_synth_dataset.char_map = CHAR_MAP

# Image pipeline: single-channel grayscale -> resize to the dataset's
# (avg_img_h, avg_img_w) -> unconditional horizontal mirror (p=1) -> tensor.
# NOTE(review): the always-on flip presumably turns the right-to-left script
# into left-to-right order for the model; the demo cells mirror the image back
# before showing it. Confirm before removing.
dataset_transforms = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((pegon_synth_dataset.avg_img_h,
                       pegon_synth_dataset.avg_img_w)),
    transforms.RandomHorizontalFlip(p=1),
    transforms.ToTensor(),
])

pegon_synth_dataset.transform = dataset_transforms

# 70 % / 30 % train / validation split.
# The validation size is taken as the remainder rather than rounding both
# fractions independently: two separate round() calls can add up to one more or
# one less than len(dataset), which makes random_split raise a ValueError.
TRAIN_FRACTION = 0.7
num_samples = len(pegon_synth_dataset)
num_train = round(num_samples * TRAIN_FRACTION)
train_synth_dataset, val_synth_dataset = random_split(
    pegon_synth_dataset,
    lengths=[num_train, num_samples - num_train])

# Optional extra data (currently disabled): the annotated Quran dataset.
# To use it, uncomment the two datasets below and add them to the
# ConcatDataset tuples further down.
# quran_train_dataset = QuranAnnotatedDataset('/workspace/Dataset/Quran data set/dicriticText/traning',
#                       image_transform=dataset_transforms)
# quran_test_dataset = QuranAnnotatedDataset('/workspace/Dataset/Quran data set/dicriticText/test',
#                       image_transform=dataset_transforms)

# ConcatDataset wrappers keep the downstream code unchanged when more
# datasets are appended to these tuples.
train_dataset = ConcatDataset((train_synth_dataset,))  # + quran_train_dataset
val_dataset = ConcatDataset((val_synth_dataset,))  # + quran_test_dataset

In [9]:
# Sanity check: the dataset still carries the shared CHAR_MAP assigned above.
assert pegon_synth_dataset.char_map == CHAR_MAP

In [10]:
# OCR model: small CNN -> per-column dense layer -> two BiLSTMs -> classifier
class CTCCRNN(nn.Module):
    """Convolutional-recurrent network that emits per-time-step log-probabilities.

    The input is a batch of single-channel images, shape
    ``(batch, 1, height, width)``. The output has shape
    ``(batch, model_output_len, num_classes)`` and holds log-softmax scores
    over the classes for every time step.

    Args:
        num_classes: size of the output vocabulary (last output dimension).
        image_height: height of the input images; fixes the size of the
            first dense layer.
        image_width: width of the input images (stored, not used in the
            computation).
        model_output_len: number of time steps produced for every image.
        dropout_rate: dropout probability applied after the first dense layer.
        conv_channels: output channels of the first and second convolution.
        lstm_sizes: hidden sizes of the first and second bidirectional LSTM.
    """

    def __init__(self, num_classes, image_height, image_width,
                 model_output_len, dropout_rate, conv_channels,
                 lstm_sizes):
        super().__init__()

        # Keep the configuration around; forward() needs the target feature-map size.
        self.image_height = image_height
        self.image_width = image_width
        self.model_output_len = model_output_len
        self.conv_channels = conv_channels

        # Convolutional stage: two 3x3 conv + 2x2 max-pool pairs, so both
        # spatial dimensions are halved twice.
        self.conv1 = nn.Conv2d(1, conv_channels[0], kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(conv_channels[0], conv_channels[1],
                               kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Dense layer applied to each time step: it receives all channels of one
        # feature-map column flattened together (channels * image_height // 4).
        column_features = conv_channels[1] * (self.image_height // 4)
        self.fc1 = nn.Linear(column_features, conv_channels[1])
        self.dropout = nn.Dropout(dropout_rate)

        # Recurrent stage: two stacked bidirectional LSTMs (batch-first); the
        # second one consumes the concatenated forward/backward states of the first.
        self.lstm1 = nn.LSTM(conv_channels[1], lstm_sizes[0],
                             bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(2 * lstm_sizes[0], lstm_sizes[1],
                             bidirectional=True, batch_first=True)

        # Per-time-step classifier over the bidirectional LSTM output.
        self.fc2 = nn.Linear(2 * lstm_sizes[1], num_classes)

    def forward(self, x):
        """Map ``(batch, 1, H, W)`` images to ``(batch, T, num_classes)`` log-probs."""
        features = self.pool1(F.relu(self.conv1(x)))
        features = self.pool2(F.relu(self.conv2(features)))

        # Resample the feature map so its width equals the requested number
        # of time steps and its height matches what fc1 expects.
        target_size = (self.image_height // 4, self.model_output_len)
        features = F.interpolate(features, size=target_size,
                                 mode='bilinear', align_corners=False)

        # (batch, channels, height, steps) -> (batch, steps, channels * height):
        # every column of the feature map becomes one time step.
        batch, channels, height, steps = features.shape
        sequence = features.permute(0, 3, 1, 2).reshape(batch, steps,
                                                        channels * height)

        sequence = self.dropout(self.fc1(sequence))
        sequence, _ = self.lstm1(sequence)
        sequence, _ = self.lstm2(sequence)

        # Log-softmax over the class dimension.
        return F.log_softmax(self.fc2(sequence), dim=2)

In [11]:
importlib.reload(pegon_utils)
from pegon_utils import model_length

# Hyper-parameters of the network, gathered in one place.
# NOTE(review): model_length lives in pegon_utils; it presumably turns the
# longest label length into the number of output time steps - confirm there.
model_config = dict(
    num_classes=len(CHAR_MAP),
    image_height=pegon_synth_dataset.avg_img_h,
    image_width=pegon_synth_dataset.avg_img_w,
    model_output_len=model_length(b=2, c=100)(pegon_synth_dataset.max_seq_len),
    dropout_rate=0,
    conv_channels=[16, 16],
    lstm_sizes=[256, 128],
)

model = CTCCRNN(**model_config)

In [12]:
importlib.reload(pegon_utils)
from pegon_utils import CTCTrainer, FocalCTCLoss

# The optimizer is built here and handed to the trainer ready-made
# (the trainer's own `lr=1e-3` argument stays disabled).
optimizer = optim.AdamW(model.parameters(), lr=1e-3)

# Set up training on the training split.
trainer = CTCTrainer(
    model=model,
    max_norm=None,
    optimizer=optimizer,
    batch_size=4,
    num_workers=2,
    # Alternative loss, currently disabled:
    # criterion=FocalCTCLoss(zero_infinity=True,
    #                        alpha=0.5,
    #                        gamma=2),
    dataset=train_dataset,
)

In [13]:
# Smoke test: push the first training image through the model as a batch of one
# and check that the last output dimension has one entry per CHAR_MAP item.
sample_batch = train_dataset[0][0].unsqueeze(0).to(trainer.device)
sample_output = trainer.model(sample_batch)
assert sample_output.shape[-1] == len(CHAR_MAP)

In [14]:
# Capture the start time of this run once; later cells embed it in the file
# names of the saved plots and of the checkpoint under `model_name/`.
timestamp = datetime.datetime.now()
print(timestamp)

2023-05-19 00:00:41.715324


In [None]:
# Train for 10 epochs; the training loop itself is implemented by CTCTrainer
# in pegon_utils.
trainer.train(num_epochs=10)

Epoch [1/10] | Batch [13732/13732] | Running Loss: 54576.4899: 100%|██████████| 13732/13732 [22:59<00:00,  9.95it/s] 
Epoch [2/10] | Batch [13732/13732] | Running Loss: 62836.0411: 100%|██████████| 13732/13732 [22:30<00:00, 10.17it/s] 
Epoch [3/10] | Batch [6760/13732] | Running Loss: 26561.0908:  49%|████▉     | 6760/13732 [10:54<13:23,  8.68it/s]

In [None]:
# Write the training-history plot and the `.pt` file for this run; both paths
# are `<model_name>/<timestamp>.*`. The `.pt` path is the one the evaluation
# cells load from.
trainer.plot_history(path=f'{model_name}/{timestamp}.train.png')
trainer.save(f'{model_name}/{timestamp}.pt')

In [None]:
# Reload pegon_utils (to pick up edits) and import the decoding / evaluation helpers.
importlib.reload(pegon_utils)
from pegon_utils import CTCDecoder, BestPathDecoder, evaluate, plot_cer_wer

In [None]:
# Loader over the validation split, reusing the trainer's batch size, worker
# count and collate function.
# Alternative (disabled): evaluate on the trainer's own loader instead.
# dataloader = trainer.dataloader
dataloader = DataLoader(val_dataset,
                        batch_size=trainer.batch_size,
                        num_workers=trainer.num_workers,
                        shuffle=True,
                        collate_fn=trainer.collate_fn)

In [None]:
# Path of the checkpoint to evaluate. By default this is the file saved by this
# run; to evaluate an earlier run, uncomment one of the timestamps below.
# timestamp = '2023-05-02 15:56:58.754379'
# timestamp = '2023-05-02 20:53:48.971202'
model_path = f'{model_name}/{timestamp}.pt'

In [None]:
# Build a best-path decoder from the saved checkpoint; the blank symbol is the
# first entry of PEGON_CHARS. Then evaluate it on the validation loader.
# NOTE(review): `cers` / `wers` are presumably character / word error rates
# returned by pegon_utils.evaluate - confirm there.
decoder = BestPathDecoder.from_path(model_path, CHAR_MAP, blank_char=PEGON_CHARS[0])
cers, wers = evaluate(decoder, dataloader)

In [None]:
# Plot the validation-set results and save the figure next to the checkpoint.
plot_cer_wer(cers, wers, path=f'{model_name}/{timestamp}.wer-cer.png')

In [None]:
# demo: decode one random validation sample and show it with its ground truth

import arabic_reshaper
from bidi.algorithm import get_display


def to_arabic_display(text):
    """Prepare `text` for a plot title: reshape the letters, then apply bidi ordering.

    Reshaping was missing here although `arabic_reshaper` was imported; it is now
    applied exactly as in the annotated-dataset demo cell.
    """
    return get_display(arabic_reshaper.reshape(text))


def tensor_to_display(x):
    """Mirror the image tensor horizontally and convert it to a PIL image."""
    return transforms.ToPILImage()(transforms.RandomHorizontalFlip(p=1)(x))


# Pick a random sample and run it through the loader's collate function so it
# gets the same batching as during evaluation.
img, label = random.choice(val_dataset)
img, _, _ = dataloader.collate_fn([[img, label]])

# Rebuild the ground-truth string from the label indices, skipping index 0
# (the decoder above is created with PEGON_CHARS[0] as the blank character).
gt = ''.join(PEGON_CHARS[idx] for idx in label if idx != 0)

predicted = decoder.infer(img.cuda())[0]

plt.imshow(tensor_to_display(img[0]), cmap='gray')
plt.title(f'predicted = {to_arabic_display(predicted)}\nground truth = {to_arabic_display(gt)}')
plt.show()

In [None]:
# Evaluate the same decoder on the annotated Pegon dataset.
import importlib
import pegon_utils
importlib.reload(pegon_utils)
from pegon_utils import FilenameOCRDataset, PegonAnnotatedDataset, ctc_collate_fn
import pdb  # NOTE(review): not used in this cell; looks like a debugging leftover

# The annotated images go through the same transform pipeline as the training data.
annotated_dataset = PegonAnnotatedDataset('/workspace/Dataset/pegon-ocr-patched',
                                          image_transform=dataset_transforms)

# NOTE(review): this loader uses ctc_collate_fn while the validation loader uses
# trainer.collate_fn - confirm the two are equivalent.
annotated_dataloader = DataLoader(annotated_dataset, shuffle=True,
                                  batch_size=trainer.batch_size,
                                  num_workers=trainer.num_workers,
                                  collate_fn=ctc_collate_fn)
# Overwrites the `cers` / `wers` computed on the validation split.
cers, wers = evaluate(decoder, annotated_dataloader)

In [None]:
# Plot the annotated-dataset results and save the figure next to the checkpoint.
plot_cer_wer(cers, wers, path=f'{model_name}/{timestamp}.eval.wer-cer.png')

In [None]:
# demo: decode one random sample of the annotated dataset and show the prediction

import arabic_reshaper
from bidi.algorithm import get_display


def to_arabic_display(text):
    """Reshape the letters of `text` and apply bidi ordering for display."""
    reshaped = arabic_reshaper.reshape(text)
    return get_display(reshaped)


def tensor_to_display(x):
    """Mirror the image tensor horizontally and convert it to a PIL image."""
    mirrored = transforms.RandomHorizontalFlip(p=1)(x)
    return transforms.ToPILImage()(mirrored)


# Batch a single random sample with the validation loader's collate function.
# NOTE(review): the annotated loader itself uses ctc_collate_fn - confirm that
# both collate functions are interchangeable here.
sample = random.choice(annotated_dataset)
img, label, _ = dataloader.collate_fn([sample])

predicted = decoder.infer(img.cuda())[0]

plt.imshow(tensor_to_display(img[0]), cmap='gray')
plt.title(f'predicted = {to_arabic_display(predicted)}')
plt.show()