In [1]:
import os, sys, random, gc
import numpy as np
import pandas as pd
import torch

sys.path.append('../')

from playdict_ocr.tokenization import TokenizerNAT
from datasets import PartitionedTrainDataset, TrainDataset, TestDataset

In [2]:
# Join the entries of the tokenizer's `i2w` sequence into one string so the
# whole vocabulary can be inspected as the cell output.
''.join(TokenizerNAT().i2w)

'<PAD><PAD_1><PAD_2>ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz~$%&@0123456789* -([)]"!,.:;?'

In [3]:
class CFG:
    """Hyper-parameters shared by the cells of this notebook."""

    # Length budget handed to the decoder and the datasets.
    max_dec_len = 25
    # Passed as the last argument to the dataset constructors
    # (presumably (width, height) -- confirm against datasets.py).
    size = (144, 32)

    # Training schedule.
    epochs = 2
    batch_size = 256

    # Threshold used for gradient-norm clipping.
    max_grad_norm = 4

    # Widths forwarded to the encoder / decoder constructors.
    encoder_dim = 192
    decoder_dim = 256

    # Selects the CTC loss/metric pair instead of cross-entropy/accuracy.
    use_ctc = False

    # Forwarded to NATDecoder(num_pixels=...).
    num_pixels = 36

# With CTC enabled the length budget is doubled in place on CFG, so every later
# reader of CFG.max_dec_len (decoder, datasets, CTC loss) sees the doubled value.
if CFG.use_ctc:
    CFG.max_dec_len *= 2

# Tokenizer instance shared by the decoder setup, the losses/metrics and the datasets.
tokenizer = TokenizerNAT()

# MODEL

In [4]:
import keras4torch as k4t
from models import EncoderDecoderModel

from encoders.repvgg import RepVGG
from decoders.nat import NATDecoder

# RepVGG encoder: single input channel, CFG.encoder_dim output channels.
encoder = RepVGG(
    num_blocks=[2, 4, 6],
    width_multiplier=[0.75] * 3,
    use_se=False,
    in_channels=1,
    output_channels=CFG.encoder_dim,
)

# NAT decoder sized from CFG and the tokenizer's vocabulary.
decoder = NATDecoder(
    encoder_dim=CFG.encoder_dim,
    decoder_dim=CFG.decoder_dim,
    vocab_size=tokenizer.vocab_size,
    max_dec_len=CFG.max_dec_len,
    num_pixels=CFG.num_pixels,
)

# Combine both halves into the network that is trained below.
model = EncoderDecoderModel(encoder, decoder)

In [5]:
from torch.optim.lr_scheduler import OneCycleLR
import torch.nn as nn
import torch.nn.functional as F
from torch_optimizer import AdaBelief

class CombinedOpt(torch.optim.Optimizer):
    """Drive separate encoder and decoder optimizers through one Optimizer object.

    The encoder parameters are updated by AdaBelief and the decoder parameters
    by Adam, each with its own learning rate. The two inner optimizers are
    exposed as ``encoder_opt`` and ``decoder_opt`` so that a learning-rate
    scheduler can be attached to each of them individually.

    Args:
        model: object exposing ``parameters()``, ``encoder`` and ``decoder``.
    """

    def __init__(self, model):
        # The base class is initialised over all parameters so that the
        # inherited Optimizer machinery has param groups to work with. Its
        # 'lr' is only a placeholder: updates are performed exclusively by the
        # two inner optimizers. (The training log in this notebook prints
        # `lr: -inf`, which appears to be this placeholder being reported.)
        super().__init__(model.parameters(), {'lr': float('-inf')})
        self.encoder_opt = AdaBelief(
            model.encoder.parameters(), lr=2e-3*0.3, weight_decay=1e-5)
        self.decoder_opt = torch.optim.Adam(
            model.decoder.parameters(), lr=1e-3*0.3, weight_decay=1e-5)

    def step(self, closure=None):
        """Apply one update with both inner optimizers.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss, as in ``torch.optim.Optimizer.step``. It is evaluated
                once, before either inner optimizer steps.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            # The closure may be invoked from a no-grad context; it needs
            # gradients enabled to run its backward pass.
            with torch.enable_grad():
                loss = closure()
        self.encoder_opt.step()
        self.decoder_opt.step()
        return loss

# Single optimizer object handed to model.compile(); its encoder_opt/decoder_opt
# attributes are what the LR schedulers are attached to later.
opt = CombinedOpt(model)

def ce_loss(y_pred, y_true):
    """Cross-entropy over every position of the batch.

    Predictions are flattened to rows of ``tokenizer.vocab_size`` logits and
    the targets to a 1-D vector of class indices before the loss is computed.
    """
    flat_logits = y_pred.reshape(-1, tokenizer.vocab_size)
    flat_targets = y_true.reshape(-1)
    return F.cross_entropy(flat_logits, flat_targets)

def acc(y_pred, y_true):
    """Fraction of samples whose decoded prediction equals the decoded target.

    The arg-max over the last axis of ``y_pred`` gives the predicted indices;
    both index sequences are turned into strings with
    ``tokenizer.indices_to_string`` and compared per sample.
    """
    pred_ids = y_pred.argmax(-1).cpu().numpy()
    true_ids = y_true.cpu().numpy()

    matches = []
    for pred_row, true_row in zip(pred_ids, true_ids):
        decoded_pred = tokenizer.indices_to_string(pred_row)
        decoded_true = tokenizer.indices_to_string(true_row)
        matches.append(decoded_pred == decoded_true)

    return torch.tensor(matches, dtype=float).mean()

class MyLoopConfig(k4t.configs.TrainerLoopConfig):
    """Training-loop hooks: batch unpacking, gradient clipping, CTC loss/metric."""

    def process_batch(self, batch):
        """Split a batch into model inputs and target.

        The last element of ``batch`` is the target. The second input is kept
        on ``self.target_lengths`` for ``ctc_loss``; only the first input is
        forwarded to the model.
        """
        *x_batch, y_batch = batch
        self.target_lengths = x_batch[1]
        return x_batch[:1], y_batch

    def prepare_for_optimizer_step(self, model):
        """Clip encoder and decoder gradient norms separately to CFG.max_grad_norm."""
        torch.nn.utils.clip_grad_norm_(model.model.encoder.parameters(), CFG.max_grad_norm)
        torch.nn.utils.clip_grad_norm_(model.model.decoder.parameters(), CFG.max_grad_norm)

    def ctc_loss(self, y_pred, y_true):
        """CTC loss for batch-first logits ``y_pred`` against padded targets ``y_true``.

        Uses ``self.target_lengths`` recorded by the most recent ``process_batch``
        call. Every sample is treated as using all output frames.
        """
        log_probs = torch.log_softmax(y_pred, dim=-1)   # [bs, frames, vocab_size]
        log_probs = log_probs.transpose(0, 1)           # [frames, bs, vocab_size]
        num_frames, batch_size = log_probs.size(0), log_probs.size(1)
        # Take the frame count from the logits themselves instead of a global,
        # so the lengths always match the tensor handed to F.ctc_loss; the
        # explicit dtype avoids relying on torch.full's dtype inference.
        input_lengths = torch.full([batch_size], num_frames, dtype=torch.long)
        return F.ctc_loss(log_probs, y_true,
            input_lengths=input_lengths, target_lengths=self.target_lengths)

    def ctc_acc(self, y_pred, y_true):
        """Fraction of samples whose CTC-decoded prediction equals the decoded target."""
        y_pred = y_pred.argmax(-1).cpu().numpy()
        y_true = y_true.cpu().numpy()

        # Predictions go through the CTC-specific decoder, targets through the plain one.
        y_ = [(tokenizer.indices_to_string_ctc(i) == tokenizer.indices_to_string(j))
                for i,j in zip(y_pred, y_true)]

        return torch.tensor(y_, dtype=float).mean()

# Wrap the network in the keras4torch trainer interface. NOTE: this rebinds
# `model`, so re-running the cell without re-running the model-construction
# cell wraps an already wrapped object.
model = k4t.Model(model)

model.build([1, 32, 144])
model.summary()

loop = MyLoopConfig()

# A single compile call; CFG.use_ctc picks the loss/metric pair.
model.compile(
    optimizer=opt,
    loss=loop.ctc_loss if CFG.use_ctc else ce_loss,
    metrics=[loop.ctc_acc if CFG.use_ctc else acc],
    loop_config=loop,
    disable_val_loss=False,
)

model.model.print_params()

Layer (type:depth-idx)                             Output Shape              Param #
├─RepVGG: 1-1                                      [8, 192, 2, 36]           --
|    └─RepVGGBlock: 2-1                            [8, 48, 16, 72]           --
|    |    └─Sequential: 3-1                        [8, 48, 16, 72]           528
|    |    └─Sequential: 3-2                        [8, 48, 16, 72]           144
|    |    └─Identity: 3-3                          [8, 48, 16, 72]           --
|    |    └─ReLU: 3-4                              [8, 48, 16, 72]           --
|    └─Sequential: 2-2                             [8, 48, 8, 36]            --
|    |    └─RepVGGBlock: 3-5                       [8, 48, 8, 36]            23,232
|    |    └─RepVGGBlock: 3-6                       [8, 48, 8, 36]            23,328
|    └─Sequential: 2-3                             [8, 96, 4, 36]            --
|    |    └─RepVGGBlock: 3-7                       [8, 96, 4, 36]            46,464
|    |    └─RepVGGBlo

In [6]:
# Partition files for training and the matching per-file counts
# (2 x 2000000; the fit log further down reports 4000000 training samples).
file_list = [f"../preprocessed/synth_{i}.pkl" for i in range(2)]
cnt_list = [2000000] * 2

# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only load files from a trusted source.
val_data = pd.read_pickle("../preprocessed/val_data.pkl")

# Both datasets share the decoder length budget, the tokenizer and CFG.size.
train_set = PartitionedTrainDataset(file_list, cnt_list, CFG.max_dec_len, tokenizer, CFG.size)
val_set = TrainDataset(val_data, CFG.max_dec_len, tokenizer, CFG.size)

# Train loop

In [7]:
# Initialise from the `best_pretrain.pt` checkpoint before training; the path
# is relative to the notebook's working directory.
model.load_weights('saved_model/best_pretrain.pt')

In [8]:
from torch.utils.data import DataLoader
from keras4torch.callbacks import LRScheduler
from torch.optim.lr_scheduler import MultiStepLR
from keras4torch.utils.data import RestrictedRandomSampler
from keras4torch.callbacks import ModelCheckpoint

# Let cuDNN benchmark convolution algorithms and pick the fastest one.
torch.backends.cudnn.benchmark = True

# One MultiStepLR per inner optimizer: the learning rate is multiplied by 0.3
# at milestones 1 and 2 (presumably counted in epochs -- confirm how the
# keras4torch LRScheduler callback steps the scheduler).
scheduler_1 = LRScheduler(MultiStepLR(opt.encoder_opt, [1, 2], 0.3))
scheduler_2 = LRScheduler(MultiStepLR(opt.decoder_opt, [1, 2], 0.3))

# Train with validation after each epoch. The sampler is built from the same
# per-file counts as the partitioned dataset, and ModelCheckpoint writes
# 'saved_model/best.pt' while monitoring `val_acc`.
model.fit(train_set,
            validation_data=val_set,
            epochs=CFG.epochs,
            batch_size=CFG.batch_size,
            validation_batch_size=CFG.batch_size*2,
            sampler=RestrictedRandomSampler(cnt_list),
            callbacks=[scheduler_1, scheduler_2, ModelCheckpoint('saved_model/best.pt', monitor='val_acc')]
)

Train on 4000000 samples, validate on 802733 samples:
Epoch 1/2
15625/15625 - 2417s - loss: 0.0627 - acc: 0.7651 - val_loss: 0.4460 - val_acc: 0.4599 - lr: -inf
Epoch 2/2
15625/15625 - 2407s - loss: 0.0387 - acc: 0.8369 - val_loss: 0.4723 - val_acc: 0.4659 - lr: -inf


Unnamed: 0,loss,acc,val_loss,val_acc,lr
1,0.062697,0.765091,0.445976,0.45985,-inf
2,0.038664,0.836902,0.472305,0.465948,-inf


In [9]:
# Reload the checkpoint written by ModelCheckpoint during fit().
model.load_weights('saved_model/best.pt')

# Switch the wrapped network to its deployment form before export
# (presumably RepVGG re-parameterisation -- confirm in models.py).
model.model.deploy()

In [10]:
# Export the network to ONNX on CPU, using the first training sample as the
# example input. The batch dimension is added with the out-of-place
# `unsqueeze` (not `unsqueeze_`) so the tensor returned by the dataset is never
# mutated and the cell can be re-run safely.
_ = torch.onnx.export(
    model.model.cpu(),
    train_set[0][0].unsqueeze(0),
    "saved_model/vgg_transformer.onnx",
    verbose=True,
    opset_version=11,
    input_names=['x'],
    output_names=['y'],
    do_constant_folding=False,
)

s=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%254)
  %258 : Long(1, strides=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%225)
  %259 : Long(1, strides=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%255)
  %260 : Long(1, strides=[1], device=cpu) = onnx::Unsqueeze[axes=[0]](%256)
  %261 : Long(4, strides=[1], device=cpu) = onnx::Concat[axis=0](%257, %258, %259, %260)
  %262 : Float(1, 25, 8, 32, strides=[6400, 256, 32, 1], requires_grad=1, device=cpu) = onnx::Reshape(%253, %261) # g:\playdict\recognizer\train\decoders\efficient_transformers\_mha.py:31:0
  %263 : Float(1, 8, 25, 32, strides=[6400, 32, 256, 1], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 1, 3]](%262) # g:\playdict\recognizer\train\decoders\efficient_transformers\_mha.py:31:0
  %264 : Float(1, 8, 32, 25, strides=[6400, 32, 1, 256], requires_grad=1, device=cpu) = onnx::Transpose[perm=[0, 2, 3, 1]](%250) # g:\playdict\recognizer\train\decoders\efficient_transformers\_mha.py:59:0
  %265 : Float(1, 8, 25, 25, s