In [1]:
import os, sys, random, gc
import numpy as np
import pandas as pd
import torch

In [2]:
# Load the preprocessed label table. Per the preview rendered below, each row
# holds an image `path`, a token-id list `tgt` and its length `tgt_len`.
# In the rows shown, every `tgt` begins with 1 and ends with 2.
train_labels = pd.read_pickle("../preprocessed/train_labels.pkl")
# Sanity check: (rows, columns) of the label table.
print(train_labels.shape)
# Last expression of the cell, so the first rows are rendered as the output.
train_labels.head()

(7224612, 3)


Unnamed: 0,path,tgt,tgt_len
0,../mjsynth/90kDICT32px/2425/1/115_Lube_45484.jpg,"[1, 25, 60, 41, 44, 2]",6
1,../mjsynth/90kDICT32px/2425/1/114_Spencerian_7...,"[1, 32, 55, 44, 53, 42, 44, 57, 48, 40, 53, 2]",12
2,../mjsynth/90kDICT32px/2425/1/113_accommodatin...,"[1, 40, 42, 42, 54, 52, 52, 54, 43, 40, 59, 48...",17
3,../mjsynth/90kDICT32px/2425/1/112_CARPENTER_11...,"[1, 16, 14, 31, 29, 18, 27, 33, 18, 31, 2]",11
4,../mjsynth/90kDICT32px/2425/1/111_REGURGITATIN...,"[1, 31, 18, 20, 34, 31, 20, 22, 33, 14, 33, 22...",15


In [4]:
class CFG:
    """Static hyper-parameter namespace shared by the notebook cells.

    Attributes are read directly from the class (e.g. ``CFG.batch_size``);
    the class is never instantiated in this notebook.
    """

    # --- data loading ---
    num_workers = 0            # forwarded to model.fit(num_workers=...)
    max_dec_len = 25           # not referenced in this notebook; presumably read by the model -- TODO confirm
    size = (128, 32)           # image size handed to the datasets; axis order not visible here -- TODO confirm

    # --- training schedule ---
    epochs = 2
    batch_size = 128           # validation uses twice this value in model.fit

    # --- optimisation ---
    encoder_lr = 1e-4          # learning rate of the encoder optimizer
    decoder_lr = 4e-4          # learning rate of the decoder optimizer
    weight_decay = 1e-5        # applied to the encoder optimizer only
    dropout = 0.3              # presumably consumed by EncoderDecoderModel -- TODO confirm
    max_grad_norm = 5          # gradient-norm clipping threshold

    # --- model dimensions (presumably consumed by EncoderDecoderModel -- TODO confirm) ---
    embed_dim = 256
    attention_dim = 256
    encoder_dim = 512
    decoder_dim = 512

    # --- reproducibility / CV (not referenced elsewhere in this notebook) ---
    seed = 42
    n_fold = 5

# Transforms

In [5]:
from matplotlib import pyplot as plt
from tokenization import Tokenizer

from datasets import TrainDataset, TestDataset

# Character-level tokenizer shared by the model, the loss and the metric.
# The mapping shown in this cell's output reserves 0/1/2 for <PAD>/<START>/<END>,
# followed by digits, space, upper-case and lower-case letters.
# NOTE(review): the mapping appears to be printed by the constructor -- confirm in tokenization.py.
tokenizer = Tokenizer()

{'<PAD>': 0, '<START>': 1, '<END>': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12, ' ': 13, 'A': 14, 'B': 15, 'C': 16, 'D': 17, 'E': 18, 'F': 19, 'G': 20, 'H': 21, 'I': 22, 'J': 23, 'K': 24, 'L': 25, 'M': 26, 'N': 27, 'O': 28, 'P': 29, 'Q': 30, 'R': 31, 'S': 32, 'T': 33, 'U': 34, 'V': 35, 'W': 36, 'X': 37, 'Y': 38, 'Z': 39, 'a': 40, 'b': 41, 'c': 42, 'd': 43, 'e': 44, 'f': 45, 'g': 46, 'h': 47, 'i': 48, 'j': 49, 'k': 50, 'l': 51, 'm': 52, 'n': 53, 'o': 54, 'p': 55, 'q': 56, 'r': 57, 's': 58, 't': 59, 'u': 60, 'v': 61, 'w': 62, 'x': 63, 'y': 64, 'z': 65}


# Model

In [6]:
from models import EncoderDecoderModel
import keras4torch as k4t

# Build the raw encoder/decoder network from the shared config and tokenizer.
# Later cells access `model.encoder` and `model.decoder` on this object.
model = EncoderDecoderModel(CFG, tokenizer)

In [7]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

class CollateWrapper:
    """Collate callable that merges dataset samples into one padded batch.

    Each sample is read positionally: item 0 is a tensor that can be stacked
    (presumably the image), item 1 is a NumPy array of token ids and item 2
    is the target length.
    """

    # run on cpu
    def __call__(self, batch):
        """Return ``(src, tgt, tgt_lens, placeholder)`` for a list of samples.

        ``src`` is the stacked item-0 tensors, ``tgt`` the token ids padded on
        the right with 0 (the <PAD> id) to the longest target in the batch,
        ``tgt_lens`` an int64 tensor of lengths, and ``placeholder`` a scalar
        zero tensor occupying the fourth slot of the batch tuple.
        """
        images = torch.stack([sample[0] for sample in batch])
        targets = pad_sequence(
            [torch.from_numpy(sample[1]) for sample in batch],
            batch_first=True,
            padding_value=0,
        )
        lengths = torch.tensor([sample[2] for sample in batch], dtype=torch.int64)
        placeholder = torch.tensor(0)
        return images, targets, lengths, placeholder

In [8]:
class MyLoopConfig(k4t.configs.TrainerLoopConfig):
    """Custom keras4torch loop hooks for the encoder/decoder model.

    Splits each collated batch into model inputs and target, and clips
    gradients before every optimizer step.
    """

    # run on gpu
    def process_batch(self, batch):
        """Turn a collated batch into ``(model_inputs, target)``.

        During training the batch is reordered by descending target length and
        the model receives ``(src, tgt, tgt_lens)``. Otherwise the model only
        receives ``(src,)`` and the batch order is left untouched. The fourth
        batch element is ignored.
        """
        images, targets, lengths, _ = batch

        if self.training:
            # Longest targets first; the same permutation is applied to images and targets.
            lengths, order = torch.sort(lengths, dim=0, descending=True)
            images = images[order]
            targets = targets[order]
            return (images, targets, lengths), targets

        return (images,), targets

    def prepare_for_optimizer_step(self, model):
        """Clip encoder and decoder gradient norms independently to ``CFG.max_grad_norm``."""
        # `model` is the keras4torch wrapper; the underlying network is `model.model`.
        for submodule in (model.model.encoder, model.model.decoder):
            torch.nn.utils.clip_grad_norm_(submodule.parameters(), CFG.max_grad_norm)

In [9]:
from torch.optim.lr_scheduler import OneCycleLR
import torch.nn as nn
import torch.nn.functional as F

from torch_optimizer import AdaBelief

class CombinedOpt(torch.optim.Optimizer):
    """Single optimizer facade that drives separate encoder/decoder optimizers.

    The encoder is updated with AdaBelief (``CFG.encoder_lr``,
    ``CFG.weight_decay``) and the decoder with Adam (``CFG.decoder_lr``).

    Args:
        model: module exposing ``parameters()``, ``encoder`` and ``decoder``.
    """

    def __init__(self, model):
        # The base class is initialised over *all* model parameters so the
        # inherited zero_grad() clears every gradient. The -inf default lr is
        # only a placeholder: the real learning rates live in the two
        # sub-optimizers below. (It is presumably what the training log shows
        # as "lr: -inf".)
        super().__init__(model.parameters(), {'lr': float('-inf')})
        self.encoder_opt = AdaBelief(
            model.encoder.parameters(), lr=CFG.encoder_lr, weight_decay=CFG.weight_decay)
        self.decoder_opt = torch.optim.Adam(
            model.decoder.parameters(), lr=CFG.decoder_lr)

    def step(self, closure=None):
        """Advance both sub-optimizers by one step.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss, matching ``torch.optim.Optimizer.step``. It is
                evaluated once, before either sub-optimizer steps.

        Returns:
            The loss returned by ``closure``, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            # Evaluate once here instead of forwarding it to both
            # sub-optimizers, which would run the forward/backward pass twice.
            with torch.enable_grad():
                loss = closure()
        self.encoder_opt.step()
        self.decoder_opt.step()
        return loss

# Build the optimizer from the raw network first: CombinedOpt reads
# `model.encoder` / `model.decoder` directly from the object it is given.
opt = CombinedOpt(model)

# NOTE: this rebinds `model` from the raw network to the keras4torch wrapper,
# so the cell is not idempotent -- re-running it without re-running the model
# cell would wrap the wrapper. The raw network stays reachable as `model.model`.
model = k4t.Model(model)

def ce_loss(y_pred, y_true):
    """Cross-entropy over non-padding target positions.

    ``y_pred`` is flattened to ``(-1, tokenizer.vocab_size)`` and ``y_true`` to
    a 1-D vector; positions whose target id is 0 (the <PAD> id) are dropped
    before the mean cross-entropy is computed.
    """
    logits = y_pred.reshape(-1, tokenizer.vocab_size)
    targets = y_true.reshape(-1)
    # Boolean mask of real (non-<PAD>) tokens.
    keep = targets != 0
    return F.cross_entropy(logits[keep], targets[keep])

def acc(y_pred, y_true):
    """Exact-match accuracy over decoded strings.

    The arg-max token ids of ``y_pred`` (taken over the last axis) and the ids
    in ``y_true`` are each decoded with ``tokenizer.indices_to_string``; a
    sample counts as correct only when the two decoded strings are equal.
    Returns the mean as a 0-dim float64 tensor.
    """
    predicted_ids = y_pred.argmax(-1).cpu().numpy()
    target_ids = y_true.cpu().numpy()

    matches = []
    for pred_row, true_row in zip(predicted_ids, target_ids):
        pred_text = tokenizer.indices_to_string(pred_row)
        true_text = tokenizer.indices_to_string(true_row)
        matches.append(pred_text == true_text)

    return torch.tensor(matches, dtype=float).mean()

# Wire the combined optimizer, masked cross-entropy loss, exact-match accuracy
# metric and the custom batch/clipping hooks into the keras4torch wrapper.
# Validation loss is disabled, so the training log only reports val_acc.
# NOTE(review): presumably because evaluation runs the model without targets
# (process_batch passes only `src` outside training) -- confirm.
model.compile(optimizer=opt, loss=ce_loss, metrics=[acc], loop_config=MyLoopConfig(), disable_val_loss=True)

# Train loop

In [10]:
from torch.utils.data import DataLoader
from keras4torch.callbacks import LRScheduler
import pickle

# Load one preprocessed shard of training data.
# NOTE(security): pickle.load can execute arbitrary code; only use it on files
# produced by this project's own preprocessing step.
with open("../preprocessed/train_part_0.pkl", 'rb') as f:
    train_part = pickle.load(f)

train_set = TrainDataset(train_labels, train_part, CFG.size)

# Disabled OneCycleLR experiment, kept for reference as an inert string.
# 12500 matches the steps per epoch in the first training log below.
# The decoder scheduler must be attached to `opt.decoder_opt`.
'''
encoder_scheduler = OneCycleLR(opt.encoder_opt, max_lr=CFG.encoder_lr*10, total_steps=12500*CFG.epochs)
decoder_scheduler = OneCycleLR(opt.decoder_opt, max_lr=CFG.decoder_lr*10, total_steps=12500*CFG.epochs)

def update_scheduler(t):
    encoder_scheduler.step()
    decoder_scheduler.step()
'''

# Create the checkpoint directory *before* training so a missing folder fails
# fast (or is fixed) instead of losing hours of training at save time.
os.makedirs('saved_model', exist_ok=True)

model.fit(
    train_set,
    validation_split=0.2,
    epochs=CFG.epochs,
    batch_size=CFG.batch_size,
    validation_batch_size=CFG.batch_size*2,
    collate_fn=CollateWrapper(),
    num_workers=CFG.num_workers,
    #callbacks=[k4t.callbacks.LambdaCallback(on_batch_end=update_scheduler)],
)

# Despite the file name, this stores the weights as they are after the final
# epoch; no best-checkpoint selection happens in this cell.
model.save_weights('saved_model/best.pt')

Train on 1600000 samples, validate on 400000 samples:
Epoch 1/2
12500/12500 - 8456s - loss: 0.7043 - acc: 0.6418 - val_acc: 0.8129 - lr: -inf
Epoch 2/2
12500/12500 - 8423s - loss: 0.4270 - acc: 0.8330 - val_acc: 0.8490 - lr: -inf


Train on 1600000 samples, validate on 400000 samples:

Epoch 1/4

6250/6250 - 1833s - loss: 0.9510 - acc: 0.4394 - val_acc: 0.6709 - lr: -inf

Epoch 2/4

6250/6250 - 1843s - loss: 0.5409 - acc: 0.7010 - val_acc: 0.7443 - lr: -inf

Epoch 3/4

3370/6250 [===============>..............] - ETA: 12:27 - loss: 0.4926 - acc: 0.7514

In [None]:
# test

In [None]:
#model.load_weights('saved_model/best.pt')
#test_set = TestDataset(pd.read_pickle('test_path_list.pkl'), img_size=CFG.size)

In [None]:
#y_pred = model.predict(test_set, batch_size=CFG.batch_size*4, activation='argmax', progress_bar=True)
#y_pred.shape

In [None]:
#from tqdm import tqdm

#a = []

#for i in tqdm(range(len(y_pred))):
#    a.append(tokenizer.indices_to_string(y_pred[i]))