In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); every dataset and
# working-directory path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the task folder: the `data_reader` import and the relative
# paths used later ('./gt.txt', './pred.txt', checkpoints) resolve against it.
%cd /content/drive/MyDrive/OCR_HW/task2/

/content/drive/MyDrive/OCR_HW/task2


In [3]:
from data_reader import Vocabulary, HWDBDatasetHelper, ArchivedHWDBReader

# Dataset locations -- adjust these to your own Drive layout.
# Zip archive handed to ArchivedHWDBReader for training/validation.
train_path = '/content/drive/MyDrive/CASIA HWDB/HWDBTrain/Images.zip'
# Zip archive with the test images (assigned again, same value, in the evaluation section).
test_path = '/content/drive/MyDrive/CASIA HWDB/HWDBTest/Images.zip'
# Not referenced by any other cell in this notebook; presumably the
# ground-truth file read by the external evaluation script -- TODO confirm.
gt_path = './gt.txt'

In [1]:
# Model weights:
# https://drive.google.com/file/d/1xCo1ET1y6ZKEqGLdIlkuYygVHtvg9mzy/view?usp=sharing

# Simple CNN baseline

PyTorch is required for this baseline implementation.

## Baseline method

- Naively resize to 32x32 (DON'T DO THIS IN YOUR WORK; try to preserve the geometry somehow, it is important)
- Train LeNet-like CNN
- Enjoy :)

In [6]:
import cv2
import numpy as np

### Data tools

In [7]:
# Open the training archive and wrap the reader in the dataset helper
# that later cells split into train/validation parts and index by sample.
train_reader = ArchivedHWDBReader(train_path)
train_reader.open()
train_helper = HWDBDatasetHelper(train_reader)

In [8]:
# Split into training and validation helpers.
# NOTE(review): this rebinds `train_helper`, so the cell is not idempotent --
# running it a second time would split the already-split training subset again.
train_helper, val_helper = train_helper.train_val_split()

In [9]:
# Sanity check: number of samples in the (train, validation) subsets.
train_helper.size(), val_helper.size()

(2578433, 644609)

In [11]:
import torch

from torch.utils.data import Dataset, DataLoader
from torch import nn

class HWDBDataset(Dataset):
    """Torch dataset that letterboxes HWDB samples onto a square white canvas.

    Every sample is scaled so that its longer side equals ``target_size``
    (aspect ratio preserved), centred on a white ``target_size`` x
    ``target_size`` canvas, expanded to three channels and finally mapped with
    ``(pixel - 127.5) / 255``.
    """

    def __init__(self, helper: 'HWDBDatasetHelper', target_size: int = 128):
        """Store the sample source and the output resolution.

        Args:
            helper: object exposing ``size()`` and
                ``get_item(idx) -> (image, label)``.
            target_size: side length, in pixels, of the square output image.
                Defaults to 128, the value the notebook was trained with.
        """
        self.helper = helper
        self.target_size = int(target_size)

    def __len__(self):
        """Return the number of samples reported by the helper."""
        return self.helper.size()

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is a channels-last array of shape
        ``(target_size, target_size, 3)``; the label is passed through
        unchanged from the helper.
        """
        im, label = self.helper.get_item(idx)

        side = self.target_size
        height, width = im.shape[:2]
        ratio = float(side) / max(height, width)
        # Truncate like the original code did, but never let a dimension
        # collapse to 0 (very thin glyphs) because cv2.resize rejects empty sizes.
        new_h = min(side, max(1, int(height * ratio)))
        new_w = min(side, max(1, int(width * ratio)))
        # cv2.resize expects (width, height).
        im = cv2.resize(im, (new_w, new_h))

        # Border widths must be plain ints for OpenCV; any odd leftover pixel
        # goes to the bottom/right side.
        pad_top = (side - new_h) // 2
        pad_bottom = (side - new_h) - pad_top
        pad_left = (side - new_w) // 2
        pad_right = (side - new_w) - pad_left
        new_im = cv2.copyMakeBorder(im, pad_top, pad_bottom, pad_left, pad_right,
                                    cv2.BORDER_CONSTANT, value=[255, 255, 255])

        # COLOR_GRAY2RGB: the helper is assumed to yield single-channel images.
        im_rgb = cv2.cvtColor(new_im, cv2.COLOR_GRAY2RGB)
        return (im_rgb - 127.5) / 255., label

In [None]:
# Wrap both helpers so DataLoader can index preprocessed (image, label) pairs.
train_dataset = HWDBDataset(train_helper)
val_dataset = HWDBDataset(val_helper)

### Model & training

In [None]:
from torchvision import models
from pytorch_metric_learning import losses

In [None]:
# ResNet-18 backbone created with pretrained=True.
model = models.resnet18(pretrained=True)
in_features_size = model.fc.in_features
# Replace the classification head with a 1024-d embedding layer; the size
# must match `embedding_size` of the ArcFace loss created below.
model.fc = nn.Linear(in_features_size, 1024)
# eval() here is only a default: train_epoch() calls model.train() itself.
model.eval()
# Requires a CUDA device.
model = model.cuda()

In [None]:
# Training batches are shuffled and the incomplete last batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, drop_last=True)
# Validation keeps dataset order; a larger batch is used since no gradients are stored.
val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

In [36]:
# ArcFace loss over 1024-d embeddings (must match model.fc's output size).
# The loss object has trainable parameters of its own, hence the second optimizer.
loss_fn = losses.ArcFaceLoss(num_classes=train_helper.vocabulary.num_classes(), embedding_size=1024).to(torch.device('cuda'))
# Optimizer for the loss parameters (10x smaller learning rate than the backbone).
optim_loss = torch.optim.Adam(loss_fn.parameters(), lr=0.0001)
# Optimizer for the backbone/embedding network.
optim = torch.optim.Adam(model.parameters(), lr=0.001)

In [31]:
from tqdm import tqdm


def run_validation(val_loader: DataLoader, model: nn.Module, loss_fn, n_steps=None):
    """Compute top-1 accuracy of ``model`` + ``loss_fn`` over ``val_loader``.

    The model is switched to eval mode and left in it. Inputs are moved to the
    device the model's parameters live on, so the function works on both CPU
    and GPU.

    Args:
        val_loader: iterable of ``(X, y)`` batches; ``X`` is channels-last
            ``(N, H, W, C)``, ``y`` a tensor of class indices.
        model: network mapping images to embeddings.
        loss_fn: object exposing ``get_logits(embeddings)``.
        n_steps: maximum number of batches to evaluate. ``None`` evaluates the
            whole loader and shows a tqdm progress bar.

    Returns:
        Fraction of correctly classified samples as a float; ``0.0`` when no
        sample was evaluated.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    n_good = 0
    n_all = 0
    wrapper = lambda x: x
    if n_steps is None:
        n_steps = len(val_loader)
        wrapper = tqdm

    with torch.no_grad():
        for batch, (X, y) in enumerate(wrapper(val_loader)):
            if batch == n_steps:
                break
            # swapaxes(1, 3) moves channels first (and swaps H/W); training
            # uses the same call, so train and eval see the same layout.
            inputs = torch.swapaxes(X, 1, 3).to(device=device, dtype=torch.float32)
            logits = loss_fn.get_logits(model(inputs))
            classes = torch.argmax(logits, dim=1)
            # Count matches with a tensor reduction instead of Python's sum().
            n_good += int((classes == y.to(classes.device)).sum().item())
            n_all += int(classes.numel())

    if n_all == 0:
        return 0.0
    return n_good / n_all


def train_epoch(train_loader: DataLoader, val_loader: DataLoader, model: nn.Module, optim, optim_loss, loss_fn):
    """Run one optimisation pass over ``train_loader``.

    Batches are moved to the device the model's parameters live on, so the
    function works on both CPU and GPU.

    Args:
        train_loader: iterable of ``(X, y)`` batches; ``X`` is channels-last
            ``(N, H, W, C)``, ``y`` holds class indices.
        val_loader: unused; kept so existing callers keep working.
        model: network mapping images to embeddings.
        optim: optimizer for the model parameters.
        optim_loss: optimizer for the trainable parameters of ``loss_fn``.
        loss_fn: callable ``loss_fn(embeddings, labels)`` returning a scalar loss.

    Returns:
        None. The model is left in training mode.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Nothing inside the loop leaves training mode, so set it once.
    model.train()
    for X, y in tqdm(train_loader):
        # swapaxes(1, 3) moves channels first (and swaps H/W); validation and
        # prediction use the same call, so all stages see the same layout.
        inputs = torch.swapaxes(X, 1, 3).to(device=device, dtype=torch.float32)
        targets = y.to(device=device, dtype=torch.long)

        # The network outputs embeddings; the loss owns the class weights.
        embeddings = model(inputs)
        loss = loss_fn(embeddings, targets)

        optim.zero_grad()
        optim_loss.zero_grad()
        loss.backward()
        optim_loss.step()
        optim.step()

In [None]:
# Train for three epochs, validating on the full validation set and
# checkpointing after each epoch.
for epoch in range(3):
    print(f'Epoch {epoch}:')
    train_epoch(train_loader, val_loader, model, optim, optim_loss, loss_fn)
    accuracy = run_validation(val_loader, model, loss_fn)
    print(f'accuracy: {accuracy}')
    torch.save(model.state_dict(), f'resnet18_epoch_{epoch}.pth')
    # The ArcFace loss holds its own trained parameters (updated through
    # optim_loss) and get_logits() depends on them at inference time, so the
    # backbone checkpoint alone cannot reproduce predictions after a restart.
    torch.save(loss_fn.state_dict(), f'arcface_loss_epoch_{epoch}.pth')

Epoch 0:


 10%|█         | 528/5036 [18:01<2:47:26,  2.23s/it]

### Evaluation

In [27]:
# Rebuild the same architecture as in training (ResNet-18 with a 1024-d head)
# so that the saved state dict can be loaded into it. The model stays on CPU.
# NOTE(review): the weights requested by pretrained=True are overwritten when
# the checkpoint is loaded in the following cell.
model = models.resnet18(pretrained=True)
in_features_size = model.fc.in_features
model.fc = nn.Linear(in_features_size, 1024)

In [28]:
# Restore the backbone weights saved after epoch 0, mapping tensors to CPU.
# Only the model is restored here; `loss_fn` is whatever object is currently
# alive in the kernel.
model.load_state_dict(torch.load('resnet18_epoch_0.pth', map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
# Inspect the layer structure of the loaded network.
print(model)

In [15]:
# Same archive path as defined in the configuration cell near the top of the notebook.
test_path = r'/content/drive/MyDrive/CASIA HWDB/HWDBTest/Images.zip'
# Output file for the predictions written further below.
pred_path = './pred.txt'

# Open the test archive; the helper is built with prefix='Test'.
test_reader = ArchivedHWDBReader(test_path)
test_reader.open()
test_helper = HWDBDatasetHelper(test_reader, prefix='Test')

In [16]:
# Same preprocessing as for training; order is preserved (shuffle=False)
# because predictions are matched to test_helper.namelist by position.
test_dataset = HWDBDataset(test_helper)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

In [37]:
# Predict a class index for every test sample, in loader order.
preds = []
model.eval()
# Feed batches on whatever device the model currently lives on, so the cell
# works both for a CPU-loaded checkpoint and for a model moved to the GPU.
device = next(model.parameters()).device
with torch.no_grad():
    for X, _ in tqdm(test_loader):
        # Same channels-first conversion as in training/validation.
        inputs = torch.swapaxes(X, 1, 3).to(device=device, dtype=torch.float32)
        logits = loss_fn.get_logits(model(inputs))
        # .cpu() is required before .numpy() when the logits are on the GPU.
        classes = torch.argmax(logits, dim=1).cpu().numpy()
        preds.extend(classes)

100%|██████████| 380/380 [6:00:39<00:00, 56.95s/it]


In [38]:
# Write one "<sample name> <predicted class>" line per test sample.
with open(pred_path, 'w') as f_pred:
    for idx, pred in enumerate(preds):
        # Predictions are in loader order, so position idx maps to namelist[idx].
        name = test_helper.namelist[idx]
        # Class indices are decoded with the *training* vocabulary.
        cls = train_helper.vocabulary.class_by_index(pred)
        print(name, cls, file=f_pred)

In [None]:
# Run the course evaluation module as a shell command; presumably it scores
# ./pred.txt against ./gt.txt -- TODO confirm against the module itself.
!python -m cource_ocr_t2.evaluate