In [1]:
from pathlib import Path
from tqdm import tqdm
from matplotlib import pyplot as plt
import numpy as np
from tqdm.notebook import tqdm
import os
import wandb
import cv2
import pandas as pd

In [2]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

In [3]:
class BarCodeDataset(Dataset):
    def __init__(self, root_dir, ):
        self.root_dir = root_dir
        self.data_list = pd.read_csv(f'{root_dir}/markup.csv', encoding = "utf-16", 
                     names=['file_name', 'code', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4', 'binary'])
        self.data_list = self.data_list.loc[self.data_list['file_name'].isin(os.listdir(f'{root_dir}/Images/'))]
        self.resize = torchvision.transforms.Resize([200, 400])
        
    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        sample_data = self.data_list.iloc[idx]
        image = torch.FloatTensor(cv2.imread(self.root_dir + '/Images/' + sample_data['file_name'])) / 255
        mask = np.zeros(image.shape[:2])
        mask = torch.LongTensor(cv2.fillConvexPoly(mask, 
                np.array(sample_data[['x1', 'y1', 'x2', 'y2', 'x3', 
                'y3', 'x4', 'y4']].to_numpy().reshape((4, 2)), dtype=int), (1, ))).unsqueeze(0)
        image = image.permute([2, 0, 1])
        if image.shape[1] > image.shape[2]:
            image = torch.rot90(image, k=1, dims=[1, 2])
            mask = torch.rot90(mask, k=1, dims=[1, 2])
        return self.resize(image), self.resize(mask).squeeze(0)

In [4]:
class DoubleConv(nn.Module):
    """(convolution => [BN] => ReLU) * 2"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        return self.double_conv(x)


class Down(nn.Module):
    """Downscaling with maxpool then double conv"""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels)
        )

    def forward(self, x):
        return self.maxpool_conv(x)


class Up(nn.Module):
    """Upscaling then double conv"""

    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()

        # if bilinear, use the normal convolutions to reduce the number of channels
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
        else:
            self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        x1 = self.up(x1)
        # input is CHW
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]

        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class OutConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(OutConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)

In [5]:
class UNet(nn.Module):
    def __init__(self, n_channels, n_classes, bilinear=False):
        super(UNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.bilinear = bilinear

        self.inc = (DoubleConv(n_channels, 64))
        self.down1 = (Down(64, 128))
        self.down2 = (Down(128, 256))
        self.down3 = (Down(256, 512))
        factor = 2 if bilinear else 1
        self.down4 = (Down(512, 1024 // factor))
        self.up1 = (Up(1024, 512 // factor, bilinear))
        self.up2 = (Up(512, 256 // factor, bilinear))
        self.up3 = (Up(256, 128 // factor, bilinear))
        self.up4 = (Up(128, 64, bilinear))
        self.outc = (OutConv(64, n_classes))

    def forward(self, x):
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x = self.up1(x5, x4)
        x = self.up2(x, x3)
        x = self.up3(x, x2)
        x = self.up4(x, x1)
        logits = self.outc(x)
        return logits

    def use_checkpointing(self):
        self.inc = torch.utils.checkpoint(self.inc)
        self.down1 = torch.utils.checkpoint(self.down1)
        self.down2 = torch.utils.checkpoint(self.down2)
        self.down3 = torch.utils.checkpoint(self.down3)
        self.down4 = torch.utils.checkpoint(self.down4)
        self.up1 = torch.utils.checkpoint(self.up1)
        self.up2 = torch.utils.checkpoint(self.up2)
        self.up3 = torch.utils.checkpoint(self.up3)
        self.up4 = torch.utils.checkpoint(self.up4)
        self.outc = torch.utils.checkpoint(self.outc)

In [8]:
train_set = BarCodeDataset('/DATA/asaginbaev/CourseOCRTask3/Train/')

In [9]:
train_loader = DataLoader(train_set, batch_size=4, shuffle=True, num_workers=4)

In [6]:
def dice_coeff(input, target, reduce_batch_first = False, epsilon = 1e-6):
    assert input.size() == target.size()
    assert input.dim() == 3 or not reduce_batch_first

    sum_dim = (-1, -2) if input.dim() == 2 or not reduce_batch_first else (-1, -2, -3)

    inter = 2 * (input * target).sum(dim=sum_dim)
    sets_sum = input.sum(dim=sum_dim) + target.sum(dim=sum_dim)
    sets_sum = torch.where(sets_sum == 0, inter, sets_sum)

    dice = (inter + epsilon) / (sets_sum + epsilon)
    return dice.mean()

In [7]:
def dice_loss(input, target):
    return 1 - dice_coeff(input, target, reduce_batch_first=True)

In [8]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [10]:
!nvidia-smi

Sun Apr 23 13:04:01 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.182.03   Driver Version: 470.182.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN RTX    Off  | 00000000:18:00.0 Off |                  N/A |
| 41%   25C    P8    18W / 280W |   1241MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN V      Off  | 00000000:3B:00.0 Off |                  N/A |
| 28%   31C    P2    35W / 250W |   2131MiB / 12066MiB |      0%      Default |
|       

In [14]:
logger = wandb.init(project='abbyy_ocr')

[34m[1mwandb[0m: Currently logged in as: [33mazatiusssss[0m (use `wandb login --relogin` to force relogin)


In [15]:
criterion = nn.BCEWithLogitsLoss()

In [25]:
model = UNet(3, 1)

In [26]:
model.to(device);

In [18]:
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, foreach=True)

In [19]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=5)

In [20]:
global_step = 0
for epoch in tqdm(range(5)):
    for images, masks in tqdm(train_loader):
        images = images.to(device=device, dtype=torch.float32, memory_format=torch.channels_last)
        masks = masks.to(device=device, dtype=torch.long)
        masks_pred = model(images)
        loss = criterion(masks_pred.squeeze(1), masks.float())
        loss += dice_loss(F.sigmoid(masks_pred.squeeze(1)), masks.float())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        logger.log({
                    'train loss': loss.item(),
                    'step': global_step,
                    'epoch': epoch
                })
        global_step += 1
    torch.save(model.state_dict(), f'checkpoint_epoch_{epoch}.pth')

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2050 [00:00<?, ?it/s]



  0%|          | 0/2050 [00:00<?, ?it/s]

  0%|          | 0/2050 [00:00<?, ?it/s]

  0%|          | 0/2050 [00:00<?, ?it/s]

  0%|          | 0/2050 [00:00<?, ?it/s]

In [22]:
wandb.finish()

In [27]:
model.load_state_dict(torch.load('checkpoint_epoch_4.pth'))

<All keys matched successfully>

In [9]:
class BarCodeDecoder(nn.Module):
    def __init__(self, ):
        super(BarCodeDecoder, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding='same')
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, 3, padding='same')
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 64, 3, padding='same')
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 128, 3, padding='same')
        self.bn4 = nn.BatchNorm2d(128)
        self.lstm = nn.LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(256, 10)
        
    def forward(self, x):
        x = F.max_pool2d(self.bn1(F.relu(self.conv1(x))), 2)
        x = F.max_pool2d(self.bn2(F.relu(self.conv2(x))), 2)
        x = F.max_pool2d(self.bn3(F.relu(self.conv3(x))), 2)
        x = F.max_pool2d(self.bn4(F.relu(self.conv4(x))), 2)
        x = torch.flatten(x, start_dim=1, end_dim=2)
        x = x.permute([0, 2, 1])
        x, _ = self.lstm(x)
        out = self.fc(F.relu(x))  
        
        return out.permute([0, 2, 1])

In [10]:
class BarCodeDecodingDataset(Dataset):
    def __init__(self, root_dir, ):
        self.root_dir = root_dir
        self.data_list = pd.read_csv(f'{root_dir}/markup.csv', encoding = "utf-16", 
                     names=['file_name', 'code', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4', 'binary'])
        self.data_list = self.data_list.loc[self.data_list['file_name'].isin(os.listdir(f'{root_dir}/Images/'))]
        self.data_list = self.data_list.loc[self.data_list['code'].astype(str).str.len() == 13]
        self.resize = torchvision.transforms.Resize([16, 192])
        
    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        sample_data = self.data_list.iloc[idx]
        image = torch.FloatTensor(cv2.imread(self.root_dir + '/Images/' + sample_data['file_name'])) / 255
        x_s = sample_data[['x1', 'x2', 'x3', 'x4']].to_numpy()
        y_s = sample_data[['y1', 'y2', 'y3', 'y4']].to_numpy()
        image = image[np.min(y_s):np.max(y_s), np.min(x_s):np.max(x_s)]
        image = image.permute([2, 0, 1])
        if image.shape[1] > image.shape[2]:
            image = torch.rot90(image, k=1, dims=[1, 2])
        return self.resize(image), torch.tensor(list(map(int, list(str(sample_data['code']))[1:])))

In [11]:
decoder_train_set = BarCodeDecodingDataset('/DATA/asaginbaev/CourseOCRTask3/Train/')

In [12]:
decoder_model = BarCodeDecoder()

In [13]:
decoder_model.to(device);

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(decoder_model.parameters(), lr=1e-4, foreach=True)

In [17]:
train_loader = DataLoader(decoder_train_set, batch_size=4, shuffle=True, num_workers=4, pin_memory=False)

In [18]:
logger = wandb.init(project='abbyy_ocr')

[34m[1mwandb[0m: Currently logged in as: [33mazatiusssss[0m (use `wandb login --relogin` to force relogin)


In [19]:
global_step = 0
for epoch in tqdm(range(10)):
    for images, labels in tqdm(train_loader):
        images = images.to(device=device, dtype=torch.float32, memory_format=torch.channels_last)
        labels = labels.to(device=device, dtype=torch.long)
        labels_pred = decoder_model(images)
        loss = criterion(labels_pred, labels)
        accuracy = torch.all(labels_pred.argmax(dim=1) == labels, dim=1).type(torch.float).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        logger.log({
                    'train loss': loss.item(),
                    'accuracy': accuracy.item(),
                    'step': global_step,
                    'epoch': epoch
                })
        global_step += 1
    torch.save(decoder_model.state_dict(), f'checkpoint_decoder_1_epoch_{epoch}.pth')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

  0%|          | 0/2040 [00:00<?, ?it/s]

In [20]:
wandb.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,█▅▅▅▅▅█▅█▅▅▅▅████▅▅▅█▅▁█████▅██████▅████
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train loss,▁▅█▃█▂▁▃▁▂▂▂▃▁▁▁▂▆▂▅▂▂▇▁▁▁▂▁▇▁▂▁▂▁▁▂▁▁▂▁

0,1
accuracy,0.0
epoch,9.0
step,20399.0
train loss,0.13088


In [38]:
decoder_model.load_state_dict(torch.load('checkpoint_decoder_1_epoch_9.pth'))

<All keys matched successfully>

In [21]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [22]:
def find_corners(mask):
    binary_mask = 255 * (sigmoid(mask) > 0.5).astype(np.uint8)
    contours, hierarchy = cv2.findContours(binary_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    areas = list(map(cv2.contourArea, contours))
    contour = contours[np.argmax(areas)]
    x, y = np.array(contour)[:, 0].T
    i_r_b = np.argmax(y + x)
    i_l_b = np.argmax(y - x)
    i_l_t = np.argmax(-y - x)
    i_r_t = np.argmax(-y + x)
    return np.array([[x[i_l_t], y[i_l_t],], 
                     [x[i_r_t], y[i_r_t],],
                     [x[i_r_b], y[i_r_b],],
                     [x[i_l_b], y[i_l_b],],], dtype=float)

In [23]:
def calc_checksum(labels):
    sum_even = np.sum(labels[1::2])
    sum_odd = np.sum(labels[::2])
    checksum = (10 - (3 * sum_odd + sum_even) % 10) % 10
    return checksum

In [24]:
from torchvision.transforms.functional import rotate

In [39]:
resize = torchvision.transforms.Resize([200, 400])
resize_1 = torchvision.transforms.Resize([16, 192])
predicted_masks = []
predicted_corners = []
predicted_labels = []
for image_name in tqdm(os.listdir('/DATA/asaginbaev/CourseOCRTask3/Test/Images/')):
    image = torch.FloatTensor(cv2.imread('/DATA/asaginbaev/CourseOCRTask3/Test/Images/' + image_name)) / 255
    image = image.permute([2, 0, 1])
    initial_size = image.shape[1:]
    was_rotated = False
    if image.shape[1] > image.shape[2]:
        was_rotated = True
        image = torch.rot90(image, k=1, dims=[1, 2])
        initial_size = initial_size[::-1]
    predicted_mask = model(resize(image).unsqueeze(0).cuda()).squeeze(0)
    predicted_mask = torchvision.transforms.Resize(initial_size)(predicted_mask)
    if was_rotated:
        predicted_mask = torch.rot90(predicted_mask, k=-1, dims=[1, 2])
    predicted_masks.append(predicted_mask.cpu().detach().numpy())
    predicted_corners.append(find_corners(predicted_masks[-1][0]))
    image = torch.FloatTensor(cv2.imread('/DATA/asaginbaev/CourseOCRTask3/Test/Images/' + image_name)) / 255
    
    angle = np.arctan((predicted_corners[-1][1, 1] - predicted_corners[-1][0, 1]) / \
            (predicted_corners[-1][1, 0] - predicted_corners[-1][0, 0]))
    rotated_image = rotate((image * (sigmoid(predicted_masks[-1][0]) > 0.5).reshape([*predicted_masks[-1][0].shape, 
                        1])).permute([2, 0, 1]), angle)
    x_s = np.where(rotated_image)[1]
    y_s = np.where(rotated_image)[2]
    rotated_image = rotated_image[:, np.min(x_s):np.max(x_s), np.min(y_s):np.max(y_s)]
    
    if rotated_image.shape[1] > rotated_image.shape[2]:
        rotated_image = torch.rot90(rotated_image, k=-1, dims=[1, 2])
    
    labels = decoder_model(resize_1(rotated_image).unsqueeze(0).cuda()).argmax(dim=1).cpu().detach().numpy()
    predicted_labels.append(labels)

  0%|          | 0/100 [00:00<?, ?it/s]

In [40]:
for i, image_name in tqdm(enumerate(os.listdir('/DATA/asaginbaev/CourseOCRTask3/Test/Images/'))):
    result_df = result_df.append([[image_name, 
        str(calc_checksum(predicted_labels[i][0])) + ''.join(list(map(str, predicted_labels[i][0]))),
        int(predicted_corners[i][0, 0]), int(predicted_corners[i][0, 1]),
        int(predicted_corners[i][1, 0]), int(predicted_corners[i][1, 1]),
        int(predicted_corners[i][2, 0]), int(predicted_corners[i][2, 1]),
        int(predicted_corners[i][3, 0]), int(predicted_corners[i][3, 1]), 0]])

0it [00:00, ?it/s]

  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_df = result_df.append([[image_name,
  result_d

In [41]:
result_df.to_csv('answer.csv', encoding = "utf-16", header=False, index=False)

In [42]:
!python course_ocr_t3/evaluate.py

Checking answer (/home/asaginbaev/course_ocr/task3/answer.csv) against markup(/home/asaginbaev/course_ocr/task3/markup.csv)
recognition_accuracy=0.43
detection_result=0.99
score=0.626
