<a href="https://colab.research.google.com/github/as9786/ComputerVision/blob/main/OCR/code/CRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Module

In [None]:
import copy
import os
import random
import warnings

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision.models import resnet18, resnet50
from torchvision import transforms

from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings(action='ignore')

# 2. 장치

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 3. 초매개변수

In [None]:
CFG = {
    'IMG_HEIGHT_SIZE':64,   # height every image is resized to before entering the network
    'IMG_WIDTH_SIZE':224,   # width every image is resized to before entering the network
    'EPOCHS':20,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':256,
    'NUM_WORKERS':4, # set to suit your own GPU/CPU environment
    'SEED':41
}


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


# Without this call CFG['SEED'] only reaches train_test_split; weight initialisation and
# DataLoader shuffling would start from a different random state on every run.
seed_everything(CFG['SEED'])

# 4. Data

## 4-1. Data unzip

In [None]:
# -q: suppress the per-file listing (the archive holds tens of thousands of images).
# -n: never overwrite existing files, so re-running this cell cannot stall on a prompt.
!unzip -q -n /content/drive/MyDrive/롯데/신입과제/data/open.zip

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  inflating: train/TRAIN_71888.png   
  inflating: train/TRAIN_71889.png   
  inflating: train/TRAIN_71890.png   
  inflating: train/TRAIN_71891.png   
  inflating: train/TRAIN_71892.png   
  inflating: train/TRAIN_71893.png   
  inflating: train/TRAIN_71894.png   
  inflating: train/TRAIN_71895.png   
  inflating: train/TRAIN_71896.png   
  inflating: train/TRAIN_71897.png   
  inflating: train/TRAIN_71898.png   
  inflating: train/TRAIN_71899.png   
  inflating: train/TRAIN_71900.png   
  inflating: train/TRAIN_71901.png   
  inflating: train/TRAIN_71902.png   
  inflating: train/TRAIN_71903.png   
  inflating: train/TRAIN_71904.png   
  inflating: train/TRAIN_71905.png   
  inflating: train/TRAIN_71906.png   
  inflating: train/TRAIN_71907.png   
  inflating: train/TRAIN_71908.png   
  inflating: train/TRAIN_71909.png   
  inflating: train/TRAIN_71910.png   
  inflating: train/TRAIN_71911.png   
  inflating: train/TRAIN_71912.png   


## 4-2. Data load

In [None]:
df = pd.read_csv('./train.csv')

In [None]:
# Single-character samples go straight into the training set: according to the original
# author, their characters cover every character that occurs in the train/test data.
df['len'] = df['label'].str.len()
train_v1 = df[df['len']==1]

In [None]:
# Split the samples with two or more characters into train (80%) / validation (20%).
# This is a plain random split: word length is not passed as `stratify`, so the length
# distribution of the two parts is only approximately equal.
# The filtered frame gets its own name instead of overwriting `df`, so the previous cell
# (which selects the len == 1 rows from `df`) still works when it is re-run.
df_multi = df[df['len']>1]
train_v2, val = train_test_split(df_multi, test_size=0.2, random_state=CFG['SEED'])

In [None]:
# Concatenate the single-character samples with the training part of the
# multi-character split to obtain the final training set.
train = pd.concat([train_v1, train_v2])
print(len(train), len(val))

66251 10637


# 5. 단어 사전

In [None]:
# Build the character vocabulary from the training labels:
# every distinct character, in sorted order.
train_gt = "".join(train['label'])
letters = sorted(set(train_gt))
print(len(letters))

2349


In [None]:
# Index 0 is reserved for "-"; the remaining indices follow the sorted characters.
vocabulary = ["-", *letters]
print(len(vocabulary))
idx2char = dict(enumerate(vocabulary))
char2idx = {char: idx for idx, char in idx2char.items()}

2350


# 6. Dataset

In [None]:
class CustomDataset(Dataset):
    """Image dataset for the text recogniser.

    Args:
        img_path_list: indexable collection of image file paths.
        label_list: indexable collection of label strings aligned with
            ``img_path_list``, or ``None`` for unlabeled data.
        train_mode: when True ``train_transform`` is applied, otherwise
            ``test_transform``.

    ``__getitem__`` returns ``(image_tensor, label)`` when labels are present and
    just ``image_tensor`` otherwise.
    """

    def __init__(self, img_path_list, label_list, train_mode=True):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.train_mode = train_mode

    def __len__(self):
        return len(self.img_path_list)

    def __getitem__(self, index):
        # convert() returns a fully decoded copy, so the file can be closed right away
        # instead of leaving the handle to the garbage collector.
        with Image.open(self.img_path_list[index]) as img_file:
            image = img_file.convert('RGB')

        if self.train_mode:
            image = self.train_transform(image)
        else:
            image = self.test_transform(image)

        if self.label_list is not None:
            text = self.label_list[index]
            return image, text
        else:
            return image

    @staticmethod
    def _build_transform():
        """Resize to the configured size, convert to tensor, normalise per channel."""
        return transforms.Compose([
            transforms.Resize((CFG['IMG_HEIGHT_SIZE'],CFG['IMG_WIDTH_SIZE'])),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
        ])

    # Training-time preprocessing. No augmentation is applied at the moment, so this is
    # the same pipeline as test_transform; add augmentation ops here if needed.
    def train_transform(self, image):
        return self._build_transform()(image)

    # Evaluation/inference-time preprocessing.
    def test_transform(self, image):
        return self._build_transform()(image)

In [None]:
# NUM_WORKERS is applied here as well, matching the test/inference loaders further down.
train_dataset = CustomDataset(train['img_path'].values, train['label'].values, True)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=CFG['NUM_WORKERS'])

# Validation only measures the loss, so the order is left fixed: shuffling would change
# batch composition (and therefore the per-batch mean) from one evaluation to the next.
val_dataset = CustomDataset(val['img_path'].values, val['label'].values, False)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'])

In [None]:
# Sanity-check one batch: the image tensor shape plus the first few labels
# (printing every label of the batch floods the cell output).
image_batch, text_batch = next(iter(train_loader))
print(image_batch.size(), text_batch[:10])

torch.Size([256, 3, 64, 224]) ('해', '빗', '음악', '뗏', '겁', '제외하다', '씸', '쌥', '고요하다', '타고나다', '묜', '말하다', '표시', '올리다', '운전', '세기', '집', '쑤', '간호', '수출하다', '쌀', '떨다', '식사', '다시', '고전', '와', '손뼉', '자살', '천', '갖추다', '국립', '년', '꿀', '터', '올', '혀', '개월', '발표되다', '셉', '캡', '가격', '주제', '포장', '걷기', '칠십', '화장지', '대사', '십일월', '도', '관점', '이별', '그다지', '간판', '작은아버지', '신부', '온몸', '지다', '어째서', '뉵', '국민', '큠', '쩡', '후춧가루', '살림', '형태', '상류', '쮜', '제시하다', '튀기다', '정반대', '얜', '큰소리', '꺄', '목', '기독교', '독립', '대사', '현대인', '잎', '소음', '꿈속', '밖', '듯', '라인', '분명해지다', '몲', '팟', '버리다', '이혼', '봇', '긋', '여', '맹', '눈뜨다', '쒜', '불과하다', '거', '온', '궁금하다', '모처럼', '에', '왹', '않', '앞뒤', '볶다', '그나마', '하', '무려', '아무래도', '세제', '첫', '쇌', '꺼', '뜨다', '도장', '건조하다', '돌', '찢어지다', '연구소', '군', '이날', '밀리미터', '빛', '대다수', '사전', '짜다', '읒', '틀림없다', '활', '감', '실시되다', '통합', '제발', '바라다', '몄', '솔직히', '추진', '청소년', '억', '사과', '실천하다', '넷째', '낚', '덜', '앰', '이리저리', '재', '추가되다', '뤽', '미역', '기성', '당신', '주로', '전날', '푠', '주전자', '쭉', '또는', '데', '발생', '냇', '과

# 7. 모형

In [None]:
class RecognitionModel(nn.Module):
    """CNN + RNN recogniser producing per-time-step class logits for CTC.

    Pipeline: truncated ResNet-50 trunk -> extra conv block -> linear projection ->
    bidirectional RNN -> linear classifier.

    Args:
        num_chars: number of output classes (vocabulary size including the blank).
        rnn_hidden_size: hidden size of the RNN and of the projection feeding it.
    """

    def __init__(self, num_chars=len(char2idx), rnn_hidden_size=256):
        super().__init__()
        self.num_chars = num_chars
        self.rnn_hidden_size = rnn_hidden_size

        # CNN backbone: pretrained ResNet-50 (https://arxiv.org/abs/1512.03385).
        # The last three child modules are dropped; the remaining trunk feeds the
        # 1024-input-channel convolution below.
        backbone = resnet50(pretrained=True)
        trunk = list(backbone.children())[:-3]
        self.feature_extract = nn.Sequential(
            *trunk,
            nn.Conv2d(1024, 256, kernel_size=(3,6), stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True)
        )

        # Projects the flattened (channels * height) column features to the RNN size.
        # The hard-coded 1024 input features tie this layer to the configured image height.
        self.linear1 = nn.Linear(1024, rnn_hidden_size)

        # Sequence model over the width axis; bidirectional, hence the doubled
        # input size of the classifier that follows.
        self.rnn = nn.RNN(input_size=rnn_hidden_size,
                            hidden_size=rnn_hidden_size,
                            bidirectional=True,
                            batch_first=True)
        self.linear2 = nn.Linear(self.rnn_hidden_size*2, num_chars)


    def forward(self, x):
        """Map an image batch to logits of shape [T, batch_size, num_chars]."""
        features = self.feature_extract(x)            # [batch_size, channels, height, width]
        features = features.permute(0, 3, 1, 2)       # [batch_size, width, channels, height]

        # Each image column becomes one time step: T == width, features == channels*height
        n_samples, n_steps = features.size(0), features.size(1)
        sequence = features.view(n_samples, n_steps, -1)
        sequence = self.linear1(sequence)

        sequence, _ = self.rnn(sequence)

        logits = self.linear2(sequence)
        # Time-major layout expected by CTC loss: [T, batch_size, num_classes].
        # The recorded predictions in this notebook are 11 characters long, i.e. T == 11
        # for the configured 64x224 input.
        return logits.permute(1, 0, 2)

## 손실 함수

In [None]:
criterion = nn.CTCLoss(blank=0) # index 0 of the vocabulary ('-') acts as the CTC blank

In [None]:
def encode_text_batch(text_batch):
    """Encode a batch of label strings into flat CTC targets.

    Args:
        text_batch: iterable of label strings, one per sample.

    Returns:
        (targets, target_lengths): ``targets`` is a 1-D IntTensor holding the
        vocabulary indices of all labels concatenated in batch order;
        ``target_lengths`` is a 1-D IntTensor with the length of each label.

    Raises:
        KeyError: if a label contains a character missing from ``char2idx``.
    """
    lengths = torch.IntTensor([len(text) for text in text_batch])
    targets = torch.IntTensor([char2idx[char] for text in text_batch for char in text])
    return targets, lengths

In [None]:
def compute_loss(text_batch, text_batch_logits):
    """Compute the CTC loss of a batch.

    Args:
        text_batch: sequence of label strings, one per sample in the batch.
        text_batch_logits: Tensor of size [T, batch_size, num_classes].

    Returns:
        The value produced by the notebook-level ``criterion``.
    """
    log_probs = F.log_softmax(text_batch_logits, 2) # [T, batch_size, num_classes]
    n_steps, n_samples = log_probs.size(0), log_probs.size(1)

    # Every sample uses all T time steps as its input length.
    input_lengths = torch.full(size=(n_samples,),
                               fill_value=n_steps,
                               dtype=torch.int32).to(device) # [batch_size]

    targets, target_lengths = encode_text_batch(text_batch)
    return criterion(log_probs, targets, input_lengths, target_lengths)

# 8. 학습

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` with CTC loss and return it with its best-validation weights.

    NOTE: defining this function rebinds the notebook-level name ``train``, which
    earlier cells use for the training DataFrame; cells that need the DataFrame must
    run before this one.

    Args:
        model: network mapping an image batch to logits [T, batch_size, num_classes].
        optimizer: optimizer over ``model``'s parameters.
        train_loader: iterable of ``(image_batch, text_batch)`` training batches.
        val_loader: loader handed to ``validation`` after every epoch.
        scheduler: object whose ``step(val_loss)`` is called once per epoch, or None.
        device: device the model and image batches are moved to.

    Returns:
        ``model`` with the weights of the epoch that reached the lowest validation
        loss loaded into it, or None if no epoch improved on the initial sentinel.
    """
    model.to(device)

    best_loss = 999999
    best_state = None
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for image_batch, text_batch in tqdm(iter(train_loader)):
            image_batch = image_batch.to(device)

            optimizer.zero_grad()

            text_batch_logits = model(image_batch)
            loss = compute_loss(text_batch, text_batch_logits)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        _train_loss = np.mean(train_loss)

        _val_loss = validation(model, val_loader, device)
        print(f'Epoch : [{epoch}] Train CTC Loss : [{_train_loss:.5f}] Val CTC Loss : [{_val_loss:.5f}]')

        if scheduler is not None:
            scheduler.step(_val_loss)

        if best_loss > _val_loss:
            best_loss = _val_loss
            # Snapshot the weights by value. Keeping a reference to `model` itself
            # would be overwritten by later epochs, so the "best" model would always
            # end up being the last one.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is None:
        return None

    # Restore the best epoch into the same object, so every existing reference to
    # the model sees the best weights.
    model.load_state_dict(best_state)
    return model

In [None]:
def validation(model, val_loader, device):
    """Return the mean per-batch CTC loss of ``model`` over ``val_loader``.

    Puts the model in eval mode and runs without gradient tracking.

    Args:
        model: network mapping an image batch to logits.
        val_loader: iterable of ``(image_batch, text_batch)`` batches.
        device: device image batches are moved to before the forward pass.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for images, texts in tqdm(iter(val_loader)):
            logits = model(images.to(device))
            batch_losses.append(compute_loss(texts, logits).item())

    return np.mean(batch_losses)

In [None]:
model = RecognitionModel()
model.eval()
optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG["LEARNING_RATE"])
# Halve the learning rate once the monitored validation loss has stopped improving
# for more than `patience` epochs, down to `min_lr`.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [1] Train CTC Loss : [7.12363] Val CTC Loss : [5.05512]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [2] Train CTC Loss : [5.11595] Val CTC Loss : [3.65485]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [3] Train CTC Loss : [3.63675] Val CTC Loss : [2.03724]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [4] Train CTC Loss : [2.06661] Val CTC Loss : [1.14989]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [5] Train CTC Loss : [1.30456] Val CTC Loss : [0.83747]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [6] Train CTC Loss : [0.88210] Val CTC Loss : [0.47174]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [7] Train CTC Loss : [0.59418] Val CTC Loss : [0.39016]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [8] Train CTC Loss : [0.37254] Val CTC Loss : [0.43208]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [9] Train CTC Loss : [0.31147] Val CTC Loss : [1.05834]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [10] Train CTC Loss : [0.28713] Val CTC Loss : [0.48508]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [11] Train CTC Loss : [0.12139] Val CTC Loss : [0.19635]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [12] Train CTC Loss : [0.06559] Val CTC Loss : [0.17073]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [13] Train CTC Loss : [0.04926] Val CTC Loss : [0.16411]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [14] Train CTC Loss : [0.04965] Val CTC Loss : [0.15985]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [15] Train CTC Loss : [0.03716] Val CTC Loss : [0.15021]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [16] Train CTC Loss : [0.03008] Val CTC Loss : [0.15824]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [17] Train CTC Loss : [0.06455] Val CTC Loss : [0.18556]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [18] Train CTC Loss : [0.08350] Val CTC Loss : [0.17190]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [19] Train CTC Loss : [0.02278] Val CTC Loss : [0.13435]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [20] Train CTC Loss : [0.01242] Val CTC Loss : [0.13075]


# 9. 추론

In [None]:
test = pd.read_csv('./test.csv')

In [None]:
test_dataset = CustomDataset(test['img_path'].values, None, False)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'])

In [None]:
def decode_predictions(text_batch_logits):
    """Greedy-decode logits into one raw string per sample.

    Args:
        text_batch_logits: Tensor of size [T, batch_size, num_classes]; may live on
            any device.

    Returns:
        List of ``batch_size`` strings of length T holding the highest-scoring
        character of every time step. The strings are raw: repeated characters and
        the '-' blank symbol are not removed here.
    """
    # softmax is monotonic, so the argmax of the raw logits is already the answer.
    token_ids = text_batch_logits.detach().argmax(2).cpu() # [T, batch_size]

    return ["".join(idx2char[idx] for idx in sample_ids)
            for sample_ids in token_ids.t().tolist()] # iterate as [batch_size, T]

def inference(model, test_loader, device):
    """Run ``model`` over an unlabeled loader and collect the decoded strings.

    Args:
        model: network mapping an image batch to logits.
        test_loader: iterable yielding image batches only (no labels).
        device: device image batches are moved to before the forward pass.

    Returns:
        Flat list with one ``decode_predictions`` string per sample, in loader order.
    """
    model.eval()
    decoded = []
    with torch.no_grad():
        for images in tqdm(iter(test_loader)):
            logits = model(images.to(device))
            decoded.extend(decode_predictions(logits.cpu()))
    return decoded

In [None]:
# Label-free, unshuffled loader over the validation images, so the list returned by
# inference() lines up row-for-row with `val`.
val_dataset2 = CustomDataset(val['img_path'].values, None, False)
val_loader2 = DataLoader(val_dataset2, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'])

In [None]:
val_predictions = inference(infer_model, val_loader2, device)

  0%|          | 0/42 [00:00<?, ?it/s]

In [None]:
model.to(device)

RecognitionModel(
  (feature_extract): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
         

In [None]:
predictions = inference(model, test_loader, device)

  0%|          | 0/290 [00:00<?, ?it/s]

In [None]:
# Needs `infer_model`, which only exists after the training cell has run in this kernel;
# the NameError recorded below comes from executing this cell without it.
# NOTE(review): the previous cell already computes `predictions` from `model` — confirm
# which of the two cells is meant to be kept.
predictions = inference(infer_model, test_loader, device)

NameError: name 'infer_model' is not defined

In [None]:
# Post-process the prediction of each sample independently
def remove_duplicates(text):
    """Collapse every run of consecutive identical characters in ``text`` to one.

    Returns the collapsed string; an empty input yields an empty string.
    """
    kept = []
    for char in text:
        # Keep a character only when it differs from the one just before it.
        if not kept or kept[-1] != char:
            kept.append(char)
    return "".join(kept)

def correct_prediction(word):
    """Turn a raw decoded string into the final word.

    The string is cut at every '-' symbol, repeated characters inside each piece are
    collapsed with ``remove_duplicates``, and the pieces are glued back together.
    """
    return "".join(remove_duplicates(piece) for piece in word.split("-"))

In [None]:
# Fill the sample submission with the raw test predictions, then post-process each one
# with correct_prediction to obtain the final words.
submit = pd.read_csv('./sample_submission.csv')
submit['label'] = predictions
submit['label'] = submit['label'].apply(correct_prediction)

In [None]:
submit.to_csv('/content/drive/MyDrive/롯데/신입과제/CRNN.csv', index=False)

In [None]:
submit

Unnamed: 0,id,label
0,TEST_00000,낯말
1,TEST_00001,상향
2,TEST_00002,발아늘이다
3,TEST_00003,바구니
4,TEST_00004,살
...,...,...
74116,TEST_74116,캐아다
74117,TEST_74117,사무
74118,TEST_74118,친절하다
74119,TEST_74119,쪽


In [None]:
# Attach the raw decoded strings (not yet passed through correct_prediction, so '-'
# symbols and repeats are still present) to the validation rows for a visual check.
val['preds'] = val_predictions
val.head()

Unnamed: 0,id,img_path,label,len,preds
59410,TRAIN_59410,./train/TRAIN_59410.png,전기,2,전----기-----
24756,TRAIN_24756,./train/TRAIN_24756.png,이내,2,이--내내------
69341,TRAIN_69341,./train/TRAIN_69341.png,똑같이,3,똑-같---이----
15211,TRAIN_15211,./train/TRAIN_15211.png,전기밥솥,4,전-기--밥--솥--
22700,TRAIN_22700,./train/TRAIN_22700.png,만두,2,만---두------


In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/롯데/신입과제/CRNN_ResNet50.pth')

# 외부 데이터

In [None]:
# Fresh model instance for the external-data experiment. It is moved to `device` here
# because the sample input built later in this section is sent to `device` before the
# forward pass; leaving the model on the CPU would fail with a device mismatch on GPU.
model = RecognitionModel().to(device)
model.eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 113MB/s]


RecognitionModel(
  (feature_extract): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
         

In [None]:
# Load the trained weights. map_location remaps the stored tensors onto `device`, so a
# checkpoint written during a GPU run also loads in a CPU-only session.
# NOTE(review): the training section saves the checkpoint directly under '신입과제/',
# while this cell reads it from the '신입과제/모형/' sub-folder — confirm the file was
# moved there, or align the two paths.
model.load_state_dict(torch.load('/content/drive/MyDrive/롯데/신입과제/모형/CRNN_ResNet50.pth', map_location=device))

<All keys matched successfully>

In [None]:
sample_img_path = '/content/drive/MyDrive/Naver/DataCentric/data2/AIHub/img/train/CST_1980_5350108_0002_0001.jpg'
sample_label_path = '/content/drive/MyDrive/Naver/DataCentric/data2/AIHub/label/CST_1980_5350108_0002_0001.json'

In [None]:
import json

# The annotation file contains Korean text; read it as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open(sample_label_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
data

{'Annotation': {'object_recognition': 1, 'text_language': 0},
 'Dataset': {'category': 0,
  'identifier': 'OCR(public)',
  'label_path': 'OCR(public)/CST/1980/5350108/0002',
  'name': '대규모 OCR 데이터(공공)',
  'src_path': 'OCR(public)/CST/1980/5350108/0002',
  'type': 1},
 'Images': {'acquisition_location': 1,
  'data_captured': '2022.08.09 10:57:13',
  'dpi': 300,
  'group': 1,
  'height': 3504,
  'identifier': 'CST_1980_5350108_0002_0001',
  'type': 'jpg',
  'width': 2480,
  'writing_style': 3,
  'year': 2},
 'Bbox': [{'data': '다.',
   'id': 1,
   'type': 1,
   'typeface': 1,
   'x': [534, 534, 598, 598],
   'y': [664, 709, 664, 709]},
  {'data': '가공',
   'id': 2,
   'type': 1,
   'typeface': 1,
   'x': [613, 613, 747, 747],
   'y': [663, 710, 663, 710]},
  {'data': '금긋기',
   'id': 3,
   'type': 1,
   'typeface': 1,
   'x': [614, 614, 745, 745],
   'y': [727, 778, 727, 778]},
  {'data': '작업은',
   'id': 4,
   'type': 1,
   'typeface': 1,
   'x': [761, 761, 900, 900],
   'y': [726, 779, 726

In [None]:
transform_ops = transforms.Compose([
            transforms.Resize((CFG['IMG_HEIGHT_SIZE'],CFG['IMG_WIDTH_SIZE'])),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
        ])

In [None]:
# Load the full sample page here so the cell does not rely on an `img` variable left
# over from running a later cell first, then apply the recogniser's preprocessing.
img = Image.open(sample_img_path).convert('RGB')
img = transform_ops(img)
img.shape

<PIL.Image.Image image mode=RGB size=2480x3504 at 0x797068CF9960>


torch.Size([3, 64, 224])

In [None]:
import cv2
from PIL import Image
# Read the sample page with OpenCV (BGR channel order) and convert it to RGB.
img = cv2.imread(sample_img_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
x = img.copy()
# Crop a single region: rows 928-980, columns 517-559 of the page.
# NOTE(review): the coordinates are hard-coded — presumably taken from one of the
# 'Bbox' entries of the label file; confirm which one.
x = x[928:981, 517:560]
# Back to a PIL image, the input type the transform pipeline is applied to.
img = Image.fromarray(x)

In [None]:
img = transform_ops(img)
img.shape

torch.Size([3, 64, 224])

In [None]:
# `img` is already a tensor, so add the batch dimension and move/cast it directly;
# wrapping an existing tensor in torch.tensor() only makes a redundant copy and
# triggers PyTorch's copy-construct warning.
inp = img.unsqueeze(0).to(device=device, dtype=torch.float32)

In [None]:
inp.shape

torch.Size([1, 3, 64, 224])

In [None]:
# Inference only: run the forward pass without autograd so no graph is kept alive
# behind the output tensor.
with torch.no_grad():
    out = model(inp)
out

tensor([[[-5.7744, -8.1636, -9.1347,  ...,  4.1324,  5.0372,  3.5051]],

        [[25.9347,  7.3611,  0.2499,  ..., -6.4121, -6.8208,  0.9743]],

        [[26.0779,  7.9514,  0.7673,  ..., -6.9170, -7.0034,  0.3839]],

        ...,

        [[26.7456,  5.7911,  4.1729,  ..., -6.6709, -7.3175, -0.4509]],

        [[27.9928,  6.6115,  3.4585,  ..., -7.0438, -8.0763,  0.2096]],

        [[26.9672,  8.1228,  3.3437,  ..., -6.7556, -6.8454,  0.8969]]],
       device='cuda:0', grad_fn=<PermuteBackward0>)

In [None]:
decode_predictions(out.cpu())

['및----------']