In [1]:
import os
import os
import torch
import numpy as np
import pandas as pd
import glob as glob
import torch.optim as optim
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
 
 
from PIL import Image
from zipfile import ZipFile
from tqdm.notebook import tqdm
from dataclasses import dataclass
from torch.utils.data import Dataset
from urllib.request import urlretrieve
from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator
)

  from .autonotebook import tqdm as notebook_tqdm
2024-01-09 07:52:14.915153: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-09 07:52:14.939943: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-09 07:52:14.939977: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-09 07:52:14.940668: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-09 07:52:14.9

In [2]:
def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
 
seed_everything(42)
 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
#pytorch dataset where label is the image name.jpg and the image is the image itself
import os
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

class ImageDataset(Dataset):
    def __init__(self, root, processor,  transform=None , max_target_length=128):
        self.root = root
        self.transform = transform
        self.imgs = list(sorted(os.listdir(root)))
        self.processor = processor
        self.max_target_length = max_target_length

    def __getitem__(self, index):
        img_path = os.path.join(self.root, self.imgs[index])
        img = Image.open(img_path).convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        label = self.imgs[index][:-4]
        pixel_values = self.processor(img, return_tensors='pt').pixel_values
        # Pass the text through the tokenizer and get the labels,
        # i.e. tokenized labels.
        labels = self.processor.tokenizer(
            label,
            padding='max_length',
            max_length=self.max_target_length
        ).input_ids
        # We are using -100 as the padding token.
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

    def __len__(self):
        return len(self.imgs)

data_path = "archive"




In [4]:
from transformers import TrOCRProcessor
from torchvision import transforms

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-printed" , use_fast=False)


transforms = transforms.Compose([
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.4),
    
]) 

#keep 0.1 data aside for testing
dataset = ImageDataset(data_path, processor, transform=transforms)
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size] , generator=torch.Generator().manual_seed(42))



Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [5]:
from transformers import VisionEncoderDecoderModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VisionEncoderDecoderModel.from_pretrained("/home/arnesh/Desktop/CAPTCHA MODEL/models/0.08231476569407588_17_20240107T042532")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): DeiTModel(
    (embeddings): DeiTEmbeddings(
      (patch_embeddings): DeiTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DeiTEncoder(
      (layer): ModuleList(
        (0-11): 12 x DeiTLayer(
          (attention): DeiTAttention(
            (attention): DeiTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): DeiTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): DeiTIntermediate(
            (dense): Linear(

In [6]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [7]:
from datasets import load_metric

cer_metric = load_metric("cer")
def compute_cer(pred_ids, label_ids):
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return cer

  cer_metric = load_metric("cer")


In [8]:
#cuda memory clear
import gc
gc.collect()
torch.cuda.empty_cache()

In [9]:
#0.1 percent test data and k fold cross validation with 5 folds
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
from tqdm import tqdm
kf = KFold(n_splits=5)
kf.get_n_splits(train_dataset)
train_loss = []
val_loss = []
train_cer = []
val_cer = []

#train using pytorch library without using trainer
import torch
import torch.optim as optim
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import KFold


train_loss = []
val_loss = []
train_cer = []
val_cer = []
val_cer1 = []
train_cer1 = []

for epoch in range(10):
    for train_index, val_index in kf.split(train_dataset):
        # Shuffle indices manually
        np.random.shuffle(train_index)
        np.random.shuffle(val_index)

        train_dataset_split = Subset(train_dataset, train_index)
        val_dataset_split = Subset(train_dataset, val_index)
        train_dataloader = DataLoader(train_dataset_split, batch_size=12, shuffle=True)
        val_dataloader = DataLoader(val_dataset_split, batch_size=12, shuffle=True)

        optimizer = optim.AdamW(model.parameters(), lr=1e-4)


        print("Epoch: ", epoch)

        model.train()
        for batch in tqdm(train_dataloader):
        # get the inputs
            for k,v in batch.items():
                batch[k] = v.to(device)

            # forward + backward + optimize
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_loss.append(loss.item())
            labels = batch['labels'].to(device)
            input_ids = batch['pixel_values'].to(device)
            pred_ids = model.generate(input_ids)
            cer = compute_cer(pred_ids, labels)
            train_cer.append(cer)

        model.eval()
        for batch in tqdm(val_dataloader):
            outputs = model.generate(batch["pixel_values"].to(device))
            cer = compute_cer(pred_ids=outputs, label_ids=batch["labels"])
            val_cer.append(cer)

        print("Train Loss: ", np.mean(train_loss))
        print("Train CER: ", np.mean(train_cer))
        print("Val CER: ", np.mean(val_cer))
        print("-----------------------------------------------------------")

        val_cer1.append(np.mean(val_cer))
        train_cer1.append(np.mean(train_cer))

        
        train_loss = []
        val_loss = []
        train_cer = []
        val_cer = []

        # Save every epoch with cer score
        save_pretrained_dir = f'models_val/{np.mean(val_cer)}_{epoch}_'
        model.save_pretrained(save_pretrained_dir)
        

    

Epoch:  0


100%|██████████| 6784/6784 [1:31:19<00:00,  1.24it/s]
100%|██████████| 1696/1696 [10:17<00:00,  2.75it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Train Loss:  0.15545892854895083
Train CER:  0.03588222287735849
Val CER:  0.05316966838193254
-----------------------------------------------------------
Epoch:  0


100%|██████████| 6784/6784 [1:30:39<00:00,  1.25it/s]
100%|██████████| 1696/1696 [10:19<00:00,  2.74it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Train Loss:  0.16642942145397616
Train CER:  0.03993956367924529
Val CER:  0.04931300028587765
-----------------------------------------------------------
Epoch:  0


100%|██████████| 6784/6784 [1:30:37<00:00,  1.25it/s]
100%|██████████| 1696/1696 [10:27<00:00,  2.70it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Train Loss:  0.16804780812519723
Train CER:  0.03976513364779874
Val CER:  0.05456778873642082
-----------------------------------------------------------
Epoch:  0


100%|██████████| 6784/6784 [1:31:04<00:00,  1.24it/s]
100%|██████████| 1696/1696 [10:34<00:00,  2.67it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Train Loss:  0.16371020441133538
Train CER:  0.039028105345911945
Val CER:  0.04410913379073757
-----------------------------------------------------------
Epoch:  0


100%|██████████| 6784/6784 [1:30:40<00:00,  1.25it/s]
100%|██████████| 1696/1696 [10:18<00:00,  2.74it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Train Loss:  0.1549769294729488
Train CER:  0.034879864386792456
Val CER:  0.056731525157232704
-----------------------------------------------------------
Epoch:  1


100%|██████████| 6784/6784 [1:30:39<00:00,  1.25it/s]
100%|██████████| 1696/1696 [10:21<00:00,  2.73it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Train Loss:  0.1567444037246221
Train CER:  0.035317167845911955
Val CER:  0.04798009576901086
-----------------------------------------------------------
Epoch:  1


100%|██████████| 6784/6784 [1:30:33<00:00,  1.25it/s]
 15%|█▌        | 256/1696 [01:34<08:54,  2.70it/s]


KeyboardInterrupt: 

In [10]:
#save pretrained model for inference
save_pretrained_dir = 'final_model'
model.save_pretrained(save_pretrained_dir)



In [14]:
#inference setup
from transformers import TrOCRProcessor
from torchvision import transforms
from PIL import Image
import torch

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-printed")
model = VisionEncoderDecoderModel.from_pretrained("final_model")
model.to(device)
model.eval()

#load example image

img_path = "archive/1aXSz.jpg"

img = Image.open(img_path).convert("RGB")
pixel_values = processor(img, return_tensors='pt').pixel_values
outputs = model.generate(pixel_values.to(device))
pred_str = processor.batch_decode(outputs, skip_special_tokens=True)
print(pred_str) #print prediction



Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


['1aXSz']
