<a href="https://colab.research.google.com/github/agrawalsourav98/SignboardTranslation/blob/main/ImageCaptioning(Hindi).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [None]:
import torch
from torch import nn
from torchvision.models import resnet50
from torchvision import transforms
import numpy as np
from PIL import Image
from os import path
from pathlib import Path
import glob
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from torch.utils import data
import threading
from torch.nn.utils.rnn import pack_padded_sequence
import sys
import time
import os
import shutil
from filelock import FileLock
import torch.optim as optim
from sklearn.model_selection import train_test_split
import datetime

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the dataset

## Fetch the dataset

In [None]:
#Fetch the dataset and unzip it
!7z e '/content/drive/MyDrive/PadhAI/Synthetic Train Set (100k) - Detection & Recognition.tar.7z'
!tar -xvf '/content/Synthetic Train Set - Detection & Recognition.tar' > /dev/null


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/drive/MyDrive/PadhAI/                                        1 file, 546195564 bytes (521 MiB)

Extracting archive: /content/drive/MyDrive/PadhAI/Synthetic Train Set (100k) - Detection & Recognition.tar.7z
--
Path = /content/drive/MyDrive/PadhAI/Synthetic Train Set (100k) - Detection & Recognition.tar.7z
Type = 7z
Physical Size = 546195564
Headers Size = 202
Method = LZMA2:24
Solid = -
Blocks = 1

  0%      0% - Synthetic Train Set - Detection & Recognition.tar                                                          1% - Synthetic Train Set - 

## Load Train Test Set

In [None]:
!cp /content/drive/MyDrive/PadhAI/synthetic_dataset_2.zip .

In [None]:
!unzip -q synthetic_dataset_2.zip

## Define the vocabulary class

In [None]:
class Vocabulary(object):
  """Two-way lookup table between vocabulary entries and integer ids.

  Entries are normalised with ``str(v).lower()`` before being stored or
  looked up. Ids are handed out sequentially, starting at 0, in the order in
  which new entries are added.
  """

  def __init__(self):
    super().__init__()

    # Forward map (entry -> id) and reverse map (id -> entry).
    self.vocab2idx = dict()
    self.idx2vocab = dict()
    # Id that the next previously unseen entry will receive.
    self.idx = 0

  def add2vocab(self,v):
    """Register ``v`` under the next free id; known entries are left alone."""
    token = str(v).lower()
    if token in self.vocab2idx:
      return
    new_id = self.idx
    self.vocab2idx[token] = new_id
    self.idx2vocab[new_id] = token
    self.idx = new_id + 1

  def __call__(self,v):
    """Return the id of ``v``; raises ``KeyError`` if it was never added."""
    return self.vocab2idx[str(v).lower()]

  def get_vocab(self,idx):
    """Return the entry stored under ``idx``; raises ``KeyError`` if unknown."""
    return self.idx2vocab[idx]


  def __len__(self):
    """Number of distinct entries registered so far."""
    return len(self.vocab2idx)
    

## Define the image dataset class and dataloader function

In [None]:
class ImageTextDataset(data.Dataset):
    """Dataset of word images paired with their encoded text labels.

    The label file holds one ``<image path>=<text>`` record per line. Lines
    that do not split into exactly two parts on ``=`` are skipped. Every
    character of the text is mapped to an id by calling ``vocab`` on it and
    the ids are stored as a 1-D ``torch.Tensor`` (float) per record.

    Args:
        labelfile: Path of the label file to read.
        vocab: Callable mapping a single character to its integer id.
        transform: Optional callable applied to the RGB PIL image in
            ``__getitem__``.
    """

    def __init__(self,labelfile,vocab,transform=None):
        start_time = time.time()
        print("Dataset creation has started at",start_time)
        self.vocab = vocab
        self.transform = transform
        self.imgs = []
        self.labels = []
        # The context manager closes the label file even when encoding a line
        # raises; previously the handle was never closed.
        with open(labelfile,'r') as lf:
            for index,line in enumerate(lf):
                splits = line.strip('\n').split('=')
                if len(splits) != 2:
                    continue
                img_path, text = splits
                self.imgs.append(img_path)
                self.labels.append(torch.Tensor([vocab(ch) for ch in text]))
                sys.stdout.write("\r%i completed" % (index+1))

        sys.stdout.write("\nInit completed in time %.3f" % (time.time()-start_time))

    def __getitem__(self,index):
        """Return ``(image, label)`` for the record at ``index``."""
        # Image.open is lazy; convert() reads the pixels and returns a new
        # image, so the underlying file can be closed straight away.
        with Image.open(self.imgs[index]) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        label = self.labels[index]
        return image,label

    def __len__(self):
        """Number of records that were read from the label file."""
        return len(self.imgs)

In [None]:
def collate_fn(data):
  """Collate ``(image, label)`` samples into one zero-padded batch.

  Samples are ordered by label length, longest first. A sorted copy is used,
  so the list passed in by the caller is left untouched.

  Args:
    data: Non-empty list of ``(image, label)`` pairs; the images must be
      tensors of identical shape and the labels 1-D tensors.

  Returns:
    Tuple ``(images, targets, lengths)``: the stacked images, a float tensor
    of shape ``(batch, longest label)`` padded with zeros on the right, and
    the list of true label lengths in the same (sorted) order.
  """
  # sorted() is stable like list.sort(), so ties keep their original order.
  ordered = sorted(data, key=lambda sample: len(sample[1]), reverse=True)

  images, labels = zip(*ordered)
  images = torch.stack(images, 0)

  lengths = [len(label) for label in labels]
  targets = torch.zeros(len(labels), max(lengths))
  for row,(label,label_len) in enumerate(zip(labels,lengths)):
    targets[row,:label_len] = label
  return images, targets, lengths

In [None]:
def get_loader(labelfile, vocab, transform, batch_size, shuffle, collate_fn, num_workers=2):
  """Build a ``DataLoader`` over the ``ImageTextDataset`` read from ``labelfile``.

  ``labelfile`` has ``~`` expanded and is made absolute before the dataset is
  created; the remaining arguments are forwarded unchanged to the dataset
  (``vocab``, ``transform``) and to the loader (``batch_size``, ``shuffle``,
  ``collate_fn``, ``num_workers``).
  """
  resolved_labelfile = str(Path(labelfile).expanduser().absolute())
  return data.DataLoader(
      ImageTextDataset(resolved_labelfile, vocab, transform),
      batch_size=batch_size,
      shuffle=shuffle,
      collate_fn=collate_fn,
      num_workers=num_workers,
  )

In [None]:
def crop_and_create_dataset(imgs_list,dest_folder):
    """Crop every annotated word out of the given images and record its label.

    For each image path in ``imgs_list`` the annotation file with the same
    name but a ``.txt`` extension is read. Every annotation line made of
    exactly nine space-separated fields ``x1 x2 x3 x4 y1 y2 y3 y4 text``
    produces one crop, saved in ``dest_folder`` as
    ``<thread name>_<image index>_<line index>.jpg``, and one
    ``<crop path>=<text>`` line appended to ``dest_folder/labels.txt``.
    Lines with any other number of fields are skipped.

    Appends to ``labels.txt`` are guarded by a ``FileLock`` so that several
    threads can share the same label file.

    Args:
        imgs_list: Paths of the source images to process.
        dest_folder: Folder receiving the crops and ``labels.txt``.

    Raises:
        RuntimeError: If saving a crop fails with ``SystemError``.
    """
    start = time.time()
    thread_name = threading.current_thread().name
    print("{0} images assigned to thread {1}".format(len(imgs_list),thread_name))
    dest_dir = Path(dest_folder).expanduser().absolute()
    # Create the destination once instead of once per crop.
    dest_dir.mkdir(parents=True, exist_ok=True)
    label_file_pth = str(dest_dir / 'labels.txt')
    for idx,img in enumerate(imgs_list):
        annotation_path = os.path.splitext(img)[0]+'.txt'
        # Both handles are closed as soon as the image is done; the image
        # handle used to stay open until garbage collection.
        with Image.open(img) as opened_img, open(annotation_path,'r') as af:
            for index,line in enumerate(af):
                # Drop the line terminator so it never ends up in the label.
                splits = line.rstrip('\r\n').split(' ')
                if len(splits) != 9:
                    continue
                x1,x2,x3,x4,y1,y2,y3,y4,truth = splits
                x_list = [float(x1),float(x2),float(x3),float(x4)]
                y_list = [float(y1),float(y2),float(y3),float(y4)]
                # Axis-aligned bounding box of the four corners. Anchoring the
                # box on the first corner was only right when that corner
                # happened to be the top-left one.
                crop_box = (min(x_list),min(y_list),max(x_list),max(y_list))
                cropped_img = opened_img.crop(crop_box)
                dest_img_path = dest_dir / '{0}_{1}_{2}.jpg'.format(thread_name,idx,index)
                try:
                    cropped_img.save(str(dest_img_path))
                except SystemError as err:
                    print("Current Image",img,"Destination Image",str(dest_img_path))
                    raise RuntimeError("could not save crop {0} taken from {1}".format(dest_img_path,img)) from err
                with FileLock(label_file_pth+'.lock'):
                    with open(label_file_pth,'a') as label_file:
                        # Terminate the record explicitly: the last annotation
                        # line of a file may lack a newline, which used to
                        # fuse two records into one unreadable line.
                        label_file.write('{0}={1}\n'.format(str(dest_img_path),truth))
        sys.stdout.write('\rThread %s has processed %i files out of %i files' % (thread_name,idx+1,len(imgs_list)))
    end = time.time()
    print("\nThread",thread_name,"finished in",(end-start))

## Create Dataset

In [None]:
# Find all the images in the given dataset
imgs = glob.glob('/content/Synthetic Train Set - Detection & Recognition/Image/**/*.jpg')
print(len(imgs))

116132


In [None]:
# Fixed seed so the same images land in the train and validation sets on every
# run; without it the split (and every number reported below) changes each
# time the notebook is executed.
X_train, X_val = train_test_split(imgs, random_state=42)

In [None]:
len(X_train)

87099

In [None]:
len(X_val)

29033

In [None]:
!mkdir -p synthetic_dataset

### Create the train set

In [None]:
# Stage the training split: recreate an empty `train_imgs` folder, then copy
# every image of X_train into it as <idx>.jpg together with its annotation
# file as <idx>.txt.
!mkdir -p /content/synthetic_dataset/train
!rm -rf '/content/Synthetic Train Set - Detection & Recognition/train_imgs/'
!mkdir -p '/content/Synthetic Train Set - Detection & Recognition/train_imgs/'
for idx,img in enumerate(X_train):
  shutil.copy(img,'/content/Synthetic Train Set - Detection & Recognition/train_imgs/'+str(idx)+'.jpg')
  # Derive the annotation path from the image path: replace the third-from-last
  # path component with 'Annotation' and swap the 4-character extension for
  # '.txt'.
  img_path = Path(img)
  img_path_parts = list(img_path.parts)
  img_path_parts[-3] = 'Annotation'
  img_path_parts[-1] = img_path_parts[-1][:-4] + '.txt'
  img_path = Path(*img_path_parts)
  shutil.copy(str(img_path),'/content/Synthetic Train Set - Detection & Recognition/train_imgs/'+str(idx)+'.txt')
  # '\r' rewrites the same output line to act as a progress counter.
  sys.stdout.write('\rCompleted %i of %i'%(idx+1,len(X_train)))

Completed 87099 of 87099

In [None]:
%%time
threads = []
batches = 2
dest_folder = '/content/synthetic_dataset/train'
image_list = glob.glob('/content/Synthetic Train Set - Detection & Recognition/train_imgs/*.jpg')
!rm -rf '/content/Synthetic Train Set - Detection & Recognition/train_imgs_cropped'
batch = len(image_list)//batches
for i in range(batches):
    if i != batches-1:
        imgs_list = image_list[i*batch:(i+1)*batch]
    else:
        imgs_list = image_list[i*batch:]
    t = threading.Thread(target=crop_and_create_dataset,args=(imgs_list,dest_folder),name=str(i))
    threads.append(t)

for t in threads:
    t.start()
for t in threads:
    t.join()

del threads, batches, batch

43549 images assigned to thread 0
43550 images assigned to thread 1
Thread 1 has processed 43550 files out of 43550 files
Thread 1 finished in 448.0424180030823
Thread 0 has processed 43549 files out of 43549 files
Thread 0 finished in 451.909973859787
CPU times: user 8min 7s, sys: 58.2 s, total: 9min 5s
Wall time: 7min 32s


### Create the validation set

In [None]:
# Stage the validation split. The same `train_imgs` staging folder is reused:
# it is wiped and refilled with the images of X_val (<idx>.jpg) and their
# annotation files (<idx>.txt).
!mkdir -p /content/synthetic_dataset/val
!rm -rf '/content/Synthetic Train Set - Detection & Recognition/train_imgs/'
!mkdir -p '/content/Synthetic Train Set - Detection & Recognition/train_imgs/'
for idx,img in enumerate(X_val):
  shutil.copy(img,'/content/Synthetic Train Set - Detection & Recognition/train_imgs/'+str(idx)+'.jpg')
  # Derive the annotation path from the image path: replace the third-from-last
  # path component with 'Annotation' and swap the 4-character extension for
  # '.txt'.
  img_path = Path(img)
  img_path_parts = list(img_path.parts)
  img_path_parts[-3] = 'Annotation'
  img_path_parts[-1] = img_path_parts[-1][:-4] + '.txt'
  img_path = Path(*img_path_parts)
  shutil.copy(str(img_path),'/content/Synthetic Train Set - Detection & Recognition/train_imgs/'+str(idx)+'.txt')
  # '\r' rewrites the same output line to act as a progress counter.
  sys.stdout.write('\rCompleted %i of %i'%(idx+1,len(X_val)))

Completed 29033 of 29033

In [None]:
%%time
threads = []
batches = 2
dest_folder = '/content/synthetic_dataset/val'
image_list = glob.glob('/content/Synthetic Train Set - Detection & Recognition/train_imgs/*.jpg')
!rm -rf '/content/Synthetic Train Set - Detection & Recognition/train_imgs_cropped'
batch = len(image_list)//batches
for i in range(batches):
    if i != batches-1:
        imgs_list = image_list[i*batch:(i+1)*batch]
    else:
        imgs_list = image_list[i*batch:]
    t = threading.Thread(target=crop_and_create_dataset,args=(imgs_list,dest_folder),name=str(i))
    threads.append(t)

for t in threads:
    t.start()
for t in threads:
    t.join()

del threads, batches, batch

14516 images assigned to thread 0
14517 images assigned to thread 1
Thread 1 has processed 14517 files out of 14517 files
Thread 1 finished in 149.2560384273529
Thread 0 has processed 14516 files out of 14516 files
Thread 0 finished in 149.32551407814026
CPU times: user 2min 45s, sys: 19.5 s, total: 3min 4s
Wall time: 2min 29s


In [None]:
!7z a synthetic_dataset_2.zip /content/synthetic_dataset


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive:
  0M Scan  /content/                     51M 40961 Scan  /content/synthetic_dataset/train/                                                  103M 81921 Scan  /content/synthetic_dataset/train/                                                  154M 122881 Scan  /content/synthetic_dataset/train/                                                   233M 163841 Scan  /content/synthetic_dataset/train/

In [None]:
!unzip synthetic_dataset_2.zip -d /content/test/

In [None]:
!cp synthetic_dataset.7z /content/drive/MyDrive/PadhAI/

## Perform the loading

In [None]:
# Create the vocab
hindi_vocabulary = Vocabulary()

# '<pad>' is added first so that it receives id 0, the value collate_fn uses
# to pad the label tensors.
hindi_vocabulary.add2vocab('<pad>')
#hindi_vocabulary.add2vocab('<start>')
#hindi_vocabulary.add2vocab('<end>')
# Add code points 2304..2431 (U+0900..U+097F, the Devanagari Unicode block);
# they receive ids 1..128 in code-point order.
for alpha in range(2304, 2432):
  hindi_vocabulary.add2vocab(chr(alpha))

print("Hindi vocabulary contains {} items".format(len(hindi_vocabulary)))

Hindi vocabulary contains 129 items


In [None]:
# Define the transforms: resize to 32x100 (height x width), convert to a
# tensor, and normalise each RGB channel with the given mean / std.
# No horizontal flip: mirroring a word image reverses the character order while
# the label stays unchanged, which corrupts the flipped samples. The same
# transform is also used for the validation loader, where a random flip would
# make the reported accuracy non-deterministic.
transform = transforms.Compose([
                                 transforms.Resize((32,100)),
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.485, 0.456, 0.406),(0.229,0.224,0.225))
])

In [None]:
#Read all 

In [None]:
# Stage a subset of the data: copy the images of Image/1 and Image/2, with
# their annotation files, into `train_imgs` as <idx>.jpg / <idx>.txt.
!mkdir -p '/content/Synthetic Train Set - Detection & Recognition/train_imgs' 
!mkdir -p '/content/Synthetic Train Set - Detection & Recognition/train_imgs_cropped'
imgs_list = glob.glob('/content/Synthetic Train Set - Detection & Recognition/Image/1/*.jpg')
imgs_list.extend(glob.glob('/content/Synthetic Train Set - Detection & Recognition/Image/2/*.jpg'))
#print("Number of images",len(imgs_list))
for idx,img in enumerate(imgs_list):
  shutil.copy(img,'/content/Synthetic Train Set - Detection & Recognition/train_imgs/'+str(idx)+'.jpg')
  # Derive the annotation path from the image path: replace the third-from-last
  # path component with 'Annotation' and swap the 4-character extension for
  # '.txt'.
  img_path = Path(img)
  img_path_parts = list(img_path.parts)
  img_path_parts[-3] = 'Annotation'
  img_path_parts[-1] = img_path_parts[-1][:-4] + '.txt'
  img_path = Path(*img_path_parts)
  shutil.copy(str(img_path),'/content/Synthetic Train Set - Detection & Recognition/train_imgs/'+str(idx)+'.txt')
del imgs_list
# anno_list = glob.glob('/content/Synthetic Train Set - Detection & Recognition/Annotation/1/*.txt')
# anno_list.extend(glob.glob('/content/Synthetic Train Set - Detection & Recognition/Annotation/2/*.txt'))
# for anno in anno_list:
#   shutil.copy(anno,'/content/Synthetic Train Set - Detection & Recognition/train_imgs/')
# del anno_list

In [None]:
%%time
# Crop the staged images into word images with two worker threads, writing the
# crops and labels.txt into `train_imgs_cropped`.
threads = []
batches = 2
dest_folder = '/content/Synthetic Train Set - Detection & Recognition/train_imgs_cropped'
image_list = glob.glob('/content/Synthetic Train Set - Detection & Recognition/train_imgs/*.jpg')
# labels.txt is appended to by crop_and_create_dataset, so remove any previous
# copy before starting.
!rm -f '/content/Synthetic Train Set - Detection & Recognition/train_imgs_cropped/labels.txt'
batch = len(image_list)//batches
for i in range(batches):
    # The last thread also takes the remainder of the integer division.
    if i != batches-1:
        imgs_list = image_list[i*batch:(i+1)*batch]
    else:
        imgs_list = image_list[i*batch:]
    # The thread name becomes the prefix of the crop file names.
    t = threading.Thread(target=crop_and_create_dataset,args=(imgs_list,dest_folder),name=str(i))
    threads.append(t)

# Start all workers first, then wait for all of them to finish.
for t in threads:
    t.start()
for t in threads:
    t.join()

del threads, batches, batch

In [None]:
# Loader over the crops written to `train_imgs_cropped`; building it reads the
# whole label file.
labelfile = '/content/Synthetic Train Set - Detection & Recognition/train_imgs_cropped/labels.txt'
batch_size = 256
dataloader = get_loader(labelfile, hindi_vocabulary, transform, batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
# Loaders over the train / validation label files under
# /content/synthetic_dataset; both share the vocabulary, transform and
# collate function.
batch_size = 256
train_label = '/content/synthetic_dataset/train/labels.txt'
val_label = '/content/synthetic_dataset/val/labels.txt'

train_loader = get_loader(train_label, hindi_vocabulary, transform, batch_size, shuffle=True, collate_fn=collate_fn)
# NOTE(review): the validation loader is shuffled as well; the order should not
# matter for an aggregate accuracy — confirm it is intentional.
val_loader = get_loader(val_label, hindi_vocabulary, transform, batch_size, shuffle=True, collate_fn=collate_fn)

Dataset creation has started at 1608382874.4384353
503047 completed
Init completed in time 31.953Dataset creation has started at 1608382906.3922448
168286 completed
Init completed in time 10.317

In [None]:
# Pull a single batch to inspect the tensor shapes. The built-in next() is the
# Python 3 iterator protocol; the iterator's `.next()` method is a Python 2
# leftover that DataLoader iterators are not guaranteed to provide.
(image,caption,length) = next(iter(train_loader))
print(image.size())
print(caption.size())
print(len(length))

# Model Definition

In [None]:
class BidirectionalLSTM(nn.Module):
  """Bidirectional LSTM followed by a per-time-step linear projection.

  Args:
    numInputs: Feature size of each input time step.
    numHidden: Hidden size of the LSTM (per direction).
    numOutput: Feature size of each output time step.

  ``forward`` takes a ``(T, b, numInputs)`` tensor (sequence first, as
  ``nn.LSTM`` expects without ``batch_first``) and returns a
  ``(T, b, numOutput)`` tensor.
  """

  def __init__(self, numInputs, numHidden, numOutput):
    super().__init__()

    self.rnn = nn.LSTM(numInputs, numHidden, bidirectional=True)
    # Both directions are concatenated, hence twice the hidden size.
    self.fc = nn.Linear(2 * numHidden, numOutput)

  def forward(self, input):
    recurrent = self.rnn(input)[0]
    seq_len, batch, features = recurrent.size()
    # Flatten time and batch so the linear layer sees a 2-D tensor, then
    # restore the (T, b, ...) layout.
    projected = self.fc(recurrent.view(seq_len * batch, features))
    return projected.view(seq_len, batch, -1)

In [None]:
class CRNN(nn.Module):
  """CNN feature extractor followed by two stacked ``BidirectionalLSTM`` layers.

  Args:
    imgHeight: Height of the input images; must be a multiple of 16.
    numChannels: Number of channels of the input images.
    numClasses: Size of the per-time-step output.
    numHidden: Hidden size used by both recurrent layers.
    leakyReLU: Use ``LeakyReLU(0.2)`` instead of ``ReLU`` after each
      convolution.

  ``forward`` takes a ``(b, numChannels, h, w)`` batch and returns a tensor
  of shape ``(w', b, numClasses)``, where ``w'`` is the width of the CNN
  feature map. The feature map must have height 1.
  """

  def __init__(
      self,
      imgHeight,
      numChannels,
      numClasses,
      numHidden,
      leakyReLU=False
  ):
    super().__init__()

    assert imgHeight % 16 == 0, 'Image height must be a multiple of 16'

    # One row per convolution: (out_channels, kernel, stride, padding, batch_norm).
    conv_specs = (
        (64, 3, 1, 1, False),
        (128, 3, 1, 1, False),
        (256, 3, 1, 1, True),
        (256, 3, 1, 1, False),
        (512, 3, 1, 1, False),
        (512, 3, 1, 1, False),
        (512, 2, 1, 0, True),
    )
    # MaxPool2d arguments, keyed by the convolution the pool follows. The last
    # two pools use stride 1 and padding 1 along the width.
    pool_specs = {
        0: (2, 2),
        1: (2, 2),
        3: ((2, 2), (2, 1), (0, 1)),
        5: ((2, 2), (2, 1), (0, 1)),
    }

    feature_extractor = nn.Sequential()
    in_channels = numChannels
    pools_added = 0
    for stage, (out_channels, kernel, stride, padding, use_bn) in enumerate(conv_specs):
      feature_extractor.add_module(
          'conv{}'.format(stage),
          nn.Conv2d(in_channels, out_channels, kernel, stride, padding))
      if use_bn:
        feature_extractor.add_module('batch_norm{}'.format(stage), nn.BatchNorm2d(out_channels))
      if leakyReLU:
        feature_extractor.add_module('Activation{}:leakyReLU'.format(stage), nn.LeakyReLU(0.2, inplace=True))
      else:
        feature_extractor.add_module('Activation{}:ReLU'.format(stage), nn.ReLU(inplace=True))
      if stage in pool_specs:
        feature_extractor.add_module('MaxPool{}'.format(pools_added), nn.MaxPool2d(*pool_specs[stage]))
        pools_added += 1
      in_channels = out_channels

    self.cnn = feature_extractor
    self.rnn = nn.Sequential(
        BidirectionalLSTM(512, numHidden, numHidden),
        BidirectionalLSTM(numHidden, numHidden, numClasses)
    )

  def forward(self,input):
    features = self.cnn(input)

    height = features.size(2)
    assert height == 1, 'height after convulational layer must be 1'

    # (b, c, 1, w) -> (b, c, w) -> (w, b, c): the width becomes the time axis.
    sequence = features.squeeze(2).permute(2, 0, 1)

    return self.rnn(sequence)

In [None]:
class CRNN_V2(nn.Module):
  """CNN feature extractor followed by two stacked ``BidirectionalLSTM`` layers.

  Args:
    imgHeight: Height of the input images; must be a multiple of 16.
    numChannels: Number of channels of the input images.
    numClasses: Size of the per-time-step output.
    numHidden: Hidden size used by both recurrent layers.
    leakyReLU: Use ``LeakyReLU(0.2)`` instead of ``ReLU`` after each
      convolution.

  ``forward`` takes a ``(b, numChannels, h, w)`` batch and returns a tensor
  of shape ``(w', b, numClasses)``, where ``w'`` is the width of the CNN
  feature map. The feature map must have height 1.
  """

  def __init__(
      self,
      imgHeight,
      numChannels,
      numClasses,
      numHidden,
      leakyReLU=False
  ):
    # Zero-argument super(): the previous super(CRNN, self) raised TypeError
    # on construction because CRNN_V2 is not a subclass of CRNN.
    super().__init__()

    assert imgHeight % 16 == 0, 'Image height must be a multiple of 16'

    # Per-convolution hyper-parameters, indexed by layer number.
    kernels = [3, 3, 3, 3, 3, 3, 2]
    paddings = [1, 1, 1, 1, 1, 1, 0]
    strides = [1, 1, 1, 1, 1, 1, 1]
    maps = [64, 128, 256, 256, 512, 512, 512]

    cnn = nn.Sequential()

    def convolutionalReLU(i, batchNorm=False):
      """Append convolution ``i``, optional batch norm, and its activation."""
      numInputs = numChannels if i == 0 else maps[i-1]
      numOutputs = maps[i]

      cnn.add_module('conv{}'.format(i),nn.Conv2d(numInputs,numOutputs,kernels[i],strides[i],paddings[i]))

      if batchNorm:
        cnn.add_module('batch_norm{}'.format(i),nn.BatchNorm2d(numOutputs))

      if leakyReLU:
        cnn.add_module('Activation{}:leakyReLU'.format(i),nn.LeakyReLU(0.2,inplace=True))
      else:
        cnn.add_module('Activation{}:ReLU'.format(i),nn.ReLU(inplace=True))

    convolutionalReLU(0)
    cnn.add_module('MaxPool{}'.format(0),nn.MaxPool2d(2,2))
    convolutionalReLU(1)
    cnn.add_module('MaxPool{}'.format(1),nn.MaxPool2d(2,2))
    convolutionalReLU(2,True)
    convolutionalReLU(3)
    # The last two pools use stride 1 and padding 1 along the width.
    cnn.add_module('MaxPool{}'.format(2),nn.MaxPool2d((2,2),(2,1),(0,1)))
    convolutionalReLU(4)
    convolutionalReLU(5)
    cnn.add_module('MaxPool{}'.format(3),nn.MaxPool2d((2,2),(2,1),(0,1)))
    convolutionalReLU(6,True)

    self.cnn = cnn
    self.rnn = nn.Sequential(
        BidirectionalLSTM(512, numHidden, numHidden),
        BidirectionalLSTM(numHidden, numHidden, numClasses)
    )

  def forward(self,input):
    output = self.cnn(input)

    b, c, h, w = output.size()

    assert h == 1, 'height after convulational layer must be 1'

    # (b, c, 1, w) -> (b, c, w) -> (w, b, c): the width becomes the time axis.
    output = output.squeeze(2)
    output = output.permute(2, 0, 1) #w*b*c

    output = self.rnn(output)

    return output

# Model Init

In [None]:
# Build the recogniser from the shape of the sample batch `image` fetched
# earlier; assumes a (batch, channels, height, width) layout — TODO confirm.
num_hidden = 512
height = image.size(2)
channels = image.size(1)
# One output class per vocabulary entry.
model = CRNN(height,channels,len(hindi_vocabulary),num_hidden)
model = model.to(device)

In [None]:
# CTC loss with default arguments; its default blank index is 0, which is the
# id '<pad>' receives when hindi_vocabulary is built.
criterion = nn.CTCLoss()
criterion = criterion.to(device)

# Plain SGD over all model parameters; the Adam variant below is kept
# commented out as an alternative.
optimizer = optim.SGD(model.parameters(),lr=0.01)
#optimizer = optim.Adam(model.parameters(), lr=0.001,
#                           betas=(0.5, 0.999))

# Training

In [None]:
print(batch_size)

256


In [None]:
num_epochs = 5

In [None]:
print(image.size())

torch.Size([256, 1, 32, 100])


In [None]:
print(caption.size())

torch.Size([256, 7])


In [None]:
print(caption.size())

torch.Size([256, 7])


In [None]:
print(caption[1])
print(length[1])

tensor([57., 49., 44., 49., 63., 29.,  0.])
6


In [None]:
#Check: decode the first caption of the batch back to text.
# Ids are looked up through the vocabulary instead of re-deriving the code
# point by hand, and converted with int() first because the padded captions
# are float tensors, whose elements chr() cannot take. Id 0 is padding.
str_hindi = ''
for i in caption[0]:
  token_id = int(i)
  if token_id != 0:
    str_hindi += hindi_vocabulary.get_vocab(token_id)

print(str_hindi)

आकर्षित


In [None]:
# Train with CTC loss for num_epochs passes over train_loader.
# Make sure the model is in training mode: a previously run evaluation cell
# leaves it in eval mode, which would freeze the BatchNorm statistics.
model.train()
for epoch in range(num_epochs):
  epoch_start = time.time()
  print("Epoch {} of {}".format(epoch+1,num_epochs))
  for index,(image,caption,length) in enumerate(train_loader):
    batch_start = time.time()
    image = image.to(device)
    # CTC targets are class indices; collate_fn pads them into a float tensor,
    # so cast to an integer dtype here.
    caption = caption.to(device=device, dtype=torch.long)
    preds = model(image)
    # CTCLoss expects log-probabilities over the class dimension.
    preds = preds.log_softmax(2)
    # Separate name so the notebook-level `batch_size` setting is not
    # overwritten by the size of the last (possibly smaller) batch.
    current_batch_size = image.size(0)

    # Names follow nn.CTCLoss(log_probs, targets, input_lengths, target_lengths):
    # every sample uses all preds.size(0) time steps of the model output, and
    # `length` holds the unpadded caption lengths.
    input_lengths = torch.IntTensor([preds.size(0)]*current_batch_size).to(device)
    target_lengths = torch.IntTensor(length).to(device)

    loss = criterion(preds,caption,input_lengths,target_lengths)
    model.zero_grad()
    loss.backward()
    optimizer.step()

    batch_end = time.time()
    # The ETA extrapolates the duration of this batch to the remaining ones.
    sys.stdout.write('\rBatch %i/%i | Loss: %.3f | Time: %.3f s | ETA: %s'%(index+1,len(train_loader),loss.item(),(batch_end - batch_start),str(datetime.timedelta(seconds=(len(train_loader)-index-1)*(batch_end - batch_start)))))
  epoch_end = time.time()
  sys.stdout.write('\n')
  print("Epoch {} completed in {}".format(epoch+1,(epoch_end-epoch_start)))

Epoch 1 of 5
Batch 1966/1966 | Loss: 3.698 | Time: 0.020 s | ETA: 0:00:00
Epoch 1 completed in 649.6219935417175
Epoch 2 of 5
Batch 1146/1966 | Loss: 3.450 | Time: 0.118 s | ETA: 0:01:37.032752

In [None]:
torch.save(model.state_dict(), 'chkpt_epoch_5.pth')

In [None]:
!cp chkpt_epoch_5.pth /content/drive/MyDrive/PadhAI/

In [None]:
sample_out = model(image)

In [None]:
print("Input size",image.size())
print("Output size",sample_out.size())

Input size torch.Size([256, 1, 32, 100])
Output size torch.Size([26, 256, 131])


In [None]:
print(torch.IntTensor([sample_out.size(0)]*256))

tensor([26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
        26, 26, 26, 26, 26, 26, 26, 26, 

In [None]:
print(length.size())

torch.Size([256])


In [None]:
input_lengths = torch.IntTensor([len(c) for c in caption])

In [None]:
print(caption.size())

torch.Size([256, 8])


In [None]:
input_lengths.size()

torch.Size([256])

In [None]:
# Scratch re-computation of the CTC loss on one batch.
# NOTE(review): the names are swapped relative to the nn.CTCLoss signature
# (log_probs, targets, input_lengths, target_lengths): `target_lengths` holds
# the model output length and `input_lengths` the per-caption lengths.
# NOTE(review): `sample_out` is the raw model(image) output (no log_softmax is
# applied in these cells) and `input_lengths` is built from len(c) over the
# rows of the padded `caption`, so this value is presumably not comparable
# with the training loss — confirm before relying on it.
target_lengths = torch.IntTensor([sample_out.size(0)]*256)
length = torch.Tensor(length)
loss = criterion(sample_out,caption,target_lengths,input_lengths) / 256

In [None]:
print(loss.item())

0.0001962413953151554


In [None]:
# NOTE(review): this applies one optimizer step using the scratch `loss`
# computed above, i.e. running the cell changes the model weights.
model.zero_grad()
loss.backward()
optimizer.step()

# Inference on val dataset

In [None]:
!cp /content/drive/MyDrive/PadhAI/chkpt_epoch_5.pth .

In [None]:
# map_location moves the stored tensors onto the active device, so a checkpoint
# saved from a GPU session also loads on a CPU-only runtime.
# NOTE(review): strict=False ignores missing / unexpected keys; check the
# returned report rather than assuming the checkpoint matched.
model.load_state_dict(torch.load('chkpt_epoch_5.pth', map_location=device),strict=False)

<All keys matched successfully>

In [None]:
# Exact-match word accuracy on the validation set with greedy CTC decoding.
total_count_final = 0
total_correct = 0
# Switch to inference mode once, before the loop.
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for index,(image,caption,length) in enumerate(val_loader):
        batch_start = time.time()
        image = image.to(device)
        preds = model(image)
        # Most likely class per time step, then one row per sample.
        _, preds = preds.max(2)
        preds = preds.transpose(1,0).to('cpu')

        # Count the samples actually present: the final batch can be smaller
        # than the nominal batch size, which a fixed 256 over-counted.
        total_count_final += preds.size(0)
        count = 0
        for sample_idx,p in enumerate(preds):
            p = p.tolist()
            cap = caption[sample_idx].tolist()
            # Greedy CTC decode: skip blanks (id 0) and collapse consecutive
            # repeats. `previous` starts as None so the first time step is
            # never compared with the last one (index -1 wrap-around).
            resolved = ''
            previous = None
            for item in p:
                if item != 0 and item != previous:
                    resolved += hindi_vocabulary.get_vocab(item)
                previous = item
            # Ground truth: every non-padding id of the caption.
            c = ''.join(hindi_vocabulary.get_vocab(int(item)) for item in cap if item != 0)
            if resolved == c:
                count += 1
        total_correct += count

        batch_end = time.time()
        sys.stdout.write('\rBatch %i/%i | Acc: %.3f | Time: %.3f s'%(index+1,len(val_loader),total_correct/total_count_final,(batch_end - batch_start)))
    #break

Batch 658/658 | Acc: 0.783 | Time: 0.030 s