In [1]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from skimage import io

In [94]:
class customDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, index):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        image = io.imread(img_path)
        y_label = (self. annotations.iloc[index, 2])

        if self.transform:
            image = self.transform(image)

        return image, y_label

In [74]:
temp = pd.read_csv('./groundtruth.csv')
temp.iloc[0,2]

'FGJ235'

In [75]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision
from torch.utils.data import DataLoader

In [76]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [77]:
in_channel = 3
num_classes = 2
learning_rate = 1e-3
batch_size = 32
num_epochs = 10

In [99]:
# trans = transforms.Compose([transforms.Resize((32,32)),
#                                 transforms.ToTensor(), 
#                                 transforms.Normalize((0.5,),(0.5,))
#                                 ])

dataset = customDataset(
    csv_file="./groundtruth.csv",
    root_dir="./images/",
    transform=transforms.ToTensor()
)


In [103]:
for i in range(len(dataset)):
    sample = dataset[i]

    print(sample[0].shape)

AttributeError: 'tuple' object has no attribute 'resize'

In [82]:
class BidirectionalLSTM(nn.Module):

    def __init__(self, nIn, nHidden, nOut):
        super(BidirectionalLSTM, self).__init__()

        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        self.embedding = nn.Linear(nHidden * 2, nOut)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)

        output = self.embedding(t_rec)  # [T * b, nOut]
        output = output.view(T, b, -1)

        return output


class CRNN(nn.Module):

    def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()
        assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        ks = [3, 3, 3, 3, 3, 3, 2]
        ps = [1, 1, 1, 1, 1, 1, 0]
        ss = [1, 1, 1, 1, 1, 1, 1]
        nm = [64, 128, 256, 256, 512, 512, 512]

        cnn = nn.Sequential()

        def convRelu(i, batchNormalization=False):
            nIn = nc if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        convRelu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64
        convRelu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32
        convRelu(2, True)
        convRelu(3)
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16
        convRelu(4, True)
        convRelu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16
        convRelu(6, True)  # 512x1x16

        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        # conv features
        conv = self.cnn(input)
        b, c, h, w = conv.size()
        assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)  # [w, b, c]

        # rnn features
        output = self.rnn(conv)

        return output

In [100]:
train_set = dataset
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)

dataiter = iter(train_loader)
dataiter.next()

RuntimeError: stack expects each tensor to be equal size, but got [3, 160, 320] at entry 0 and [3, 158, 320] at entry 1

In [10]:
model = BidirectionalLSTM(320, 2, 10)
model

BidirectionalLSTM(
  (rnn): LSTM(320, 2, bidirectional=True)
  (embedding): Linear(in_features=4, out_features=10, bias=True)
)

In [11]:
sample = dataset[0]
output = model(sample[0])
output

tensor([[[ 0.2283, -0.3976, -0.3418,  ...,  0.4375, -0.2321, -0.5293],
         [ 0.2298, -0.3969, -0.3395,  ...,  0.4366, -0.2354, -0.5279],
         [ 0.1957, -0.4338, -0.4235,  ...,  0.4416, -0.1935, -0.6059],
         ...,
         [ 0.2037, -0.4779, -0.4608,  ...,  0.4206, -0.2363, -0.6230],
         [ 0.1844, -0.5082, -0.5027,  ...,  0.4166, -0.2470, -0.6445],
         [ 0.1763, -0.5140, -0.5084,  ...,  0.4163, -0.2562, -0.6411]],

        [[ 0.2428, -0.4037, -0.3377,  ...,  0.4291, -0.2480, -0.5250],
         [ 0.2445, -0.4026, -0.3348,  ...,  0.4282, -0.2517, -0.5233],
         [ 0.2119, -0.4360, -0.4140,  ...,  0.4339, -0.2082, -0.5983],
         ...,
         [ 0.2206, -0.4463, -0.4237,  ...,  0.4273, -0.2138, -0.6095],
         [ 0.2063, -0.4720, -0.4611,  ...,  0.4239, -0.2175, -0.6328],
         [ 0.1967, -0.4840, -0.4770,  ...,  0.4227, -0.2240, -0.6384]],

        [[ 0.2544, -0.4059, -0.3295,  ...,  0.4231, -0.2621, -0.5163],
         [ 0.2594, -0.4032, -0.3213,  ...,  0

In [33]:
# print(len(dataset))
# print(batch_size)
# len(train_loader)
plt.show()

FGJ235


In [14]:
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
lr_sche = optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma = 0.9)

In [36]:
for epoch in range(30):  # loop over the dataset multiple times
    running_loss = 0.0
    running_corrects = 0.0

    for i, data in enumerate(train_loader):
        inputs, labels = data
        # zero the parameter gradients
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        _, preds = torch.max(outputs, 1)
        running_loss += loss.item()
        running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss/len(train_loader)
        epoch_acc = running_corrects.float()/ len(train_loader)
    print('epoch :', (epoch+1))
    print('training loss: {:.4f}, acc {:.4f} '.format(epoch_loss, epoch_acc.item()))

print('Finished Training')

RuntimeError: stack expects each tensor to be equal size, but got [3, 153, 320] at entry 0 and [3, 154, 320] at entry 1

In [None]:
def val(net, dataset, criterion, max_iter=100):
    print('Start val')

    for p in model.parameters():
        p.requires_grad = False

    net.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=True, batch_size=batch_size, num_workers=int(-1))
    val_iter = iter(data_loader)

    i = 0
    n_correct = 0
    loss_avg = utils.averager()

    max_iter = min(max_iter, len(data_loader))
    for i in range(max_iter):
        data = val_iter.next()
        i += 1
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        t, l = converter.encode(cpu_texts)
        utils.loadData(text, t)
        utils.loadData(length, l)

        preds = crnn(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)

        _, preds = preds.max(2)
        preds = preds.squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred == target.lower():
                n_correct += 1

    raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))

    accuracy = n_correct / float(max_iter * opt.batchSize)
    print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))


def trainBatch(net, criterion, optimizer):
    data = train_iter.next()
    cpu_images, cpu_texts = data
    batch_size = cpu_images.size(0)
    utils.loadData(image, cpu_images)
    t, l = converter.encode(cpu_texts)
    utils.loadData(text, t)
    utils.loadData(length, l)

    preds = crnn(image)
    preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
    cost = criterion(preds, text, preds_size, length) / batch_size
    crnn.zero_grad()
    cost.backward()
    optimizer.step()
    return cost

In [None]:
for epoch in range(10):
    train_iter = iter(train_loader)
    i = 0
    while i < len(train_loader):
        for p in model.parameters():
            p.requires_grad = True
        model.train()

        cost = trainBatch(model, criterion, optimizer)
        loss_avg.add(cost)
        i += 1

        if i % opt.displayInterval == 0:
            print('[%d/%d][%d/%d] Loss: %f' %
                  (epoch, opt.nepoch, i, len(train_loader), loss_avg.val()))
            loss_avg.reset()

        if i % opt.valInterval == 0:
            val(crnn, test_dataset, criterion)

        # do checkpointing
        if i % opt.saveInterval == 0:
            torch.save(
                crnn.state_dict(), '{0}/netCRNN_{1}_{2}.pth'.format(opt.expr_dir, epoch, i))