# Pytorch Implementation of a Keras Architecture 
by Amee Tan & Evan Chen 

### Goal:
The goal of this project was to take an existing Keras architecture and translate it into Pytorch. <br>
The Keras notebook can be found here:https://www.kaggle.com/samfc10/handwriting-recognition-using-crnn-in-keras

### Task: 
Given pictures of handwritten names, predict the name that was written in the picture. 

### Data: 
https://www.kaggle.com/landlord/handwriting-recognition <br>
Train: 331,059 images <br>
Valid: 41,382 images <br>


In [2]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.5.3-py3-none-any.whl (19 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.5.3


In [3]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from tqdm.notebook import tqdm

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchinfo import summary

# Mixed Precision Training
from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler


# Load Data

In [4]:
df_valid = pd.read_csv('/kaggle/input/handwriting-recognition/written_name_validation_v2.csv') # Locally
df_valid.head()

Unnamed: 0,FILENAME,IDENTITY
0,VALIDATION_0001.jpg,BILEL
1,VALIDATION_0002.jpg,LAUMIONIER
2,VALIDATION_0003.jpg,LEA
3,VALIDATION_0004.jpg,JEAN-ROCH
4,VALIDATION_0005.jpg,RUPP


In [5]:
df_train = pd.read_csv('/kaggle/input/handwriting-recognition/written_name_train_v2.csv')
df_test = pd.read_csv('/kaggle/input/handwriting-recognition/written_name_test_v2.csv')



In [6]:
# Are there any null values? 

print("Number of nulls in train:", df_train['IDENTITY'].isnull().sum())
print("Number of nulls in valid:",df_valid['IDENTITY'].isnull().sum())

Number of nulls in train: 565
Number of nulls in valid: 78


In [7]:
# Take a look at some of them 
df_train.loc[df_train['IDENTITY'].isna()]

Unnamed: 0,FILENAME,IDENTITY
1913,TRAIN_01914.jpg,
2129,TRAIN_02130.jpg,
2624,TRAIN_02625.jpg,
4628,TRAIN_04629.jpg,
4872,TRAIN_04873.jpg,
...,...,...
328491,TRAIN_328492.jpg,
328653,TRAIN_328654.jpg,
329959,TRAIN_329960.jpg,
330160,TRAIN_330161.jpg,


In [8]:
# Drop the rows with null values for the label (IDENTITY column)

df_train.dropna(inplace=True)
df_valid.dropna(inplace=True)
df_test.dropna(inplace=True)

In [9]:
# How many unreadable images are there in each set? 
print(len(df_train.loc[df_train['IDENTITY']=='UNREADABLE']))
print(len(df_valid.loc[df_valid['IDENTITY']=='UNREADABLE']))
print(len(df_test.loc[df_test['IDENTITY']=='UNREADABLE']))

102
12
11


In [10]:
# Remove the unreadable images from the train and valid sets

df_train = df_train[df_train['IDENTITY'] != 'UNREADABLE']
df_valid = df_valid[df_valid['IDENTITY'] != 'UNREADABLE']
df_test = df_test[df_test['IDENTITY'] != 'UNREADABLE']

df_train.reset_index(inplace = True, drop=True) 
df_valid.reset_index(inplace = True, drop=True)
df_test.reset_index(inplace = True, drop=True)



In [11]:
# There are some labels that are lowercase. Convert all labels to uppercase

df_train['IDENTITY'] = df_train['IDENTITY'].str.upper()
df_valid['IDENTITY'] = df_valid['IDENTITY'].str.upper()
df_test['IDENTITY'] = df_test['IDENTITY'].str.upper()

In [12]:
# How long is the longest name that we'll encounter? 

df_train['LABEL LENGTH'] = df_train['IDENTITY'].apply(lambda x: len(x))
df_valid['LABEL LENGTH'] = df_valid['IDENTITY'].apply(lambda x: len(x))
df_test['LABEL LENGTH'] = df_test['IDENTITY'].apply(lambda x: len(x))

print(df_train.describe()) # 34 for the training set 
print(df_valid.describe()) # 21 for the valid set
print(df_test.describe()) # 24 for the test set

        LABEL LENGTH
count  330294.000000
mean        6.546531
std         2.123296
min         1.000000
25%         5.000000
50%         6.000000
75%         7.000000
max        34.000000
       LABEL LENGTH
count  41280.000000
mean       6.556613
std        2.127069
min        1.000000
25%        5.000000
50%        6.000000
75%        7.000000
max       21.000000
       LABEL LENGTH
count  41289.000000
mean       6.545860
std        2.137525
min        1.000000
25%        5.000000
50%        6.000000
75%        7.000000
max       24.000000


# Prepare Images 

In [13]:
# Code borrowed from https://www.kaggle.com/samfc10/handwriting-recognition-using-crnn-in-keras

def preprocess(img):
    (h, w) = img.shape
    
    final_img = np.ones([64, 256])*255 # blank white image
    
    # Width and height are cropped if greater than 256x64; If smaler, image is padded with white pixesls
    if w > 256:
        img = img[:, :256]
        
    if h > 64:
        img = img[:64, :]
    
    
    final_img[:h, :w] = img
    # Rotate clockwise
    return cv2.rotate(final_img, cv2.ROTATE_90_CLOCKWISE)

# Prepare Labels: Convert names into a sequence of integers

In [14]:
# Code adapted from same notebook as above 

alphabets = u"ABCDEFGHIJKLMNOPQRSTUVWXYZ-' "
max_str_len = 64 # max length of input labels
num_of_characters = len(alphabets) + 1 # +1 for ctc pseudo blank
num_of_timestamps = 64 # max length of predicted labels 

def label_to_num(label):
    label_num = []
    for ch in label:
        label_num.append(alphabets.find(ch))
        
    return np.array(label_num)

def num_to_label(num):
    ret = ""
    for ch in num:
        if ch == -1:  # CTC Blank
            break
        else:
            ret+=alphabets[ch]
    return ret

In [15]:
def encode_label(row):
    label = np.zeros(max_str_len)
    for i in range(max_str_len):
        label[0:len(row)] = label_to_num(row)

    return label
    

In [16]:
df_train['ENCODED LABEL'] = df_train['IDENTITY'].apply(lambda x: encode_label(x))
df_valid['ENCODED LABEL'] = df_valid['IDENTITY'].apply(lambda x: encode_label(x))

df_train

Unnamed: 0,FILENAME,IDENTITY,LABEL LENGTH,ENCODED LABEL
0,TRAIN_00001.jpg,BALTHAZAR,9,"[1.0, 0.0, 11.0, 19.0, 7.0, 0.0, 25.0, 0.0, 17..."
1,TRAIN_00002.jpg,SIMON,5,"[18.0, 8.0, 12.0, 14.0, 13.0, 0.0, 0.0, 0.0, 0..."
2,TRAIN_00003.jpg,BENES,5,"[1.0, 4.0, 13.0, 4.0, 18.0, 0.0, 0.0, 0.0, 0.0..."
3,TRAIN_00004.jpg,LA LOVE,7,"[11.0, 0.0, 28.0, 11.0, 14.0, 21.0, 4.0, 0.0, ..."
4,TRAIN_00005.jpg,DAPHNE,6,"[3.0, 0.0, 15.0, 7.0, 13.0, 4.0, 0.0, 0.0, 0.0..."
...,...,...,...,...
330289,TRAIN_330957.jpg,LENNY,5,"[11.0, 4.0, 13.0, 13.0, 24.0, 0.0, 0.0, 0.0, 0..."
330290,TRAIN_330958.jpg,TIFFANY,7,"[19.0, 8.0, 5.0, 5.0, 0.0, 13.0, 24.0, 0.0, 0...."
330291,TRAIN_330959.jpg,COUTINHO DESA,13,"[2.0, 14.0, 20.0, 19.0, 8.0, 13.0, 7.0, 14.0, ..."
330292,TRAIN_330960.jpg,MOURAD,6,"[12.0, 14.0, 20.0, 17.0, 0.0, 3.0, 0.0, 0.0, 0..."


In [18]:
# Create a dataset 

class HandwritingDataset(Dataset):
    def __init__(self, df, folder_path):
        self.df = df
        self.folder_path = folder_path  # ex. '/content/train_v2/train/'
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # retrieve image
        path = self.folder_path+self.df.loc[idx,'FILENAME']
        
        # read the img
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        
        img = preprocess(img)
        
        # convert to [0,1] scale -> normalize
        img = torch.tensor(img / 255.).float()
        
        # Encode the label 
        label = torch.tensor(self.df.loc[idx,'ENCODED LABEL'])
        #label = torch.tensor(label_to_num(self.df.loc[idx,'IDENTITY'])) # Returns label as a sequence of numbers 
        label_length = self.df.loc[idx,'LABEL LENGTH']
        
        return img, label, label_length

In [19]:
df_train = df_train.iloc[:5000]
ds_train = HandwritingDataset(df_train, '/kaggle/input/handwriting-recognition/train_v2/train/')
next(iter(ds_train))

(tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         ...,
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 0.9686, 0.9961, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 0.9961, 0.9961]]),
 tensor([ 1.,  0., 11., 19.,  7.,  0., 25.,  0., 17.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=torch.float64),
 9)

In [20]:
df_valid = df_valid.iloc[:500]
ds_valid = HandwritingDataset(df_valid, '/kaggle/input/handwriting-recognition/validation_v2/validation/')

next(iter(ds_valid))

(tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]),
 tensor([ 1.,  8., 11.,  4., 11.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=torch.float64),
 5)

In [21]:
# Create dataloaders 

dl_train = DataLoader(ds_train, batch_size = 32, shuffle=True)
dl_valid = DataLoader(ds_valid, batch_size = 32, shuffle=False)

In [21]:
next(iter(dl_valid))

[tensor([[[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ...,

## CNN Architecture --> RNN Arhitecture


In [50]:
class CNN_RNN(nn.Module):
    """CNN and RNN model from class"""
    def __init__(self, mish=False):
        super().__init__()
        
        # same padding!
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        
        
        # pooling
        self.pool_1 = nn.MaxPool2d(kernel_size=2)
        self.pool_2 = nn.MaxPool2d(kernel_size=(1,2))
        
        # activation
        self.relu = nn.ReLU()
        

        # batchnorm
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)

        # Linear Layer (Dense Layer)
        self.linear1 = nn.Linear(in_features=1024, out_features=64)
        
        
        self.linear2 = nn.Linear(in_features=1024, out_features=30)
        
        
        self.lstm1 = nn.LSTM(input_size=64,hidden_size=512, 
                             batch_first=True, bidirectional=True, 
                             num_layers=2)
        
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        
        # CNN
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool_1(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.pool_1(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)
        x = self.pool_2(x)
        

        # CNN to RNN
        # Reshape to a sequence vector that is 64 wide and 1024 deep 
        batch_size = x.shape[0]


        x = torch.reshape(x,(batch_size,64,-1)) # or 1024 instead of -1?? 
        
        # Now we shrink the sequence vector to be 512 deep 
        x = self.linear1(x) 


        # RNN        
        x = self.lstm1(x)[0] #[0] to get outputs, not hidden

        # OUTPUT
        x = self.linear2(x) # torch.Size([2, 2, 30])

        return x


In [23]:
model = CNN_RNN()
summary(model, input_size = (2, 1, 256, 64), device='cpu')  # inputsize = (batch_size, channels, image length, image width)


Layer (type:depth-idx)                   Output Shape              Param #
CNN_RNN                                  --                        --
├─Conv2d: 1-1                            [2, 32, 256, 64]          320
├─BatchNorm2d: 1-2                       [2, 32, 256, 64]          64
├─ReLU: 1-3                              [2, 32, 256, 64]          --
├─MaxPool2d: 1-4                         [2, 32, 128, 32]          --
├─Conv2d: 1-5                            [2, 64, 128, 32]          18,496
├─BatchNorm2d: 1-6                       [2, 64, 128, 32]          128
├─ReLU: 1-7                              [2, 64, 128, 32]          --
├─MaxPool2d: 1-8                         [2, 64, 64, 16]           --
├─Dropout: 1-9                           [2, 64, 64, 16]           --
├─Conv2d: 1-10                           [2, 128, 64, 16]          73,856
├─BatchNorm2d: 1-11                      [2, 128, 64, 16]          256
├─ReLU: 1-12                             [2, 128, 64, 16]          --
├─Ma

In [24]:
def one_pass(model, dataloader, optimizer, backwards=True, print_loss=True):
    
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0
    total_correct_char = 0
    correct = 0
    correct_chars = 0
    total_chars = 0
    for img, labels, label_length in dataloader:
        
        # Send to GPU
        img = img.to(device)
        labels = labels.to(device)
        label_length = label_length.to(device)
        
        
        model.train()
        y_pred = model(img.unsqueeze(1))
        lsm = nn.LogSoftmax()
        y_pred = lsm(y_pred)

        yinput = y_pred.permute(1,0,2)  # input sequence length, batch_size, number of classes 

        N = labels.shape[0] # batch size 
        input_lengths = torch.ones(N,dtype=torch.long)*64

        loss = lossFun(yinput, labels, input_lengths, label_length)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
  
        # ACCURACY 
        
        pred_nums = torch.argmax(y_pred, dim=2)
        
        # Character accuracy
        for pred, label, length in zip(pred_nums, labels, label_length):
            length = length.item()
            pred = torch.split(pred, length, dim=0)
            pred = pred[0]
            label = torch.split(label, length, dim=0)
            label = label[0]
            correct_chars += torch.sum(pred==label) 
            total_chars += length

                    
        # Check if words are same
        for i in range(N):
            pr = pred_nums[i]
            tr = labels[i]
            if torch.equal(pr, tr.long()):
                correct +=1
                
    avg_loss = total_loss / len(dataloader)
    avg_correct_chars = correct_chars/total_chars
    
    return avg_loss, avg_correct_chars, correct

## Experimenting with batch size and learning rate

In [29]:
# Baseline 

model = CNN_RNN()
optimizer = optim.Adam(model.parameters(), lr = 0.001)
lossFun = nn.CTCLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device(0)
model = model.to(device)

num_epochs = 10

train_losses = []
valid_losses = []
train_correct_chars = []
valid_correct_chars = []
train_correct_words = []
valid_correct_words = []
for epoch in range(num_epochs):
    print('Epoch: ', epoch)
    
    train_avg_loss, train_avg_correct_chars, train_num_correct_words = one_pass(model, dl_train, optimizer)
    train_losses.append(train_avg_loss)
    train_correct_chars.append(train_avg_correct_chars)
    train_correct_words.append(train_num_correct_words)
    print("Train:")
    print("CTC Loss:", round(train_avg_loss,4))
    print("Percent correct characters per word:", round(train_avg_correct_chars.item(),4))
    print("Number of correct words:", train_num_correct_words)

    
    valid_avg_loss, valid_avg_correct_chars, valid_num_correct_words = one_pass(model, dl_valid, optimizer, backwards=False)
    valid_losses.append(valid_avg_loss)
    valid_correct_chars.append(valid_avg_correct_chars)
    valid_correct_words.append(valid_num_correct_words)
    print("Valid")
    print("CTC Loss", round(valid_avg_loss,4))
    print("Percent correct characters per word", round(valid_avg_correct_chars.item(),4))
    print("Number of correct words", valid_num_correct_words)
    print("")

Epoch:  0




Train:
CTC Loss: 26.8664
Percent correct characters per word: 0.0685
Number of correct words: 0
Valid
CTC Loss 25.644
Percent correct characters per word 0.0715
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 25.5211
Percent correct characters per word: 0.0663
Number of correct words: 0
Valid
CTC Loss 24.9334
Percent correct characters per word 0.0664
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 25.1546
Percent correct characters per word: 0.0667
Number of correct words: 1
Valid
CTC Loss 24.7102
Percent correct characters per word 0.0609
Number of correct words 0

Epoch:  3
Train:
CTC Loss: 26.0625
Percent correct characters per word: 0.0501
Number of correct words: 0
Valid
CTC Loss 25.4158
Percent correct characters per word 0.0514
Number of correct words 0

Epoch:  4
Train:
CTC Loss: 25.3862
Percent correct characters per word: 0.0665
Number of correct words: 0
Valid
CTC Loss 24.7793
Percent correct characters per word 0.0658
Number of correct words 0

Epoch:  5
Train:

In [43]:
def train_epochs_gpu(batch_size, lr):
    
    dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
    dl_valid = DataLoader(ds_valid, batch_size=batch_size, shuffle=False)
    
    model = CNN_RNN(mish=True)
    optimizer = optim.Adam(model.parameters(), lr=lr )
#    lossFun = nn.CTCLoss()
    
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     device = torch.device(0)
    model = model.to(device)

    num_epochs = 10

    train_losses = []
    valid_losses = []
    train_correct_chars = []
    valid_correct_chars = []
    train_correct_words = []
    valid_correct_words = []
    for epoch in range(num_epochs):
        print('Epoch: ', epoch)

        train_avg_loss, train_avg_correct_chars, train_num_correct_words = one_pass(model, dl_train, optimizer)
        train_losses.append(train_avg_loss)
        train_correct_chars.append(train_avg_correct_chars)
        train_correct_words.append(train_num_correct_words)
        print("Train:")
        print("CTC Loss:", round(train_avg_loss,4))
        print("Percent correct characters per word:", round(train_avg_correct_chars.item(),4))
        print("Number of correct words:", train_num_correct_words)


        valid_avg_loss, valid_avg_correct_chars, valid_num_correct_words = one_pass(model, dl_valid, optimizer, backwards=False)
        valid_losses.append(valid_avg_loss)
        valid_correct_chars.append(valid_avg_correct_chars)
        valid_correct_words.append(valid_num_correct_words)
        print("Valid")
        print("CTC Loss", round(valid_avg_loss,4))
        print("Percent correct characters per word", round(valid_avg_correct_chars.item(),4))
        print("Number of correct words", valid_num_correct_words)
        print("")
        
    return train_losses, valid_losses, valid_correct_chars, valid_correct_words

In [31]:
# Increase batch size to 64
train_losses1, valid_losses1, valid_correct_chars1, valid_correct_words1 = train_epochs_gpu(batch_size=64, lr=0.001)

Epoch:  0




Train:
CTC Loss: 33.803
Percent correct characters per word: 0.0746
Number of correct words: 0
Valid
CTC Loss 32.3174
Percent correct characters per word 0.0776
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 31.9879
Percent correct characters per word: 0.0647
Number of correct words: 0
Valid
CTC Loss 32.2615
Percent correct characters per word 0.0676
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 31.7875
Percent correct characters per word: 0.0682
Number of correct words: 0
Valid
CTC Loss 32.06
Percent correct characters per word 0.0645
Number of correct words 0

Epoch:  3
Train:
CTC Loss: 31.2737
Percent correct characters per word: 0.0734
Number of correct words: 0
Valid
CTC Loss 32.7448
Percent correct characters per word 0.0737
Number of correct words 0

Epoch:  4
Train:
CTC Loss: 31.1584
Percent correct characters per word: 0.073
Number of correct words: 0
Valid
CTC Loss 30.7186
Percent correct characters per word 0.0588
Number of correct words 0

Epoch:  5
Train:
CT

In [32]:
# Increase learning rate 
train_losses1, valid_losses1, valid_correct_chars1, valid_correct_words1 = train_epochs_gpu(batch_size=128, lr=0.01)

Epoch:  0




Train:
CTC Loss: 40.921
Percent correct characters per word: 0.0726
Number of correct words: 0
Valid
CTC Loss 38.8885
Percent correct characters per word 0.063
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 38.9995
Percent correct characters per word: 0.0594
Number of correct words: 0
Valid
CTC Loss 38.6706
Percent correct characters per word 0.049
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 38.4457
Percent correct characters per word: 0.0543
Number of correct words: 0
Valid
CTC Loss 38.3796
Percent correct characters per word 0.0536
Number of correct words 0

Epoch:  3
Train:
CTC Loss: 38.6351
Percent correct characters per word: 0.0491
Number of correct words: 0
Valid
CTC Loss 39.0022
Percent correct characters per word 0.0554
Number of correct words 0

Epoch:  4
Train:
CTC Loss: 38.2428
Percent correct characters per word: 0.0544
Number of correct words: 0
Valid
CTC Loss 38.0206
Percent correct characters per word 0.0645
Number of correct words 0

Epoch:  5
Train:
C

In [41]:
# Adding lr scheduler 

def one_pass(model, dataloader, optimizer, backwards=True, print_loss=True):
    
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 100)   
    lossFun = nn.CTCLoss()
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0
    total_correct_char = 0
    correct = 0
    correct_chars = 0
    total_chars = 0
    for img, labels, label_length in dataloader:
        
        # Send to GPU
        img = img.to(device)
        labels = labels.to(device)
        label_length = label_length.to(device)
        
        
        model.train()
        y_pred = model(img.unsqueeze(1))
        lsm = nn.LogSoftmax()
        y_pred = lsm(y_pred)

        yinput = y_pred.permute(1,0,2)  # input sequence length, batch_size, number of classes 

        N = labels.shape[0] # batch size 
        input_lengths = torch.ones(N,dtype=torch.long)*64

        loss = lossFun(yinput, labels, input_lengths, label_length)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

  
        # ACCURACY 
        
        pred_nums = torch.argmax(y_pred, dim=2)
        
        # Character accuracy
        for pred, label, length in zip(pred_nums, labels, label_length):
            length = length.item()
            pred = torch.split(pred, length, dim=0)
            pred = pred[0]
            label = torch.split(label, length, dim=0)
            label = label[0]
            correct_chars += torch.sum(pred==label) 
            total_chars += length

                    
        # Check if words are same
        for i in range(N):
            pr = pred_nums[i]
            tr = labels[i]
            if torch.equal(pr, tr.long()):
                correct +=1
                
    avg_loss = total_loss / len(dataloader)
    avg_correct_chars = correct_chars/total_chars
    
    return avg_loss, avg_correct_chars, correct

In [45]:
# Increase batch size to 64 and incorporate learning rate 
train_losses1, valid_losses1, valid_correct_chars1, valid_correct_words1 = train_epochs_gpu(batch_size=64, lr=0.001)

Epoch:  0




Train:
CTC Loss: 33.624
Percent correct characters per word: 0.0726
Number of correct words: 0
Valid
CTC Loss 31.7609
Percent correct characters per word 0.0801
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 31.7007
Percent correct characters per word: 0.0727
Number of correct words: 0
Valid
CTC Loss 30.8403
Percent correct characters per word 0.067
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 31.202
Percent correct characters per word: 0.0667
Number of correct words: 0
Valid
CTC Loss 31.1687
Percent correct characters per word 0.0682
Number of correct words 0

Epoch:  3
Train:
CTC Loss: 31.0435
Percent correct characters per word: 0.0702
Number of correct words: 5
Valid
CTC Loss 30.5653
Percent correct characters per word 0.0788
Number of correct words 0

Epoch:  4
Train:
CTC Loss: 31.034
Percent correct characters per word: 0.0673
Number of correct words: 4
Valid
CTC Loss 30.8227
Percent correct characters per word 0.0749
Number of correct words 1

Epoch:  5
Train:
CT

In [24]:
class CNN_RNN_no_dropout(nn.Module):
    """CNN and RNN model from class"""
    def __init__(self):
        super().__init__()
        
        # same padding!
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        
        # pooling
        self.pool_1 = nn.MaxPool2d(kernel_size=2)
        self.pool_2 = nn.MaxPool2d(kernel_size=(1,2))
        
        # activation
        self.relu = nn.ReLU()

        # dropout
        #self.dropout = nn.Dropout(p)

        # batchnorm
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)

        # Linear Layer (Dense Layer)
        self.linear1 = nn.Linear(in_features=1024, out_features=64)
        self.linear2 = nn.Linear(in_features=1024, out_features=30)

        # RNN Layer --> Single LSTM with num_layers=2
        #self.lstm1 = nn.LSTM(input_size=64, hidden_size=600, batch_first=True, bidirectional=True, num_layers=2, proj_size=512)
        self.lstm1 = nn.LSTM(input_size=64, hidden_size=512, batch_first=True, bidirectional=True, num_layers=2)

        self.unroll = nn.Flatten()
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        
        # CNN
        # Start with image that is 256 wide x 64 tall and 1 channel
        # End with 64 wide x 8 tall and 128 channels
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool_1(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.pool_1(x)
        #x = self.dropout(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)
        x = self.pool_2(x)
        #x = self.dropout(x)
        

        # CNN to RNN
        # Reshape to a sequence vector that is 64 wide and 1024 deep 
        batch_size = x.shape[0]


        x = torch.reshape(x,(batch_size,64,-1)) # or 1024 instead of -1?? 

        x = self.linear1(x) 
        
        x = self.lstm1(x)[0] #[0] to get outputs, not hidden

        # OUTPUT
        x = self.linear2(x) # torch.Size([2, 2, 30])

        return x

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device(0)

In [37]:
def train_epochs_gpu_no_dropout(batch_size, lr):
    
    dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
    dl_valid = DataLoader(ds_valid, batch_size=batch_size, shuffle=False)
    
    model = CNN_RNN_no_dropout()
    optimizer = optim.Adam(model.parameters(), lr=lr )    

    model = model.to(device)

    num_epochs = 10

    train_losses = []
    valid_losses = []
    train_correct_chars = []
    valid_correct_chars = []
    train_correct_words = []
    valid_correct_words = []
    for epoch in range(num_epochs):
        print('Epoch: ', epoch)

        train_avg_loss, train_avg_correct_chars, train_num_correct_words = one_pass(model, dl_train, optimizer)
        train_losses.append(train_avg_loss)
        train_correct_chars.append(train_avg_correct_chars)
        train_correct_words.append(train_num_correct_words)
        print("Train:")
        print("CTC Loss:", round(train_avg_loss,4))
        print("Percent correct characters per word:", round(train_avg_correct_chars.item(),4))
        print("Number of correct words:", train_num_correct_words)


        valid_avg_loss, valid_avg_correct_chars, valid_num_correct_words = one_pass(model, dl_valid, optimizer, backwards=False)
        valid_losses.append(valid_avg_loss)
        valid_correct_chars.append(valid_avg_correct_chars)
        valid_correct_words.append(valid_num_correct_words)
        print("Valid")
        print("CTC Loss", round(valid_avg_loss,4))
        print("Percent correct characters per word", round(valid_avg_correct_chars.item(),4))
        print("Number of correct words", valid_num_correct_words)
        print("")
        
    return train_losses, valid_losses, valid_correct_chars, valid_correct_words

In [42]:
# lr scheduler + no dropout 

train_losses2, valid_losses2, valid_correct_chars2, valid_correct_words2 = train_epochs_gpu_no_dropout(batch_size=64, lr=0.001)

Epoch:  0




Train:
CTC Loss: 33.3093
Percent correct characters per word: 0.0855
Number of correct words: 0
Valid
CTC Loss 31.8573
Percent correct characters per word 0.0843
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 31.9543
Percent correct characters per word: 0.0788
Number of correct words: 0
Valid
CTC Loss 31.2054
Percent correct characters per word 0.0767
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 31.7189
Percent correct characters per word: 0.0725
Number of correct words: 0
Valid
CTC Loss 31.1364
Percent correct characters per word 0.0734
Number of correct words 0

Epoch:  3
Train:
CTC Loss: 31.3784
Percent correct characters per word: 0.0701
Number of correct words: 0
Valid
CTC Loss 31.4014
Percent correct characters per word 0.0813
Number of correct words 0

Epoch:  4
Train:
CTC Loss: 31.2645
Percent correct characters per word: 0.0681
Number of correct words: 0
Valid
CTC Loss 30.9398
Percent correct characters per word 0.0566
Number of correct words 0

Epoch:  5
Train

In [51]:
train_losses3, valid_losses3, valid_correct_chars3, valid_correct_words3 = train_epochs_gpu(batch_size=64, lr=0.001)

Epoch:  0




Train:
CTC Loss: 34.0709
Percent correct characters per word: 0.0758
Number of correct words: 0
Valid
CTC Loss 32.5416
Percent correct characters per word 0.0706
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 32.5736
Percent correct characters per word: 0.0701
Number of correct words: 0
Valid
CTC Loss 31.773
Percent correct characters per word 0.0667
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 31.7367
Percent correct characters per word: 0.073
Number of correct words: 0
Valid
CTC Loss 31.3094
Percent correct characters per word 0.0667
Number of correct words 0

Epoch:  3
Train:
CTC Loss: 31.7342
Percent correct characters per word: 0.0605
Number of correct words: 0
Valid
CTC Loss 31.9319
Percent correct characters per word 0.0572
Number of correct words 0

Epoch:  4
Train:
CTC Loss: 32.0528
Percent correct characters per word: 0.0652
Number of correct words: 0
Valid
CTC Loss 31.8245
Percent correct characters per word 0.0551
Number of correct words 0

Epoch:  5
Train:


In [52]:
train_losses3, valid_losses3, valid_correct_chars3, valid_correct_words3 = train_epochs_gpu(batch_size=32, lr=0.001)

Epoch:  0




Train:
CTC Loss: 25.889
Percent correct characters per word: 0.0783
Number of correct words: 0
Valid
CTC Loss 24.5113
Percent correct characters per word 0.0743
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 25.3947
Percent correct characters per word: 0.0783
Number of correct words: 0
Valid
CTC Loss 26.5406
Percent correct characters per word 0.0916
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 25.2977
Percent correct characters per word: 0.0851
Number of correct words: 0
Valid
CTC Loss 24.1977
Percent correct characters per word 0.0728
Number of correct words 0

Epoch:  3
Train:
CTC Loss: 24.8869
Percent correct characters per word: 0.0765
Number of correct words: 0
Valid
CTC Loss 23.9747
Percent correct characters per word 0.0761
Number of correct words 0

Epoch:  4
Train:
CTC Loss: 25.275
Percent correct characters per word: 0.0756
Number of correct words: 0
Valid
CTC Loss 24.5837
Percent correct characters per word 0.0834
Number of correct words 0

Epoch:  5
Train:


# Train model on full dataset
- ReLU
- batch size = 32
- Cosine Annealing Learning Rate Scheduler 

In [None]:
# Train for real on 30 epochs 

ds_train = HandwritingDataset(df_train, '/kaggle/input/handwriting-recognition/train_v2/train/')
ds_valid = HandwritingDataset(df_valid, '/kaggle/input/handwriting-recognition/validation_v2/validation/')

dl_train = DataLoader(ds_train, batch_size = 64, shuffle=True)
dl_valid = DataLoader(ds_valid, batch_size = 64, shuffle=False)

train_losses, valid_losses, valid_correct_chars, valid_correct_words = train_epochs_gpu(batch_size=32, lr=0.001, num_epochs=20)

Epoch:  0




Train:
CTC Loss: 23.6313
Percent correct characters per word: 0.0763
Number of correct words: 15
Valid
CTC Loss 21.6213
Percent correct characters per word 0.0948
Number of correct words 0

Epoch:  1
Train:
CTC Loss: 20.2599
Percent correct characters per word: 0.1137
Number of correct words: 4
Valid
CTC Loss 18.4421
Percent correct characters per word 0.1325
Number of correct words 0

Epoch:  2
Train:
CTC Loss: 17.8262
Percent correct characters per word: 0.1421
Number of correct words: 1
Valid
CTC Loss 16.6317
Percent correct characters per word 0.1529
Number of correct words 1

Epoch:  3
Train:
CTC Loss: 16.9938
Percent correct characters per word: 0.1516
Number of correct words: 1
Valid
CTC Loss 19.8691
Percent correct characters per word 0.097
Number of correct words 1

Epoch:  4
Train:
CTC Loss: 18.8868
Percent correct characters per word: 0.106
Number of correct words: 9
Valid
CTC Loss 18.2685
Percent correct characters per word 0.1144
Number of correct words 1

Epoch:  5
Train:

### Note: 
Kaggle GPU timed out after epoch 14