In [39]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.5.2-py3-none-any.whl (18 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.5.2


In [111]:

import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from tqdm.notebook import tqdm

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchinfo import summary

# Mixed Precision Training
from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler

# import tensorflow as tf
# from keras import backend as K
# from keras.models import Model
# from keras.layers import Input, Conv2D, MaxPooling2D, Reshape, Bidirectional, LSTM, Dense, Lambda, Activation, BatchNormalization, Dropout
# from keras.optimizers import Adam

# Load Data

In [None]:
# Option 1: Original method
import zipfile

!unzip /content/handwriting.zip

Archive:  /content/handwriting.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of /content/handwriting.zip or
        /content/handwriting.zip.zip, and cannot find /content/handwriting.zip.ZIP, period.


In [None]:
os.listdir('/content/test_v2/test')[:5]

['TEST_32839.jpg',
 'TEST_18252.jpg',
 'TEST_13368.jpg',
 'TEST_14788.jpg',
 'TEST_2042.jpg']

In [None]:
# Option 2: From google drive 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
PATH_OF_DATA= '/content/drive/"My Drive"/validation'
!ls {PATH_OF_DATA}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 VALIDATION_22171.jpg	     VALIDATION_40912.jpg
 VALIDATION_22172.jpg	     VALIDATION_40913.jpg
 VALIDATION_22173.jpg	     VALIDATION_40914.jpg
 VALIDATION_22174.jpg	     VALIDATION_40915.jpg
 VALIDATION_22175.jpg	     VALIDATION_40916.jpg
 VALIDATION_22176.jpg	     VALIDATION_40917.jpg
 VALIDATION_22177.jpg	     VALIDATION_40918.jpg
 VALIDATION_22178.jpg	     VALIDATION_40919.jpg
 VALIDATION_22179.jpg	     VALIDATION_4091.jpg
 VALIDATION_2217.jpg	     VALIDATION_40920.jpg
 VALIDATION_22180.jpg	     VALIDATION_40921.jpg
 VALIDATION_22181.jpg	     VALIDATION_40922.jpg
 VALIDATION_22182.jpg	     VALIDATION_40923.jpg
 VALIDATION_22183.jpg	     VALIDATION_40924.jpg
 VALIDATION_22184.jpg	     VALIDATION_40925.jpg
 VALIDATION_22185.jpg	     VALIDATION_40926.jpg
 VALIDATION_22186.jpg	     VALIDATION_40927.jpg
 VALIDATION_22187.jpg	     VALIDATION_40928.jpg
 VALIDATION_22188.jpg	     VALIDATION_40929.jpg
 VALIDATION_22189.jpg	   

# Clean Data

In [78]:
#df_valid = pd.read_csv('/content/written_name_validation_v2.csv') # On drive 
df_valid = pd.read_csv('content/written_name_validation_v2.csv') # Locally
df_valid.head()

Unnamed: 0,FILENAME,IDENTITY
0,VALIDATION_0001.jpg,BILEL
1,VALIDATION_0002.jpg,LAUMIONIER
2,VALIDATION_0003.jpg,LEA
3,VALIDATION_0004.jpg,JEAN-ROCH
4,VALIDATION_0005.jpg,RUPP


In [79]:
# df_train = pd.read_csv('/content/written_name_train_v2.csv')
# df_test = pd.read_csv('/content/written_name_test_v2.csv')

#  Locally
df_train = pd.read_csv('content/written_name_train_v2.csv')
df_test = pd.read_csv('content/written_name_test_v2.csv')



In [80]:
# Are there any null values? 

print("Number of nulls in train:", df_train['IDENTITY'].isnull().sum())
print("Number of nulls in valid:",df_valid['IDENTITY'].isnull().sum())

Number of nulls in train: 565
Number of nulls in valid: 78


In [81]:
# Take a look at some of them 
df_train.loc[df_train['IDENTITY'].isna()]

Unnamed: 0,FILENAME,IDENTITY
1913,TRAIN_01914.jpg,
2129,TRAIN_02130.jpg,
2624,TRAIN_02625.jpg,
4628,TRAIN_04629.jpg,
4872,TRAIN_04873.jpg,
...,...,...
328491,TRAIN_328492.jpg,
328653,TRAIN_328654.jpg,
329959,TRAIN_329960.jpg,
330160,TRAIN_330161.jpg,


In [82]:
# Drop the rows with null values for the label (IDENTITY column)

df_train.dropna(inplace=True)
df_valid.dropna(inplace=True)
df_test.dropna(inplace=True)

In [83]:
# How many unreadable images are there in each set? 
print(len(df_train.loc[df_train['IDENTITY']=='UNREADABLE']))
print(len(df_valid.loc[df_valid['IDENTITY']=='UNREADABLE']))
print(len(df_test.loc[df_test['IDENTITY']=='UNREADABLE']))

102
12
11


In [84]:
# Remove the unreadable images from the train and valid sets
# Q: In the Kaggle notebook, they don't remove these from the test. Why? 

df_train = df_train[df_train['IDENTITY'] != 'UNREADABLE']
df_valid = df_valid[df_valid['IDENTITY'] != 'UNREADABLE']
df_test = df_test[df_test['IDENTITY'] != 'UNREADABLE']

df_train.reset_index(inplace = True, drop=True) 
df_valid.reset_index(inplace = True, drop=True)
df_test.reset_index(inplace = True, drop=True)



In [85]:
# There are some labels that are lowercase. Convert all labels to uppercase

df_train['IDENTITY'] = df_train['IDENTITY'].str.upper()
df_valid['IDENTITY'] = df_valid['IDENTITY'].str.upper()
df_test['IDENTITY'] = df_test['IDENTITY'].str.upper()

In [86]:
# How long is the longest name that we'll encounter? 

df_train['LABEL LENGTH'] = df_train['IDENTITY'].apply(lambda x: len(x))
df_valid['LABEL LENGTH'] = df_valid['IDENTITY'].apply(lambda x: len(x))
df_test['LABEL LENGTH'] = df_test['IDENTITY'].apply(lambda x: len(x))

print(df_train.describe()) # 34 for the training set 
print(df_valid.describe()) # 21 for the valid set
print(df_test.describe()) # 24 for the test set

        LABEL LENGTH
count  330294.000000
mean        6.546531
std         2.123296
min         1.000000
25%         5.000000
50%         6.000000
75%         7.000000
max        34.000000
       LABEL LENGTH
count  41280.000000
mean       6.556613
std        2.127069
min        1.000000
25%        5.000000
50%        6.000000
75%        7.000000
max       21.000000
       LABEL LENGTH
count  41289.000000
mean       6.545860
std        2.137525
min        1.000000
25%        5.000000
50%        6.000000
75%        7.000000
max       24.000000


# Prepare Images 

In [87]:
# Code borrowed from https://www.kaggle.com/samfc10/handwriting-recognition-using-crnn-in-keras

def preprocess(img):
    (h, w) = img.shape
    
    final_img = np.ones([64, 256])*255 # blank white image
    
    # Width and height are cropped if greater than 256x64; If smaler, image is padded with white pixesls
    if w > 256:
        img = img[:, :256]
        
    if h > 64:
        img = img[:64, :]
    
    
    final_img[:h, :w] = img
    # Rotate clockwise
    return cv2.rotate(final_img, cv2.ROTATE_90_CLOCKWISE)

# Prepare Labels: Convert names into a sequence of integers

In [188]:
# Code adapted from same notebook as above 

alphabets = u"ABCDEFGHIJKLMNOPQRSTUVWXYZ-' "
max_str_len = 64 # max length of input labels
num_of_characters = len(alphabets) + 1 # +1 for ctc pseudo blank
num_of_timestamps = 64 # max length of predicted labels 

def label_to_num(label):
    label_num = []
    for ch in label:
        label_num.append(alphabets.find(ch))
        
    return np.array(label_num)

def num_to_label(num):
    ret = ""
    for ch in num:
        if ch == -1:  # CTC Blank
            break
        else:
            ret+=alphabets[ch]
    return ret

In [89]:
# Make train/validation samaller for initial

In [90]:
name = 'AMEE'
print(name)
print(label_to_num(name))

AMEE
[ 0 12  4  4]


In [91]:
# Test out accessing each filename
df_train.loc[1,'FILENAME']

'TRAIN_00002.jpg'

In [92]:
df_train

Unnamed: 0,FILENAME,IDENTITY,LABEL LENGTH
0,TRAIN_00001.jpg,BALTHAZAR,9
1,TRAIN_00002.jpg,SIMON,5
2,TRAIN_00003.jpg,BENES,5
3,TRAIN_00004.jpg,LA LOVE,7
4,TRAIN_00005.jpg,DAPHNE,6
...,...,...,...
330289,TRAIN_330957.jpg,LENNY,5
330290,TRAIN_330958.jpg,TIFFANY,7
330291,TRAIN_330959.jpg,COUTINHO DESA,13
330292,TRAIN_330960.jpg,MOURAD,6


In [172]:
df_valid

Unnamed: 0,FILENAME,IDENTITY,LABEL LENGTH
0,VALIDATION_0001.jpg,BILEL,5
1,VALIDATION_0002.jpg,LAUMIONIER,10
2,VALIDATION_0003.jpg,LEA,3
3,VALIDATION_0004.jpg,JEAN-ROCH,9
4,VALIDATION_0005.jpg,RUPP,4
...,...,...,...
41275,VALIDATION_41366.jpg,CHAILLAN,8
41276,VALIDATION_41367.jpg,BAROUH,6
41277,VALIDATION_41368.jpg,MAXENCE,7
41278,VALIDATION_41369.jpg,HAMELIN,7


In [189]:
def encode_label(row):
    label = np.zeros(max_str_len)
    for i in range(max_str_len):
        label[0:len(row)] = label_to_num(row)

    return label
    

In [190]:
df_train['ENCODED LABEL'] = df_train['IDENTITY'].apply(lambda x: encode_label(x))
df_valid['ENCODED LABEL'] = df_valid['IDENTITY'].apply(lambda x: encode_label(x))

df_train

Unnamed: 0,FILENAME,IDENTITY,LABEL LENGTH,ENCODED LABEL
0,TRAIN_00001.jpg,BALTHAZAR,9,"[1.0, 0.0, 11.0, 19.0, 7.0, 0.0, 25.0, 0.0, 17..."
1,TRAIN_00002.jpg,SIMON,5,"[18.0, 8.0, 12.0, 14.0, 13.0, 0.0, 0.0, 0.0, 0..."
2,TRAIN_00003.jpg,BENES,5,"[1.0, 4.0, 13.0, 4.0, 18.0, 0.0, 0.0, 0.0, 0.0..."
3,TRAIN_00004.jpg,LA LOVE,7,"[11.0, 0.0, 28.0, 11.0, 14.0, 21.0, 4.0, 0.0, ..."
4,TRAIN_00005.jpg,DAPHNE,6,"[3.0, 0.0, 15.0, 7.0, 13.0, 4.0, 0.0, 0.0, 0.0..."
...,...,...,...,...
330289,TRAIN_330957.jpg,LENNY,5,"[11.0, 4.0, 13.0, 13.0, 24.0, 0.0, 0.0, 0.0, 0..."
330290,TRAIN_330958.jpg,TIFFANY,7,"[19.0, 8.0, 5.0, 5.0, 0.0, 13.0, 24.0, 0.0, 0...."
330291,TRAIN_330959.jpg,COUTINHO DESA,13,"[2.0, 14.0, 20.0, 19.0, 8.0, 13.0, 7.0, 14.0, ..."
330292,TRAIN_330960.jpg,MOURAD,6,"[12.0, 14.0, 20.0, 17.0, 0.0, 3.0, 0.0, 0.0, 0..."


In [164]:
# Code adapted from notebook -- WHERE DO WE USE THIS????

# TRAIN
train_y = np.ones([train_size, max_str_len]) * 0
train_label_len = np.zeros([train_size, 1])
train_input_len = np.ones([train_size, 1]) * (num_of_timestamps-2)
train_output = np.zeros([train_size])

for i in range(train_size):
    train_label_len[i] = len(df_train.loc[i, 'IDENTITY'])
    train_y[i, 0:len(df_train.loc[i, 'IDENTITY'])]= label_to_num(df_train.loc[i, 'IDENTITY'])

# VALID 
valid_y = np.ones([valid_size, max_str_len]) * -1
valid_label_len = np.zeros([valid_size, 1])
valid_input_len = np.ones([valid_size, 1]) * (num_of_timestamps-2)
valid_output = np.zeros([valid_size])

for i in range(valid_size):
    valid_label_len[i] = len(df_valid.loc[i, 'IDENTITY'])
    valid_y[i, 0:len(df_valid.loc[i, 'IDENTITY'])]= label_to_num(df_valid.loc[i, 'IDENTITY'])

# VERIFY        
print('True label : ',df_train.loc[49, 'IDENTITY'] , '\ntrain_y : ',train_y[49],'\ntrain_label_len : ',train_label_len[49], 
      '\ntrain_input_len : ', train_input_len[49])

True label :  LAQUERRIERE 
train_y :  [11.  0. 16. 20.  4. 17. 17.  8.  4. 17.  4.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.] 
train_label_len :  [11.] 
train_input_len :  [62.]


In [180]:
# Create a dataset 

class HandwritingDataset(Dataset):
    def __init__(self, df, folder_path):
        self.df = df
        self.folder_path = folder_path  # ex. '/content/train_v2/train/'
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # retrieve image
        path = self.folder_path+self.df.loc[idx,'FILENAME']
        
        # read the img
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        
        img = preprocess(img)
        
        # convert to [0,1] scale -> normalize
        img = torch.tensor(img / 255.).float()
        
        # Encode the label 
        label = torch.tensor(self.df.loc[idx,'ENCODED LABEL'])
        #label = torch.tensor(label_to_num(self.df.loc[idx,'IDENTITY'])) # Returns label as a sequence of numbers 
        label_length = self.df.loc[idx,'LABEL LENGTH']
        
        return img, label, label_length

In [95]:
train_size = 50
valid_size = 20

In [191]:
# ds_train = HandwritingDataset(df_train, '/content/train_v2/train/')

ds_train = HandwritingDataset(df_train, 'content/train_v2/train/')

next(iter(ds_train))

(tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         ...,
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 0.9686, 0.9961, 1.0000],
         [1.0000, 1.0000, 1.0000,  ..., 1.0000, 0.9961, 0.9961]]),
 tensor([ 1.,  0., 11., 19.,  7.,  0., 25.,  0., 17.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=torch.float64),
 9)

In [181]:
#ds_valid = HandwritingDataset(df_valid, '/content/validation_v2/validation/')
#ds_valid = HandwritingDataset(df_valid, '/content/drive/"My Drive"/validation') # Drive
ds_valid = HandwritingDataset(df_valid, 'content/validation_v2/validation/') # Locally

next(iter(ds_valid))

(tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]),
 tensor([ 1.,  8., 11.,  4., 11.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=torch.float64),
 5)

In [194]:
# Create dataloaders 

dl_train = DataLoader(ds_train, batch_size = 2, shuffle=True)
dl_valid = DataLoader(ds_valid, batch_size = 2, shuffle=False)

In [195]:
next(iter(dl_valid))

[tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]),
 tensor([[ 1.,  8., 11.,  4., 11.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [11.,  0., 20., 12.,  8., 14., 13.,  8.,  4., 17.,  0.,  0.,  0.,  0.,
         

## CNN Architecture --> RNN Arhitecture


In [144]:
class CNN_RNN(nn.Module):
    """CNN and RNN model from class"""
    def __init__(self, p=0.3):
        super().__init__()
        
        # same padding!
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        
        # # doing this to shrink size enough!
        # self.conv4 = nn.Conv2d(in_channels=128, out_channels=1, kernel_size=3, padding=1)
        
        # pooling
        self.pool_1 = nn.MaxPool2d(kernel_size=2)
        self.pool_2 = nn.MaxPool2d(kernel_size=(1,2))
        
        # activation
        self.relu = nn.ReLU()

        # dropout
        self.dropout = nn.Dropout(p)

        # batchnorm
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)

        # Linear Layer (Dense Layer)
        self.linear1 = nn.Linear(in_features=1024, out_features=64)
        self.linear2 = nn.Linear(in_features=1024, out_features=30)

        # RNN
        self.lstm1 = nn.LSTM(input_size=64, hidden_size=600, batch_first=True, bidirectional=True, proj_size=512, num_layers=2)
        #self.lstm1 = nn.LSTM(input_size=64, hidden_size=512, batch_first=True, bidirectional=True, num_layers=2)

        # Could also try a single LSTM with num_layers=2
        
        # for unrolling into FC layer
        self.unroll = nn.Flatten()
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        
        # CNN
        # Start with image that is 256 wide x 64 tall and 1 channel
        # End with 64 wide x 8 tall and 128 channels
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool_1(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.pool_1(x)
        x = self.dropout(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)
        x = self.pool_2(x)
        x = self.dropout(x)
        
        print("shape after CNN", x.shape)
        # CNN to RNN
        # Reshape to a sequence vector that is 64 wide and 1024 deep 
        batch_size = x.shape[0]
        print('batch_size', batch_size)
        
        #x = torch.reshape(x,(1,64,-1)) # or 1024 instead of -1?? 
        x = torch.reshape(x,(batch_size,64,-1)) # or 1024 instead of -1?? 
        #x = torch.reshape(x,(1,64,1024))
        print("shape after resize", x.shape)
        x = self.linear1(x) 
        # Now we shrink the sequence vector to be 512 deep 

        # RNN
        # RNN layer outputs a tuple, the output and the final hidden state
        # taking the final hidden state as output
        print("shape before LSTM", x.shape)
        
        x = self.lstm1(x)[0] #[0] to get outputs, not hidden
        print("after first LSTM")
        print(x.shape)
        print(x)
        # x = self.lstm2(x)[1]

        # OUTPUT
        x = self.linear2(x) # torch.Size([2, 2, 30])
        print("shape after Linear", x.shape)
        print(x)

        return self.softmax(x)

        # # unroll x for FC layer
        # x = self.linear1(self.unroll(x))
        # x = self.relu(x)
        # x = self.linear2(x)

In [145]:
model = CNN_RNN()
summary(model, input_size = (2, 1, 256, 64), device='cpu')  # inputsize = (batch_size, channels, image length, image width)


shape after CNN torch.Size([2, 128, 64, 8])
batch_size 2
shape after resize torch.Size([2, 64, 1024])
shape before LSTM torch.Size([2, 64, 64])
after first LSTM
torch.Size([2, 64, 1024])
tensor([[[-6.0126e-03, -1.4272e-03, -1.7771e-04,  ...,  2.2190e-02,
           6.8057e-04,  2.1031e-03],
         [-8.7793e-03, -2.0947e-03, -3.8742e-04,  ...,  2.2607e-02,
           6.4658e-04,  2.1253e-03],
         [-1.0169e-02, -2.3739e-03, -5.1360e-04,  ...,  2.2858e-02,
           5.7525e-04,  2.0963e-03],
         ...,
         [-1.1038e-02, -2.8731e-03, -1.1078e-03,  ...,  2.1842e-02,
          -4.8602e-05,  2.4985e-03],
         [-1.1226e-02, -3.0517e-03, -1.2100e-03,  ...,  1.9422e-02,
          -1.7145e-04,  2.4911e-03],
         [-1.1147e-02, -3.4148e-03, -1.6865e-03,  ...,  1.3678e-02,
          -8.7768e-05,  2.0570e-03]],

        [[-6.0277e-03, -1.4606e-03, -1.5450e-04,  ...,  2.2168e-02,
           6.1446e-04,  2.1041e-03],
         [-8.7888e-03, -2.1401e-03, -3.7656e-04,  ...,  2.2592

Layer (type:depth-idx)                   Output Shape              Param #
CNN_RNN                                  --                        --
├─Conv2d: 1-1                            [2, 32, 256, 64]          320
├─BatchNorm2d: 1-2                       [2, 32, 256, 64]          64
├─ReLU: 1-3                              [2, 32, 256, 64]          --
├─MaxPool2d: 1-4                         [2, 32, 128, 32]          --
├─Conv2d: 1-5                            [2, 64, 128, 32]          18,496
├─BatchNorm2d: 1-6                       [2, 64, 128, 32]          128
├─ReLU: 1-7                              [2, 64, 128, 32]          --
├─MaxPool2d: 1-8                         [2, 64, 64, 16]           --
├─Dropout: 1-9                           [2, 64, 64, 16]           --
├─Conv2d: 1-10                           [2, 128, 64, 16]          73,856
├─BatchNorm2d: 1-11                      [2, 128, 64, 16]          256
├─ReLU: 1-12                             [2, 128, 64, 16]          --
├─Ma

In [146]:
# the ctc loss function
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage
    y_pred = y_pred[:, 2:, :]
    return nn.CTCLoss(y_pred, labels, input_length, label_length)

In [None]:
# labels = Input(name='gtruth_labels', shape=[max_str_len], dtype='float32')
# input_length = Input(name='input_length', shape=[1], dtype='int64')
# label_length = Input(name='label_length', shape=[1], dtype='int64')

# ctc_loss = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
# model_final = Model(inputs=[input_data, labels, input_length, label_length], outputs=ctc_loss)


# ctc_lambda_func()

In [210]:
optimizer = optim.Adam(model.parameters(), lr = 0.001)

def one_pass(model, dataloader, optimizer, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for img, labels, label_length in dataloader:
        print('labels shape',labels.shape)
        model.train()
        print(img.shape)
        
        print(img.unsqueeze(-1).shape)
        print(np.transpose(img,(0,3,1,2)))
        y_pred = model(img.unsqueeze(-1))
        print('y_pred shape', y_pred.shape)
        input_length = np.ones(y_pred.shape[0])*y_pred.shape[1]
        loss = ctc_lambda_func(y_pred, labels, input_length, label_length)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

SyntaxError: invalid syntax (<ipython-input-210-a9568caa2ea3>, line 15)

In [211]:
train_loss = one_pass(model, dl_train, optimizer)
train_loss

labels shape torch.Size([2, 64])
torch.Size([2, 256, 64])
torch.Size([2, 256, 64, 1])


RuntimeError: Given groups=1, weight of size [32, 1, 3, 3], expected input[2, 256, 64, 1] to have 1 channels, but got 256 channels instead