Resources:

http://forums.fast.ai/t/fastai-for-image-captioning/17003/8

https://github.com/githubharald/SimpleHTR/blob/master/src/DataLoader.py

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.fastai.conv_learner import *
from fastai.fastai.text import *

In [3]:
PATH = Path('data/IAM_handwriting')
TMP_PATH = PATH/'tmp'

In [4]:
!ls {PATH}

[34mascii[m[m           line_labels.csv [34mmodels[m[m          [34mtmp[m[m             [34mwords[m[m
[34mforms[m[m           [34mlines[m[m           [34msentences[m[m       word_labels.csv words.csv


In [5]:
from PIL import Image

def open_image(fname):
    """Open a line image stored under ``{PATH}/lines``.

    `fname` may be given with or without the ``.png`` extension; the
    extension is appended when it is missing. Returns whatever
    `Image.open` returns for the resolved path.
    """
    suffix = '.png'
    if not fname.endswith(suffix):
        fname = fname + suffix
    return Image.open(f'{PATH}/lines/{fname}')

In [None]:
open_image('r06-143-01')

# Lines

### Explore Data

In [None]:
# Parse the IAM `lines.txt` index. Fields are whitespace-separated; only
# columns 0, 1 and 8 are kept and named filename / result / value.
# The first 23 rows are skipped (presumably the file's header block -- confirm
# against the dataset if the file layout changes).
# `sep=r'\s+'` is the documented equivalent of the deprecated
# `delim_whitespace=True`; `header=None` is implied by passing `names`.
line_labels = pd.read_csv(
    f'{PATH}/ascii/lines.txt',
    sep=r'\s+',
    skiprows=23,
    usecols=[0,1,8],
    names=['filename','result','value'],
    escapechar='\\',
)

In [None]:
line_labels.tail()

In [None]:
# Derive a plain-text label column: every '|' in the raw `value` string is
# replaced by a space (presumably '|' is the dataset's word separator).
line_labels['text'] = line_labels.apply(lambda row: row.value.replace('|', ' '), axis=1)
line_labels.tail()

### Tokenize Labels

In [None]:
tokens = Tokenizer().proc_all_mp(partition_by_cores(line_labels.text))

In [None]:
np.argmax([len(o) for o in tokens])

In [None]:
tokens[4964], len(tokens[4964])

In [None]:
np.percentile([len(o) for o in tokens], 99)

### Numericalize Labels

In [None]:
freq = Counter(word for line in tokens for word in line)
freq.most_common(25)

In [None]:
max_vocab=40000
min_freq=2

In [None]:
# Vocabulary: the `max_vocab` most frequent tokens whose count is strictly
# greater than `min_freq`, preceded by three special tokens.
itos = [word for word,count in freq.most_common(max_vocab) if count>min_freq]
itos.insert(0, '_pad_')   # index 0
itos.insert(1, '_bos_')   # index 1
itos.insert(2, '_unk_')   # index 2

# Unknown tokens map to index 2 (`_unk_`); every line is prefixed with 1 (`_bos_`).
stoi = collections.defaultdict(lambda: 2, {v:k for k,v in enumerate(itos)})
# Lines have different lengths, so this is a ragged array: dtype=object is
# required on NumPy >= 1.24 (which otherwise raises) and is harmless on older versions.
ids = np.array([np.array([1] + [stoi[word] for word in line]) for line in tokens], dtype=object)

In [None]:
len(ids), len(itos)#, len(stoi)

In [None]:
def idstoline(ids):
    """Decode a sequence of vocabulary indices into a space-separated string via `itos`."""
    words = [itos[idx] for idx in ids]
    return ' '.join(words)

In [None]:
idstoline(ids[0])

In [None]:
# Persist the numericalized lines and the vocabulary so later sections can
# reload them without re-running tokenization.
TMP_PATH.mkdir(parents=True, exist_ok=True)   # make sure the cache directory exists
np.save(TMP_PATH/'ids.npy', ids)
with open(TMP_PATH/'itos.pkl', 'wb') as f:     # context manager closes the handle
    pickle.dump(itos, f)

In [None]:
# add ids to df
line_labels['ids'] = [' '.join(str(p) for p in o) for o in ids]
line_labels.tail()

### Load WikiText-103 pretrained model

In [None]:
pre_path = Path('data/aclImdb/models/wt103')
pre_lm_path = pre_path/'fwd_wt103.h5'

In [None]:
wgts = torch.load(pre_lm_path, map_location=lambda storage, loc: storage )

In [None]:
dec_wgts = to_np(wgts['1.decoder.weight'])
row_mean = dec_wgts.mean(0)
dec_wgts.shape

In [None]:
wiki_itos = pickle.load((pre_path/'itos_wt103.pkl').open('rb'))
wiki_stoi = collections.defaultdict(lambda: -1, {v:k for k,v in enumerate(wiki_itos)})

In [None]:
def create_emb(dec_wgts, itos, pad_idx=0, pretrained_stoi=None):
    """Build an `nn.Embedding` for `itos`, initialised from pretrained vectors.

    Args:
        dec_wgts: 2-D numpy array of pretrained vectors, one row per entry of
            the pretrained vocabulary; its second dimension sets the embedding size.
        itos: list of tokens of the target vocabulary; row `i` of the returned
            embedding corresponds to `itos[i]`.
        pad_idx: index of the padding token in `itos`. Defaults to 0 because this
            notebook builds `itos` with `_pad_` at 0 and `_bos_` at 1; the previous
            hard-coded `padding_idx=1` therefore froze the `_bos_` vector instead.
        pretrained_stoi: mapping token -> row index in `dec_wgts`. Defaults to the
            module-level `wiki_stoi`.

    Returns:
        An `nn.Embedding(len(itos), em_sz, padding_idx=pad_idx)`. Every row is
        overwritten: tokens found in `pretrained_stoi` get their pretrained
        vector, all others (including the pad row) get the mean pretrained vector.
    """
    if pretrained_stoi is None:
        pretrained_stoi = wiki_stoi
    row_mean = torch.from_numpy(dec_wgts.mean(0))
    em_sz = dec_wgts.shape[1]
    # embedding: simple lookup table - input=index; output=word vector
    emb = nn.Embedding(len(itos), em_sz, padding_idx=pad_idx)
    # write straight into the underlying tensor so no autograd history is recorded
    wgts = emb.weight.data
    for idx,word in enumerate(itos):
        # .get avoids inserting a new key into a defaultdict for every miss
        wiki_int = pretrained_stoi.get(word, -1)
        wgts[idx] = torch.from_numpy(dec_wgts[wiki_int]) if wiki_int >= 0 else row_mean

    return emb

In [None]:
emb_dec = create_emb(dec_wgts, itos)

In [None]:
torch.save(emb_dec, TMP_PATH/'embedding_decoder.h5')

### Save DF as CSV

In [6]:
emb_dec = torch.load(TMP_PATH/'embedding_decoder.h5')

In [7]:
# `ids` was saved as an object array (one variable-length array per line), so
# NumPy >= 1.16.3 needs allow_pickle=True to load it. Only load files this
# notebook wrote itself: unpickling untrusted data can execute arbitrary code.
ids = np.load(TMP_PATH/'ids.npy', allow_pickle=True)
with open(TMP_PATH/'itos.pkl', 'rb') as f:   # context manager closes the handle
    itos = pickle.load(f)

In [8]:
CSV = PATH/'line_labels.csv'

In [9]:
# line_labels.to_csv(CSV, columns=['filename', 'ids'], index=False)
csv = pd.read_csv(CSV)
csv.head()

Unnamed: 0,filename,ids
0,a01-000u-00,1 9 32 1311 7 500 41 4 949 39
1,a01-000u-01,1 2 111 65 162 121 1312
2,a01-000u-02,1 16 7 25 104 28 9 387 6 162
3,a01-000u-03,1 894 501 4 41 4 1623 834 46
4,a01-000u-04,1 205 136 9 2148 26 3 708


In [None]:
len(csv)

In [None]:
# CSV.open().readlines()[:5]

### Get val_idxs

In [10]:
# Hold out 15% of the rows for validation. A fixed random_state makes the
# split identical on every run, so results stay comparable across kernel restarts.
val_idxs = np.array(csv.sample(frac=0.15, random_state=42).index)
len(val_idxs)

2003

### Model Data (rotate, normalize)

In [11]:
sz = 300
bs = 50

In [12]:
# use this md object to load image data w/ transforms

# These values were generated initially with tfms_from_model(resnet34)
stats = A([ 0.92025,  0.92025,  0.92025], [ 0.12774,  0.12774,  0.12774])

aug_tfms = [RandomRotate(1, mode=0), RandomLighting(0.05, 0.05)]

tfms = tfms_from_stats(stats, sz, crop_type=CropType.NO, aug_tfms=aug_tfms)
data = ImageClassifierData.from_csv(PATH, 'lines', CSV, bs=bs, val_idxs=val_idxs, tfms=tfms, suffix='.png',
                                    continuous=True)

**Dataset**
- x: images (with aug_tfms)
- y: array of ints -> represent words in a line
    - collation settings: pre_pad=False, pad_idx=0 (TODO: confirm whether transpose_y is needed)

In [13]:
# pad ends of lines with pad token for language model
data.aug_dl.pre_pad=False
data.trn_dl.pre_pad=False
data.val_dl.pre_pad=False

In [14]:
denorm = data.trn_ds.denorm
x,y = next(iter(data.aug_dl))
x = denorm(x)

In [15]:
def idstoline(ids):
    """Turn a sequence of vocabulary indices back into text, joining the `itos` tokens with spaces."""
    decoded = [itos[token_id] for token_id in ids]
    return ' '.join(decoded)

In [None]:
ints = to_np(y[9]).astype(int)
idstoline(ints)

### View image transforms

In [None]:
def show_img(im, figsize=None, ax=None, alpha=None, title=None):
    """Draw `im` on an axis with the axis decorations hidden and return the axis.

    A new figure/axis pair is created with `plt.subplots(figsize=figsize)`
    when `ax` is falsy; otherwise the supplied axis is drawn on. `alpha` is
    forwarded to `imshow`, and `title` is applied only when it is truthy.
    """
    if ax:
        target = ax
    else:
        _fig, target = plt.subplots(figsize=figsize)
    target.imshow(im, alpha=alpha)
    target.set_axis_off()
    if title:
        target.set_title(title)
    return target

In [None]:
fig, axes = plt.subplots(4,1, figsize=(20, 10))
for i,ax in enumerate(axes.flat):
    ints = to_np(y[i]).astype(int)
    t = idstoline(ints)
#     t = data.classes[y[i]]
    ax=show_img(x[i], ax=ax, title=t)
    
plt.tight_layout(pad=0.1)

### Determine size of dataset (2000 x 200)

In [None]:
# create a dictionary comprehension of image sizes in the dataset
size_d = {k: PIL.Image.open(PATH/k).size for k in data.val_ds.fnames}

In [None]:
row_sz,col_sz = list(zip(*size_d.values()))

In [None]:
row_sz = np.array(row_sz); col_sz = np.array(col_sz)

In [None]:
plt.hist(row_sz)

In [None]:
plt.hist(col_sz)

## Initial Seq2Seq

In [None]:
# conv basis
# head_reg4 = nn.Sequential(Flatten(), nn.Linear(25088,em_sz))   # last layer has 7x7x512 in ResNet34
# learn = ConvLearner.pretrained(f_model, md, custom_head=head_reg4)

In [None]:
vs = len(itos)

In [None]:
# f = vgg19
# conv_model = nn.Sequential(*children(f(True))[:-1])

f = resnet34
conv_model = nn.Sequential(*children(f(True))[:6])

In [None]:
z = conv_model(T(x).permute(0,3,1,2)[0].unsqueeze(0))

In [None]:
z.size()

In [None]:
conv_model[-1][-1].bn2.num_features

**GRU**

Inputs: input, h_0
    - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features
      of the input sequence. The input can also be a packed variable length
      sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
      for details.
    - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
      containing the initial hidden state for each element in the batch.
      Defaults to zero if not provided.

Outputs: output, h_n
    - **output** of shape `(seq_len, batch, hidden_size * num_directions)`: tensor
      containing the output features h_t from the last layer of the GRU,
      for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
      given as the input, the output will also be a packed sequence.
    - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
      containing the hidden state for `t = seq_len`

In [None]:
em_sz = 400
# vocab_size = 4085
# batch_size = 50

In [None]:
class CNN_Encoder(nn.Module):
    """Encode a batch of images into one `em_sz`-dimensional vector per image.

    Pipeline: conv backbone -> adaptive max-pool to (p_num, p_num) -> flatten
    -> Linear(1000) + ReLU + BatchNorm -> Linear(em_sz).

    Args:
        conv_model: convolutional backbone; its last element must be a sequence
            of blocks whose final block exposes `bn2.num_features` (ResNet-style).
        em_sz: size of the returned feature vector.
        p_num: side length of the pooled feature map.
    """
    def __init__(self, conv_model, em_sz, p_num):
        super().__init__()
        # Keep the backbone as a submodule instead of reading the global in
        # forward(): train()/eval(), device moves and state_dict now cover it.
        # Its weights therefore also appear in .parameters(); freeze them
        # explicitly if the backbone should stay fixed.
        self.conv_model = conv_model
#         c = conv_model[-2].num_features  #vgg
        c = conv_model[-1][-1].bn2.num_features  #resnet

        self.pool = nn.AdaptiveMaxPool2d(p_num)      #(bs,c,p_num,p_num)
        self.fc1 = nn.Linear(c*(p_num**2), 1000)
        self.bn = nn.BatchNorm1d(1000, momentum=0.01)
        self.fc2 = nn.Linear(1000, em_sz)

    def forward(self, inp):
        """Return features of shape (bs, em_sz) for images `inp` of shape (bs, channels, h, w)."""
        feats = self.conv_model(inp)             #(bs,c,h',w')
        feats = self.pool(feats)                 #(bs,c,p_num,p_num)
        feats = feats.view(feats.size(0), -1)    #(bs,c*p_num**2)
        feats = self.bn(F.relu(self.fc1(feats))) #(bs,1000)
        return self.fc2(feats)                   #(bs,em_sz)

In [None]:
# based on Show,Attend,Tell - https://github.com/parksunwoo/show_attend_and_tell_pytorch/blob/master/model.py

class Seq2SeqCNN_RNN(nn.Module):
    """Image-to-sequence model: CNN encoder followed by a greedy GRU decoder.

    The encoder output is used as the initial GRU hidden state. The decoder
    then runs for a fixed `sl` steps, feeding its own arg-max prediction back
    in as the next input token.

    Args:
        conv_model: backbone handed to `CNN_Encoder`.
        vs: vocabulary size (number of output classes per step).
        em_sz: embedding / hidden size.
        sl: number of decoding steps.
        nl: number of GRU layers.
        p_num: pooled feature-map side length handed to `CNN_Encoder`.
    """
    def __init__(self, conv_model, vs, em_sz, sl=20, nl=1, p_num=7):
        super().__init__()
        # forward() reads these; `sl` was previously never stored, so the
        # decoder loop raised AttributeError on the first call.
        self.sl, self.nl = sl, nl

        self.encoder = CNN_Encoder(conv_model, em_sz, p_num)

        # pretrained embedding built earlier in the notebook (module-level `emb_dec`)
        self.emb = emb_dec #nn.Embedding.from_pretrained(emb_dec)
        # self.emb = nn.Embedding(vs, em_sz)
        self.gru = nn.GRU(em_sz, em_sz, num_layers=nl) #, dropout=0.1)
        self.drop = nn.Dropout(0.35)
        self.out = nn.Linear(em_sz, vs)

    def forward(self, inp):
        """Return un-normalised scores of shape (sl, bs, vs) for images `inp`."""
        ### Encoder (Conv Layer) ###
        feats = self.encoder(inp)
        bs,c = feats.size()   # read bs from the data: the last batch can be smaller

        # initial hidden state: one copy of the image features per GRU layer
        h = feats.unsqueeze(0).expand(self.nl, bs, c).contiguous()   #(nl,bs,em_sz)

        ### DECODER LOOP ###    one iteration per output token
        dec_inp = V(torch.ones(bs).long())   #(bs,) start every sequence with index 1 (`_bos_`)
        res = []

        for i in range(self.sl):
            emb = self.emb(dec_inp).unsqueeze(0)         #(1,bs,em_sz)
            outp, h = self.gru(emb, h)                   #(1,bs,em_sz), (nl,bs,em_sz)
            outp = self.out(self.drop(outp[0]))          #(bs,vs)
            res.append(outp)
            dec_inp = V(outp.data.max(1)[1])             #(bs,) greedy choice fed to the next step
#             if (dec_inp==0).all(): break       # 0: `_pad_` in this vocabulary => everything finished
        return torch.stack(res)                          #(sl,bs,vs)

In [None]:
# categorical cross entropy loss
# list of probabilities for each character in vocab; target is correct character

def seq2seq_loss(input, target):
    """Cross-entropy between decoder scores and padded/truncated target ids.

    Args:
        input: decoder scores of shape (sl_in, bs, nc).
        target: target ids of shape (bs, sl); any dtype castable to long.

    The target is transposed to (sl, bs), zero-padded at the end when it is
    shorter than the prediction (0 is `_pad_` in this notebook's vocabulary)
    and truncated when it is longer, so both always cover `sl_in` steps. The
    previous hard-coded `[:20]` only worked for a decoder length of exactly 20
    with targets no longer than that.
    """
    targ = target.transpose(0,1).contiguous()  # (bs,sl) -> (sl,bs) to line up with input
    sl,bs = targ.size()
    sl_in,bs_in,nc = input.size()

    # rank-2 tensor takes 4 padding values: (left, right, top, bottom);
    # only the bottom (end of sequence) is padded
    if sl_in>sl: targ = F.pad(targ, (0,0,0,sl_in-sl))

    # align to the prediction length, whatever it is
    targ = targ[:sl_in].long()
    # cross_entropy (LogSoftmax + NLLLoss) wants (N,nc) scores and (N,) targets
    return F.cross_entropy(input.view(-1,nc), targ.contiguous().view(-1))

In [None]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
rnn = Seq2SeqCNN_RNN(conv_model, vs, em_sz)
# SingleModel => way to handle learning rate groups -> treats whole thing as single group
# easy way to turn pytorch module into fastai model
learn = RNN_Learner(data, SingleModel(to_gpu(rnn)), opt_fn=opt_fn)
learn.crit = seq2seq_loss

In [None]:
learn.lr_find(start_lr=1e-7)
learn.sched.plot()

## Attention model

In [None]:
em_sz,sl = 400,20
# vocab_size = 4085
# batch_size = 50

### Hybrid try1

In [None]:
class CNN_Encoder(nn.Module):
    """Encode images into a sequence of spatial feature vectors for attention.

    Pipeline: conv backbone -> adaptive max-pool to (p_num, p_num) -> one
    `em_sz`-dimensional vector per pooled location.

    Args:
        conv_model: convolutional backbone; its last element must be a sequence
            of blocks whose final block exposes `bn2.num_features` (ResNet-style).
        em_sz: size of each location's feature vector.
        p_num: side length of the pooled feature map.
    """
    def __init__(self, conv_model, em_sz, p_num):
        super().__init__()
        # Keep the backbone as a submodule instead of reading the global in
        # forward(): train()/eval(), device moves and state_dict now cover it.
        # Its weights therefore also appear in .parameters(); freeze them
        # explicitly if the backbone should stay fixed.
        self.conv_model = conv_model
        # self.c = conv_model[-2].num_features  #vgg
        self.c = conv_model[-1][-1].bn2.num_features  #resnet

        self.pool = nn.AdaptiveMaxPool2d(p_num)      #(bs,c,p_num,p_num)
        self.fc = nn.Linear(self.c, em_sz)

    def forward(self, inp):
        """Return features of shape (p_num**2, bs, em_sz); location index is row*p_num + col."""
        feats = self.conv_model(inp)             #(bs,c,h',w')
        feats = self.pool(feats)                 #(bs,c,p_num,p_num)
        n = feats.size(0)                        # actual batch size (last batch can be smaller)
        # permute yields a non-contiguous tensor, which .view cannot reshape directly
        feats = feats.permute(2,3,0,1).contiguous().view(-1,n,self.c)  #(p_num**2,bs,c)
        feats = self.fc(feats)                   #(p_num**2,bs,em_sz)
        return feats

In [None]:
# based on Show,Attend,Tell - https://github.com/parksunwoo/show_attend_and_tell_pytorch/blob/master/model.py
def rand_t(*sz):
    """Return a standard-normal tensor of shape `sz`, divided by the square root of its first dimension."""
    scale = math.sqrt(sz[0])
    return torch.randn(sz) / scale


def rand_p(*sz):
    """Return a freshly drawn `rand_t(*sz)` tensor wrapped in an `nn.Parameter`."""
    init = rand_t(*sz)
    return nn.Parameter(init)

#class ATTN_Decoder(nn.Module):


class Attn_RNN(nn.Module):
    """CNN encoder + GRU decoder with additive attention over image locations.

    At every decoding step the previous hidden state is scored against each
    encoder location, the resulting weights pool the encoder features into a
    context vector, and that context (combined with the embedding of the
    previous token) is the GRU input.

    Changes relative to the earlier draft of this class:
      * scores are computed from the encoder features, not from the token
        embedding -- previously the softmax ran over an axis of size 1, so
        every location always received weight 1;
      * the GRU receives the attended vector as *input* and the running hidden
        state as *state* (they were swapped);
      * `sl`, `nl` and `em_sz` are stored on the module instead of being read
        from notebook globals, and the initial hidden state is batch-sized and
        wrapped with `V` like the other decoder inputs.

    Args:
        conv_model: backbone handed to `CNN_Encoder`.
        vs: vocabulary size.
        em_sz: embedding / hidden size.
        sl: number of decoding steps.
        nl: number of GRU layers.
        p_num: pooled feature-map side length handed to `CNN_Encoder`.
    """
    def __init__(self, conv_model, vs, em_sz, sl=20, nl=1, p_num=7):
        super().__init__()
        self.em_sz, self.sl, self.nl = em_sz, sl, nl
        self.encoder = CNN_Encoder(conv_model, em_sz, p_num)

        # pretrained embedding built earlier in the notebook (module-level `emb_dec`)
        self.emb = emb_dec #nn.Embedding.from_pretrained(emb_dec)
        # self.emb = nn.Embedding(vs, em_sz)
        self.gru = nn.GRU(em_sz, em_sz, num_layers=nl) #, dropout=0.1)
        self.drop = nn.Dropout(0.35)
        self.out = nn.Linear(em_sz, vs)

        # attention layers
        self.W1 = rand_p(em_sz, em_sz)          # projects encoder features
        self.l2 = nn.Linear(em_sz, em_sz)       # projects decoder hidden state
        self.l3 = nn.Linear(em_sz*2, em_sz)     # merges token embedding and context
        self.V = rand_p(em_sz)                  # scoring vector

    def forward(self, inp, ret_attn=False):
        """Return scores of shape (sl, bs, vs); with `ret_attn=True` also the
        attention weights of shape (sl, locations, bs)."""
        ### Encoder (Conv Layer) ###
        feats = self.encoder(inp)
        mask_sz,bs,c = feats.size()             # read bs from the data: the last batch can be smaller

        ### DECODER LOOP ###    one iteration per output token
        dec_inp = V(torch.zeros(bs).long())                 #(bs,) index 0 is `_pad_` in this vocabulary
        hidden = V(torch.zeros(self.nl, bs, self.em_sz))    #(nl,bs,em_sz)
        res,attns = [],[]

        # projection of the encoder features does not change between steps
        w1e = feats @ self.W1                               #(locations,bs,em_sz)

        for i in range(self.sl):
            w2h = self.l2(hidden[-1])                       #(bs,em_sz), broadcast over locations
            u = torch.tanh(w1e + w2h)                       #(locations,bs,em_sz)
            # softmax over the location axis: weights for each image sum to 1
            a = F.softmax(u @ self.V, 0)                    #(locations,bs)
            attns.append(a)
            Xa = (a.unsqueeze(2) * feats).sum(0)            #(bs,em_sz) attended image features

            emb = self.emb(dec_inp)                         #(bs,em_sz)
            wgt_enc = self.l3(torch.cat([emb, Xa], 1))      #(bs,em_sz)

            outp, hidden = self.gru(wgt_enc.unsqueeze(0), hidden)

            outp = self.out(self.drop(outp[0]))             #(bs,vs)
            res.append(outp)
            dec_inp = V(outp.data.max(1)[1])                #(bs,) greedy choice fed to the next step
#             if (dec_inp==0).all(): break       # 0: `_pad_` => everything finished
        res = torch.stack(res)                              #(sl,bs,vs)
        if ret_attn: return res, torch.stack(attns)
        return res

###  Pytorch seq2seq  

https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#attention-decoder

In [None]:
class Attn_RNN(nn.Module):
    """CNN encoder + attention GRU decoder, after the PyTorch seq2seq tutorial.

    The pooled feature map plays the role of the tutorial's `encoder_outputs`
    (one vector per spatial location), and a flattened projection of the same
    map is the initial decoder hidden state.

    Changes relative to the earlier draft of this class:
      * channel count is read ResNet-style (`conv_model[-1][-1].bn2`), matching
        the other models in this notebook; `conv_model[-2].num_features` only
        exists for the VGG slice;
      * attention runs over the `p_num**2` pooled locations -- before, `bmm`
        was given (1, bs, sl) x (1, bs, em_sz), which is not a valid product;
      * the GRU hidden state is 3-D and batch-sized;
      * `forward` runs the full `sl`-step greedy decoding loop and returns the
        stacked scores that `seq2seq_loss` expects, instead of a single-step
        `(output, hidden, attn_weights)` tuple;
      * the backbone is stored on the module rather than read from a global.

    Args:
        conv_model: ResNet-style convolutional backbone.
        vs: vocabulary size.
        em_sz: embedding / hidden size.
        sl: number of decoding steps.
        dropout_p: dropout applied to the token embedding.
        nl: accepted for signature compatibility; the GRU is single-layer.
        p_num: side length of the pooled feature map.
    """
    def __init__(self, conv_model, vs, em_sz, sl, dropout_p=0.1, nl=1, p_num=7):
        super(Attn_RNN, self).__init__()
        self.em_sz = em_sz
        self.vs = vs  # number of output classes per step
        self.dropout_p = dropout_p
        self.sl = sl
        self.attn_len = p_num**2   # number of image locations attended over

        # ENCODER
        self.conv_model = conv_model
        # c = conv_model[-2].num_features  #vgg
        c = conv_model[-1][-1].bn2.num_features  #resnet
        self.pool = nn.AdaptiveMaxPool2d(p_num)       #(bs,c,p_num,p_num)
        self.linear = nn.Linear(c*(p_num**2), em_sz)  # whole map -> initial hidden state
        self.bn = nn.BatchNorm1d(em_sz, momentum=0.01)
        self.feat_proj = nn.Linear(c, em_sz)          # per-location features for attention

        # DECODER
        self.embedding = nn.Embedding(self.vs, self.em_sz)
        self.attn = nn.Linear(self.em_sz * 2, self.attn_len)
        self.attn_combine = nn.Linear(self.em_sz * 2, self.em_sz)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.em_sz, self.em_sz)
        self.out = nn.Linear(self.em_sz, self.vs)

    def forward(self, inp, ret_attn=False):
        """Return scores of shape (sl, bs, vs); with `ret_attn=True` also the
        attention weights of shape (sl, bs, p_num**2)."""
        ### Encoder (Conv Layer) ###
        feats = self.pool(self.conv_model(inp))             #(bs,c,p_num,p_num)
        bs,c = feats.size(0), feats.size(1)                 # last batch can be smaller
        # initial decoder state from the flattened map
        hidden = self.bn(self.linear(feats.view(bs, -1))).unsqueeze(0)   #(1,bs,em_sz)
        # one feature vector per pooled location
        enc_out = feats.view(bs, c, -1).transpose(1, 2).contiguous()     #(bs,locations,c)
        enc_out = self.feat_proj(enc_out)                                #(bs,locations,em_sz)

        ### DECODER LOOP ###    one iteration per output token
        dec_inp = V(torch.zeros(bs).long())                 #(bs,) index 0 is `_pad_` in this vocabulary
        res,attns = [],[]

        for i in range(self.sl):
            # embed the previous token and apply dropout
            embedded = self.dropout(self.embedding(dec_inp))             #(bs,em_sz)

            # concatenate embedding and hidden state -> linear -> softmax over locations
            attn_weights = F.softmax(
                self.attn(torch.cat((embedded, hidden[0]), 1)), dim=1)   #(bs,locations)

            # weighted sum of the encoder features
            attn_applied = torch.bmm(attn_weights.unsqueeze(1), enc_out).squeeze(1)  #(bs,em_sz)

            # concatenate context and embedding -> linear -> relu
            output = torch.cat((embedded, attn_applied), 1)
            output = F.relu(self.attn_combine(output).unsqueeze(0))      #(1,bs,em_sz)

            output, hidden = self.gru(output, hidden)

            # raw scores: the loss applies log-softmax itself via cross_entropy
            scores = self.out(output[0])                                 #(bs,vs)
            res.append(scores)
            attns.append(attn_weights)
            dec_inp = V(scores.data.max(1)[1])                           #(bs,) greedy choice

        res = torch.stack(res)                                           #(sl,bs,vs)
        if ret_attn: return res, torch.stack(attns)
        return res

### Loss fn

In [None]:
# categorical cross entropy loss
# list of probabilities for each character in vocab; target is correct character

def seq2seq_loss(input, target):
    """Cross-entropy between decoder scores and padded/truncated target ids.

    Args:
        input: decoder scores of shape (sl_in, bs, nc).
        target: target ids of shape (bs, sl); any dtype castable to long.

    The target is transposed to (sl, bs), zero-padded at the end when it is
    shorter than the prediction (0 is `_pad_` in this notebook's vocabulary)
    and truncated when it is longer, so both always cover `sl_in` steps. The
    previous hard-coded `[:20]` only worked for a decoder length of exactly 20
    with targets no longer than that.
    """
    targ = target.transpose(0,1).contiguous()  # (bs,sl) -> (sl,bs) to line up with input
    sl,bs = targ.size()
    sl_in,bs_in,nc = input.size()

    # rank-2 tensor takes 4 padding values: (left, right, top, bottom);
    # only the bottom (end of sequence) is padded
    if sl_in>sl: targ = F.pad(targ, (0,0,0,sl_in-sl))

    # align to the prediction length, whatever it is
    targ = targ[:sl_in].long()
    # cross_entropy (LogSoftmax + NLLLoss) wants (N,nc) scores and (N,) targets
    return F.cross_entropy(input.view(-1,nc), targ.contiguous().view(-1))

In [None]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
rnn = Attn_RNN(conv_model, vs, em_sz, sl)
# SingleModel => way to handle learning rate groups -> treats whole thing as single group
# easy way to turn pytorch module into fastai model
learn = RNN_Learner(data, SingleModel(to_gpu(rnn)), opt_fn=opt_fn)
learn.crit = seq2seq_loss

In [None]:
learn.lr_find(start_lr=1e-7)
learn.sched.plot()

### Notes

https://github.com/tensorflow/models/blob/master/research/attention_ocr/python/model.py
https://www.tensorflow.org/api_docs/python/tf/contrib/legacy_seq2seq/attention_decoder

prev_attn = initialized to zero  
cell = GRU or LSTM  
input = A list of 2D Tensors [batch_size x input_size]  
attention_states = 3D Tensor [batch_size x attn_length x attn_size]  
prev_state = 2D Tensor with shape [batch_size x cell.state_size]  

First, we run the cell on a combination of the input and previous attention masks:  
* cell_output, new_state = cell(linear(input, prev_attn), prev_state)  

Then, we calculate new attention masks:  
* new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))  

and then we calculate the output:  
* output = linear(cell_output, new_attn)   

In [None]:
# NOTE(review): scratch sketch for the attention-decoder notes; not runnable as a cell.
# `self` is not defined at cell level and `input` here is Python's builtin,
# so the last two lines raise if executed. Presumably they are meant to live
# inside a module's forward() -- move them there or delete this cell.
prev_attn = T(torch.zeros(bs, em_sz))
prev_state = T(torch.zeros(bs, em_sz))
inp = self.embedding(input)
self.linear(inp)

### Bottom Up Attention

