# Predicting English word version of numbers using LSTM & GRU

## Data

In [0]:
from fastai.text import *
import pdb

In [0]:
# Mini-batch size: used when building the databunch and when sizing the models' recurrent state.
bs = 64

In [3]:
# Fetch the HUMAN_NUMBERS dataset through fastai's `untar_data`; `path` is the
# local dataset folder (its contents are listed in the output below).
path = untar_data(URLs.HUMAN_NUMBERS)
# Last expression of the cell: display the files found in that folder.
path.ls()

[PosixPath('/root/.fastai/data/human_numbers/valid.txt'),
 PosixPath('/root/.fastai/data/human_numbers/train.txt')]

In [0]:
def readnums(d):
  """Read the file `d` inside `path` and return its lines as one string.

  Every line is stripped of surrounding whitespace and the lines are joined
  with ', '. The joined text is returned wrapped in a single-element list.
  """
  # `with` guarantees the file handle is closed as soon as the lines are consumed.
  with open(path/d) as f:
    return [', '.join(o.strip() for o in f)]

train.txt gives us a sequence of numbers written out as English words:

In [5]:
train_txt = readnums('train.txt')
# Cell output: the first 80 characters of the joined training text.
train_txt[0][:80]

'one, two, three, four, five, six, seven, eight, nine, ten, eleven, twelve, thirt'

In [6]:
valid_txt = readnums('valid.txt')
# Cell output: the last 80 characters of the joined validation text.
valid_txt[0][-80:]

' nine thousand nine hundred ninety eight, nine thousand nine hundred ninety nine'

In [7]:
# Wrap the (single-item) training and validation texts in fastai TextLists.
train = TextList(train_txt, path=path)
valid = TextList(valid_txt, path=path)
# Pair the two lists as train/valid splits and label them for language modelling
# (presumably next-token targets; see fastai's `label_for_lm`).
src = ItemLists(path, train, valid).label_for_lm()

In [0]:
# Width of the token embedding vectors (second argument of nn.Embedding in the models).
wordvec_len = 100
# Hidden size of the recurrent layers/cells.
nh = 64
# Passed to the databunch as `bptt`; presumably the number of tokens per training sequence.
bptt = 20
data = src.databunch(bs=bs, bptt=bptt)
# Vocabulary size: number of entries in the training vocab's index-to-string table.
nv = len(data.train_ds.vocab.itos)

## nn.LSTM

In [0]:
??nn.LSTM()

In [0]:
class Model5(nn.Module):
  """Next-token model: embedding -> linear -> `nn.LSTM` -> batch norm -> linear decoder.

  The LSTM state (`self.h`, `self.c`) is stored on the module and fed back in on
  the next call, so each batch starts from the state the previous batch ended
  with. The state is detached after every forward pass, which stops gradients
  from flowing back into earlier batches.

  Sizes are taken from the notebook globals `nv` (vocabulary size),
  `wordvec_len` (embedding width), `nh` (hidden size) and `bs` (batch size).
  """
  def __init__(self):
    super().__init__()
    self.emb = nn.Embedding(nv, wordvec_len)
    self.input = nn.Linear(wordvec_len, nh)
    self.rnn = nn.LSTM(nh, nh, 1, batch_first=True)
    self.out = nn.Linear(nh, nv)
    self.bn = BatchNorm1dFlat(nh)
    # Initial state, shaped (num_layers=1, batch, hidden) for nn.LSTM.
    # Allocated without a hard-coded .cuda() so the model can also be built on a
    # CPU-only machine; forward() moves it to whatever device the input is on.
    self.h = torch.zeros(1, bs, nh)
    self.c = torch.zeros(1, bs, nh)

  def forward(self, x):
    """Return vocabulary scores for every position of `x` and update the stored state.

    `x` holds token ids, batch first (the LSTM is created with batch_first=True).
    """
    inp = self.input(self.emb(x))
    # `.to` returns the tensor unchanged once the state already lives on that device.
    state = (self.h.to(inp.device), self.c.to(inp.device))
    res, (h, c) = self.rnn(inp, state)
    # Keep the values for the next batch but cut the autograd history.
    self.h = h.detach()
    self.c = c.detach()
    return self.out(self.bn(res))

In [28]:
# Wrap the databunch and a fresh Model5 in a fastai Learner, tracking accuracy.
learn = Learner(data, Model5(), metrics=accuracy)
# Train for 10 epochs with fastai's one-cycle schedule (learning-rate argument 1e-2).
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.532648,2.220782,0.369247,00:00
1,1.800844,1.938452,0.466832,00:00
2,1.415609,1.785361,0.530114,00:00
3,0.96948,1.767252,0.665128,00:00
4,0.553732,1.824781,0.738991,00:00
5,0.299017,1.663509,0.79098,00:00
6,0.165372,1.51742,0.802131,00:00
7,0.096482,1.465203,0.803835,00:00
8,0.060338,1.486027,0.812358,00:00
9,0.04238,1.46533,0.808168,00:00


## nn.LSTMCell

In [0]:
# IPython introspection: show the source of nn.LSTMCell.
??nn.LSTMCell

In [0]:
def lstm_loop(cell, x, h):
  """Unroll an LSTM cell over the time axis (dim 1) of `x`.

  `cell` is called as `cell(x_t, (h, c))` and must return the next `(h, c)`.
  `h` is the initial `(hidden, cell)` state pair.

  Returns a two-element list `[hidden_states, cell_states]`; each entry stacks
  the per-step states along dim 1, i.e. [batch, time, hidden size].
  """
  hidden, memory = h
  hidden_steps, memory_steps = [], []
  for x_t in x.unbind(dim=1):  # one slice per time step
    hidden, memory = cell(x_t, (hidden, memory))
    hidden_steps.append(hidden)
    memory_steps.append(memory)
  # Put the time axis back at position 1.
  return [torch.stack(steps, dim=1) for steps in (hidden_steps, memory_steps)]

class Model6(Model5):
  """Model5 variant that steps an `nn.LSTMCell` through the sequence by hand.

  Reuses Model5's embedding, input projection, batch norm and decoder, but
  replaces the fused LSTM layer with an explicit time loop (`lstm_loop`).
  """
  def __init__(self):
    super().__init__()
    # forward() below never calls the nn.LSTM layer built by Model5, so drop it
    # instead of carrying its unused weights in the optimizer and state_dict.
    del self.rnn
    # A cell works on (batch, hidden) state: no leading num_layers axis.
    # Allocated without a hard-coded .cuda(); forward() moves it to the input's device.
    self.h = torch.zeros(bs, nh)
    self.c = torch.zeros(bs, nh)
    self.cell = nn.LSTMCell(nh, nh)

  def forward(self, x):
    """Return vocabulary scores for every position of `x` and update the stored state."""
    x = F.relu(self.input(self.emb(x)))
    h, c = lstm_loop(self.cell, x, (self.h.to(x.device), self.c.to(x.device)))
    # Carry only the last time step's state into the next batch, without its graph.
    self.h = h[:, -1].detach()
    self.c = c[:, -1].detach()
    return self.out(self.bn(h))

In [73]:
# Same training setup as before, now with the nn.LSTMCell-based Model6.
learn = Learner(data, Model6(), metrics=accuracy)
# Train for 10 epochs with fastai's one-cycle schedule (learning-rate argument 1e-2).
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.480802,2.253954,0.457386,00:00
1,1.78768,2.049083,0.342188,00:00
2,1.413622,1.818817,0.534091,00:00
3,0.916679,1.41472,0.710156,00:00
4,0.488267,1.304279,0.800071,00:00
5,0.253496,1.146594,0.803835,00:00
6,0.138924,1.145488,0.814418,00:00
7,0.08146,1.144085,0.81946,00:00
8,0.052895,1.114757,0.818892,00:00
9,0.03885,1.144648,0.816619,00:00


## Custom LSTMCell

Rebuild LSTMCell

Some of the following code comes from [emadRad](https://github.com/emadRad/lstm-gru-pytorch/blob/master/lstm_gru.ipynb):

In [0]:
class LSTMCell(nn.Module):
  """Hand-written LSTM cell: one time step of an LSTM.

  Two linear layers project the input and the previous hidden state to
  `4 * nh` values each; their sum is split into the four gates in the order
  (input, forget, output, candidate). Note that `nn.LSTMCell` stores its gates
  as (input, forget, candidate, output), so weights cannot be copied between
  the two without reordering.
  """
  def __init__(self, nin, nh):
    super().__init__()
    self.lin_x = nn.Linear(nin, 4 * nh)
    self.lin_h = nn.Linear(nh, 4 * nh)

  def forward(self, x, hc):
    """Advance the cell by one step.

    `x` is the input with `nin` features on its last axis; `hc` is the
    previous `(h, c)` pair with `nh` features on the last axis. Returns the new
    `(h, c)` tuple. The gates are split along the last axis, so both batched
    `(batch, features)` and unbatched `(features,)` tensors are accepted.
    """
    h, c = hc
    # Sum the two projections once, then split into the four gate pre-activations.
    gates = self.lin_x(x) + self.lin_h(h)
    i, f, o, g = gates.chunk(4, dim=-1)
    i = torch.sigmoid(i)  # input gate
    f = torch.sigmoid(f)  # forget gate
    o = torch.sigmoid(o)  # output gate
    g = torch.tanh(g)     # candidate cell values
    c_hat = f * c + i * g
    h_hat = o * torch.tanh(c_hat)
    return (h_hat, c_hat)

In [0]:
class Model6(Model5):
  """Model5 variant that steps the hand-written `LSTMCell` through the sequence.

  Re-declares `Model6` (shadowing the `nn.LSTMCell` version defined earlier in
  the notebook); the only difference is the cell implementation.
  """
  def __init__(self):
    super().__init__()
    # forward() below never calls the nn.LSTM layer built by Model5, so drop it
    # instead of carrying its unused weights in the optimizer and state_dict.
    del self.rnn
    # A cell works on (batch, hidden) state: no leading num_layers axis.
    # Allocated without a hard-coded .cuda(); forward() moves it to the input's device.
    self.h = torch.zeros(bs, nh)
    self.c = torch.zeros(bs, nh)
    self.cell = LSTMCell(nh, nh)

  def forward(self, x):
    """Return vocabulary scores for every position of `x` and update the stored state."""
    x = F.relu(self.input(self.emb(x)))
    h, c = lstm_loop(self.cell, x, (self.h.to(x.device), self.c.to(x.device)))
    # Carry only the last time step's state into the next batch, without its graph.
    self.h = h[:, -1].detach()
    self.c = c[:, -1].detach()
    return self.out(self.bn(h))

In [80]:
# Same training setup, now with the Model6 that uses the hand-written LSTMCell.
learn = Learner(data, Model6(), metrics=accuracy)
# Train for 10 epochs with fastai's one-cycle schedule (learning-rate argument 1e-2).
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.550474,2.243888,0.439631,00:00
1,1.786658,1.960842,0.481179,00:00
2,1.362301,1.657263,0.563707,00:00
3,0.88429,1.556831,0.71669,00:00
4,0.487119,1.466396,0.796307,00:00
5,0.257691,1.437768,0.809517,00:00
6,0.142321,1.408627,0.811861,00:00
7,0.083653,1.460158,0.81108,00:00
8,0.053673,1.502342,0.813565,00:00
9,0.038856,1.502961,0.8125,00:00


## nn.GRU

In [0]:
class Model7(nn.Module):
  """Next-token model: embedding -> linear -> `nn.GRU` -> batch norm -> linear decoder.

  The GRU hidden state `self.h` is stored on the module and fed back in on the
  next call, so each batch starts from the state the previous batch ended with.
  It is detached after every forward pass, which stops gradients from flowing
  back into earlier batches.

  Sizes are taken from the notebook globals `nv` (vocabulary size),
  `wordvec_len` (embedding width), `nh` (hidden size) and `bs` (batch size).
  """
  def __init__(self):
    super().__init__()
    self.emb = nn.Embedding(nv, wordvec_len)
    self.input = nn.Linear(wordvec_len, nh)
    self.rnn = nn.GRU(nh, nh, 1, batch_first=True)
    self.out = nn.Linear(nh, nv)
    self.bn = BatchNorm1dFlat(nh)
    # Initial state, shaped (num_layers=1, batch, hidden) for nn.GRU.
    # Allocated without a hard-coded .cuda() so the model can also be built on a
    # CPU-only machine; forward() moves it to whatever device the input is on.
    self.h = torch.zeros(1, bs, nh)

  def forward(self, x):
    """Return vocabulary scores for every position of `x` and update the stored state.

    `x` holds token ids, batch first (the GRU is created with batch_first=True).
    """
    inp = self.input(self.emb(x))
    # `.to` returns the tensor unchanged once the state already lives on that device.
    res, h = self.rnn(inp, self.h.to(inp.device))
    # Keep the values for the next batch but cut the autograd history.
    self.h = h.detach()
    return self.out(self.bn(res))

In [82]:
# Same training setup, now with the nn.GRU-based Model7.
learn = Learner(data, Model7(), metrics=accuracy)
# Train for 10 epochs with fastai's one-cycle schedule (learning-rate argument 1e-2).
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.60383,2.105865,0.452983,00:00
1,1.815322,1.835269,0.470099,00:00
2,1.474217,1.965154,0.546662,00:00
3,1.13725,2.210357,0.592188,00:00
4,0.748916,2.343477,0.690696,00:00
5,0.43723,1.932745,0.76456,00:00
6,0.247734,1.799907,0.804332,00:00
7,0.145794,1.692037,0.810511,00:00
8,0.092598,1.702543,0.805824,00:00
9,0.065691,1.729235,0.807386,00:00


## nn.GRUCell

Rebuild the GRU layer by stepping `nn.GRUCell` through the sequence with an explicit time loop.

In [0]:
def gru_loop(gru_cell, x, h):
  """Unroll a GRU cell over the time axis (dim 1) of `x`.

  `gru_cell` is called as `gru_cell(x_t, h)` and must return the next hidden
  state; `h` is the initial hidden state. Returns the per-step hidden states
  stacked along dim 1, i.e. [batch, time, hidden size].
  """
  steps = []
  for x_t in x.unbind(dim=1):  # one slice per time step
    h = gru_cell(x_t, h)
    steps.append(h)
  # Put the time axis back at position 1.
  return torch.stack(steps, dim=1)

class Model8(Model7):
  """Model7 variant that steps an `nn.GRUCell` through the sequence by hand.

  Reuses Model7's embedding, input projection, batch norm and decoder, but
  replaces the fused GRU layer with an explicit time loop (`gru_loop`).
  """
  def __init__(self):
    super().__init__()
    # forward() below never calls the nn.GRU layer built by Model7, so drop it
    # instead of carrying its unused weights in the optimizer and state_dict.
    del self.rnn
    # A cell works on (batch, hidden) state: no leading num_layers axis.
    # Allocated without a hard-coded .cuda(); forward() moves it to the input's device.
    self.h = torch.zeros(bs, nh)
    self.cell = nn.GRUCell(nh, nh)

  def forward(self, x):
    """Return vocabulary scores for every position of `x` and update the stored state."""
    x = F.relu(self.input(self.emb(x)))
    h = gru_loop(self.cell, x, self.h.to(x.device))
    # Carry only the last time step's state into the next batch, without its graph.
    self.h = h[:, -1].detach()
    return self.out(self.bn(h))

In [84]:
# Same training setup, now with the nn.GRUCell-based Model8.
learn = Learner(data, Model8(), metrics=accuracy)
# Train for 10 epochs with fastai's one-cycle schedule (learning-rate argument 1e-2).
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.522522,2.189934,0.377273,00:00
1,1.776763,1.80616,0.494247,00:00
2,1.272303,1.613085,0.612713,00:00
3,0.72343,1.423613,0.739062,00:00
4,0.384734,1.463009,0.779688,00:00
5,0.209081,1.550395,0.792045,00:00
6,0.120361,1.631862,0.783452,00:00
7,0.074887,1.622585,0.79517,00:00
8,0.051117,1.648417,0.787784,00:00
9,0.039105,1.590964,0.790199,00:00


## Custom GRUCell

Rebuild GRUCell

In [0]:
# IPython introspection: show the source of nn.GRUCell.
??nn.GRUCell

In [0]:
class GRUCell(nn.Module):
  """Hand-written GRU cell: one time step of a GRU.

  Two linear layers project the input and the previous hidden state to
  `3 * nh` values each, split in the order (reset, update, new).
  """
  def __init__(self, nin, nh):
    super().__init__()
    self.lin_x = nn.Linear(nin, 3 * nh)
    self.lin_h = nn.Linear(nh, 3 * nh)

  def forward(self, x, h):
    """Advance the cell by one step and return the new hidden state.

    `x` is the input with `nin` features on its last axis and `h` the previous
    hidden state with `nh` features on its last axis. The projections are split
    along the last axis, so both batched `(batch, features)` and unbatched
    `(features,)` tensors are accepted.
    """
    _x = self.lin_x(x)
    _h = self.lin_h(h)
    ir, iz, xin = _x.chunk(3, dim=-1)
    hr, hz, hn = _h.chunk(3, dim=-1)
    r = torch.sigmoid(ir + hr)  # reset gate
    z = torch.sigmoid(iz + hz)  # update gate
    # The reset gate scales only the hidden-state contribution to the candidate.
    n = torch.tanh(xin + r * hn)  # new gate
    # Blend: z keeps the old state, (1 - z) lets the candidate through.
    h_hat = (1 - z) * n + (z * h)
    return h_hat

In [0]:
class Model8(Model7):
  """Model7 variant that steps the hand-written `GRUCell` through the sequence.

  Re-declares `Model8` (shadowing the `nn.GRUCell` version defined earlier in
  the notebook); the only difference is the cell implementation.
  """
  def __init__(self):
    super().__init__()
    # forward() below never calls the nn.GRU layer built by Model7, so drop it
    # instead of carrying its unused weights in the optimizer and state_dict.
    del self.rnn
    # A cell works on (batch, hidden) state: no leading num_layers axis.
    # Allocated without a hard-coded .cuda(); forward() moves it to the input's device.
    self.h = torch.zeros(bs, nh)
    self.cell = GRUCell(nh, nh)

  def forward(self, x):
    """Return vocabulary scores for every position of `x` and update the stored state."""
    x = F.relu(self.input(self.emb(x)))
    h = gru_loop(self.cell, x, self.h.to(x.device))
    # Carry only the last time step's state into the next batch, without its graph.
    self.h = h[:, -1].detach()
    return self.out(self.bn(h))

In [88]:
# Same training setup, now with the Model8 that uses the hand-written GRUCell.
learn = Learner(data, Model8(), metrics=accuracy)
# Train for 10 epochs with fastai's one-cycle schedule (learning-rate argument 1e-2).
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.610216,2.169208,0.45973,00:00
1,1.814095,1.821466,0.418253,00:00
2,1.340642,1.814949,0.580256,00:00
3,0.823776,1.249176,0.748722,00:00
4,0.453387,1.306421,0.807742,00:00
5,0.249834,1.450365,0.80767,00:00
6,0.142717,1.255891,0.818466,00:00
7,0.086801,1.308531,0.821023,00:00
8,0.057675,1.356536,0.817187,00:00
9,0.042296,1.293155,0.818253,00:00


## END

RNNs are just a refactored, fully-connected neural network.

You can use the same approach for any sequence labeling task (part-of-speech tagging, classifying whether material is sensitive, etc.).