[Article](https://www.dotlayer.org/en/training-rnn-using-pytorch/)

[Stanford RNN Reference](https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-recurrent-neural-networks)

In [17]:
import contextlib
import gzip
import os
import json
import re
import shutil
import sys
import warnings
from io import TextIOBase

import fasttext
import fasttext.util
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import cross_entropy
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader

from pathlib import Path

data_wd = Path.home() / '.data' / 'asl-signs' / 'sequence_padding'

# The Pytorch Model

In [19]:
dimension = 300
num_layer = 1
bidirectional = False

lstm_network = nn.LSTM(
    input_size=dimension,
    hidden_size=dimension,
    num_layers=num_layer,
    bidirectional=bidirectional,
    batch_first=True,
)

In [20]:
input_dim = dimension  # the output of the LSTM
tag_dimension = 8

fully_connected_network = nn.Linear(input_dim, tag_dimension)

In [21]:
device = torch.device("cuda:0")

batch_size = 128
lr = 0.1

epoch_number = 10

# The Dataset

In [22]:
def download_data(data_type):
    """
    Function to download the dataset using data_type to specify if we want the train, valid or test.
    """

    # hardcoded url to download the pickled dataset
    root_url = "https://dot-layer.github.io/blog-external-assets/train_rnn/{}.p"

    url = root_url.format(data_type)
    r = requests.get(url)

    with open(data_wd / f"{data_type}.p", "wb") as f:
        f.write(r.content)


# download_data("train")
# download_data("valid")
# download_data("test")

In [23]:
import builtins
import io
import pickle

safe_builtins = {
    'set',
    'list',
    'tuple'
}

class RestrictedUnpickler(pickle.Unpickler):

    def find_class(self, module, name):
        # Only allow safe classes from builtins.
        if module == "builtins" and name in safe_builtins:
            return getattr(builtins, name)
        # Forbid everything else.
        raise pickle.UnpicklingError("global '%s.%s' is forbidden" %
                                     (module, name))

def restricted_loads(s):
    """Helper function analogous to pickle.loads()."""
    return RestrictedUnpickler(io.BytesIO(s)).load()

In [24]:
# with open(data_wd / 'train.p', mode='rb') as f:
#     train_data = restricted_loads(f.read())
    
# with open(data_wd / 'valid.p', mode='rb') as f:
#     valid_data = restricted_loads(f.read())
    
# with open(data_wd / 'test.p', mode='rb') as f:
#     test_data = restricted_loads(f.read())

In [25]:
# with open(data_wd / 'train.json', 'w') as f:
#     json.dump(train_data, f)
    
# with open(data_wd / 'valid.json', 'w') as f:
#     json.dump(valid_data, f)
    
# with open(data_wd / 'test.json', 'w') as f:
#     json.dump(test_data, f)

In [26]:
with open(data_wd / 'train.json') as f:
    train_data = json.load(f)
    
with open(data_wd / 'valid.json') as f:
    valid_data = json.load(f)
    
with open(data_wd / 'test.json') as f:
    test_data = json.load(f)

In [27]:
len(train_data), len(valid_data), len(test_data)

(728789, 182198, 100000)

# Vectorize the Dataset

In [28]:
class LookForProgress(TextIOBase):
    def __init__(self, stdout):
        self.stdout = stdout
        self.regex = re.compile(r"([0-9]+(\.[0-9]+)?%)", re.IGNORECASE)

    def write(self, o):
        res = self.regex.findall(o)
        if len(res) != 0:
            print(f"\r{res[-1][0]}", end="", file=self.stdout)
            
            
class EmbeddingVectorizer:
    def __init__(self):
        """
        Embedding vectorizer
        """
        with contextlib.redirect_stdout(LookForProgress(sys.stdout)):
            # We use a context manager redirect to handle the broken download in
            # Jupyter notebook
            fasttext.util.download_model("fr", if_exists="ignore")
        self.embedding_model = fasttext.load_model("./cc.fr.300.bin")

    def __call__(self, address):
        """
        Convert address to embedding vectors
        :param address: The address to convert
        :return: The embeddings vectors
        """
        embeddings = []
        for word in address.split():
            embeddings.append(self.embedding_model[word])
        return embeddings

In [29]:
embedding_vectorizer = EmbeddingVectorizer()



In [30]:
addr_record = train_data[0][0]
addr_vector = embedding_vectorizer(addr_record)
addr_record, len(addr_vector), addr_vector[0].shape

('35 r de percé gatineau qc j8r 2e6', 8, (300,))

In [31]:
class DatasetBucket:
    def __init__(self, data, embedding_vectorizer):
        self.data = data
        self.embedding_vectorizer = embedding_vectorizer
        self.tags_set = {
            "StreetNumber": 0,
            "StreetName": 1,
            "Unit": 2,
            "Municipality": 3,
            "Province": 4,
            "PostalCode": 5,
            "Orientation": 6,
            "GeneralDelivery": 7,
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):  # We vectorize when data is asked
        data = self.data[item]
        return self._item_vectorizing(data)

    def _item_vectorizing(self, item):
        address = item[0]
        address_vector = self.embedding_vectorizer(address)

        tags = item[1]
        idx_tags = self._convert_tags_to_idx(tags)

        return address_vector, idx_tags

    def _convert_tags_to_idx(self, tags):
        idx_tags = []
        for tag in tags:
            idx_tags.append(self.tags_set[tag])
        return idx_tags
    
train_dataset_vectorizer = DatasetBucket(train_data, embedding_vectorizer)
valid_dataset_vectorizer = DatasetBucket(valid_data, embedding_vectorizer)
test_dataset_vectorizer = DatasetBucket(test_data, embedding_vectorizer)

In [32]:
address, tag = train_dataset_vectorizer[0]
tag

[0, 1, 1, 1, 3, 4, 5, 5]

In [33]:
def pad_collate_fn(batch):
    """
    The collate_fn that can add padding to the sequences so all can have
    the same length as the longest one.

    Args:
        batch (List[List, List]): The batch data, where the first element
        of the tuple is the word idx and the second element are the target
        label.

    Returns:
        A tuple (x, y). The element x is a tensor of packed sequence .
        The element y is a tensor of padded tag indices. The word vectors are
        padded with vectors of 0s and the tag indices are padded with -100s.
        Padding with -100 is done because of the cross-entropy loss and the
        accuracy metric ignores the targets with values -100.
    """

    # This gets us two lists of tensors and a list of integer.
    # Each tensor in the first list is a sequence of word vectors.
    # Each tensor in the second list is a sequence of tag indices.
    # The list of integer consist of the lengths of the sequences in order.
    sequences_vectors, sequences_labels, lengths = zip(
        *[
            (torch.FloatTensor(seq_vectors), torch.LongTensor(labels), len(seq_vectors))
            for (seq_vectors, labels) in sorted(
                batch, key=lambda x: len(x[0]), reverse=True
            )
        ]
    )

    lengths = torch.LongTensor(lengths)

    padded_sequences_vectors = pad_sequence(
        sequences_vectors, batch_first=True, padding_value=0
    )
    pack_padded_sequences_vectors = pack_padded_sequence(
        padded_sequences_vectors, lengths.cpu(), batch_first=True
    )  # We pack the padded sequence to improve the computational speed during training

    padded_sequences_labels = pad_sequence(
        sequences_labels, batch_first=True, padding_value=-100
    )

    return pack_padded_sequences_vectors, padded_sequences_labels

In [34]:
train_loader = DataLoader(
    train_dataset_vectorizer,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=pad_collate_fn,
    num_workers=4,
)
valid_loader = DataLoader(
    valid_dataset_vectorizer,
    batch_size=batch_size,
    collate_fn=pad_collate_fn,
    num_workers=4,
)
test_loader = DataLoader(
    test_dataset_vectorizer,
    batch_size=batch_size,
    collate_fn=pad_collate_fn,
    num_workers=2,
)


In [35]:
class RecurrentNet(nn.Module):
    def __init__(self, lstm_network, fully_connected_network):
        super().__init__()
        self.hidden_state = None

        self.lstm_network = lstm_network
        self.fully_connected_network = fully_connected_network

    def forward(self, packed_sequences_vectors):
        """
        Defines the computation performed at every call.

        Shapes:
            packed_sequence_vectors: batch_size * longest_sequence_length (padding), 300

        """
        lstm_out, self.hidden_state = self.lstm_network(packed_sequences_vectors)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)

        tag_space = self.fully_connected_network(lstm_out)
        return tag_space.transpose(-1, 1)  # We need to transpose since it's a sequence

In [37]:
model = RecurrentNet(lstm_network, fully_connected_network)

In [38]:
addr_vectors, label_vectors = next(iter(train_loader))

  (torch.FloatTensor(seq_vectors), torch.LongTensor(labels), len(seq_vectors))
  (torch.FloatTensor(seq_vectors), torch.LongTensor(labels), len(seq_vectors))
  (torch.FloatTensor(seq_vectors), torch.LongTensor(labels), len(seq_vectors))
  (torch.FloatTensor(seq_vectors), torch.LongTensor(labels), len(seq_vectors))


In [39]:
pad_packed_sequence(model.lstm_network(addr_vectors)[0], batch_first=True)

(tensor([[[-0.0020, -0.0307, -0.0408,  ..., -0.0066, -0.0041, -0.0457],
          [ 0.0245, -0.0144, -0.0243,  ..., -0.0119, -0.0015, -0.0477],
          [ 0.0674, -0.0033, -0.0151,  ..., -0.0259,  0.0529, -0.0423],
          ...,
          [-0.0216, -0.1179,  0.0279,  ..., -0.0391, -0.0184, -0.0488],
          [-0.0084, -0.0757,  0.0034,  ..., -0.0176, -0.0109, -0.0361],
          [ 0.0021, -0.0654,  0.0006,  ..., -0.0034,  0.0037, -0.0398]],
 
         [[-0.0014, -0.0335, -0.0325,  ..., -0.0015, -0.0128, -0.0471],
          [ 0.0225, -0.0139, -0.0179,  ..., -0.0089, -0.0057, -0.0476],
          [ 0.0650, -0.0021, -0.0111,  ..., -0.0234,  0.0508, -0.0415],
          ...,
          [-0.0278, -0.1195,  0.0308,  ..., -0.0364, -0.0188, -0.0564],
          [-0.0096, -0.0758,  0.0049,  ..., -0.0116, -0.0102, -0.0396],
          [-0.0047, -0.0668, -0.0145,  ..., -0.0143,  0.0039, -0.0536]],
 
         [[-0.0136, -0.0182, -0.0153,  ..., -0.0119,  0.0219,  0.0002],
          [-0.0013, -0.0186,