# An illustration of character-based embedding lookup using a convolution layer

In many NLP tasks, words are resolved to embedding vectors via an embedding layer. The embedding layer may be initialized with pre-trained word vectors such as word2vec, or it may be randomly initialized and learnt during training.

In this illustration, word embedding vectors are composed from character-level embeddings, instead of a simple lookup. The process is as follows.
After pre-processing the sentences so that all sentences and all words are of equal length:
1. For each word, look up the embedding of each character of the word
2. Apply 1-D convolution layer to each word
3. Apply max pooling to get concise feature representation, which will serve as the word's embedding

The computed word embeddings can be fed into larger, deeper networks for NLP tasks such as neural machine translation (NMT).

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from ipython_utils import *
import random

## Setup

In [2]:
# constants/hyperparameters
BATCH_SIZE = 4
# every word is padded (if shorter) or truncated (if longer) to exactly MAX_WORD_LEN characters
MAX_WORD_LEN = 15
CHAR_EMBED_SIZE = 5
CONV_1D_KERNEL_SIZE = 4
CONV_1D_OUTPUT_FILTERS = 5

# miniature character vocabulary, sufficient for the purpose of illustration
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

# the two special tokens take the first two indices after the alphabet:
# <pad> fills up short words/sentences, <unk> stands for out-of-alphabet characters
PAD_TOKEN = len(alphabet)
UNK_TOKEN = PAD_TOKEN + 1

# character -> index lookup table; alphabet characters are numbered by position
char2index = dict(zip(alphabet, range(len(alphabet))))
char2index['<pad>'] = PAD_TOKEN
char2index['<unk>'] = UNK_TOKEN

print_h4('Character indices')
print(char2index)

# a handful of sample sentences; leading/trailing whitespace (if any) is stripped
sents_lang = [
    raw_sent.strip()
    for raw_sent in (
        'To be or not to be',
        'Programming is fun',
        'Light at the end of the tunnel.',
        'Aurora Borealis is indeed a spectacle!',
        'Mercury, Venus, Earth and Mars are rocky planets',
        'Jupiter and Saturn are gas giants',
        'Uranus and Neptune are ice giants',
        'Pluto, Makemake, and Ceres are dwarf planets',
    )
]

# shuffle in place and keep the first BATCH_SIZE sentences as the random batch
random.shuffle(sents_lang)
sents_lang_batch = sents_lang[:BATCH_SIZE]

print_h4('Random batch of sentences')
word_count_rows = [(text, len(text.split())) for text in sents_lang_batch]
print_table(['Sentence', '# words'], word_count_rows)

# split every sentence of the batch into its list of words
sents_lang_batch_words = [sent.split() for sent in sents_lang_batch]

# every sentence is padded to the word count of the longest sentence in the batch
max_sent_len = max(len(words) for words in sents_lang_batch_words)

# Encode the batch as nested lists of character indices such that
#   - every word has exactly MAX_WORD_LEN indices (truncated or right-padded with <pad>)
#   - every sentence has exactly max_sent_len words (padded with all-<pad> words)
sent_batch = []
for words in sents_lang_batch_words:
    encoded_sent = []
    for word in words:
        # truncate over-long words; characters outside the alphabet map to <unk>
        char_indices = [char2index.get(ch, UNK_TOKEN) for ch in word[:MAX_WORD_LEN]]
        # right-pad short words with <pad> (no-op when the word is already full length)
        char_indices.extend([PAD_TOKEN] * (MAX_WORD_LEN - len(char_indices)))
        encoded_sent.append(char_indices)

    # pad the sentence with extra all-<pad> words; each filler word is a fresh list
    # so that no two words of the batch alias the same list object
    n_pad_words = max_sent_len - len(encoded_sent)
    encoded_sent.extend([PAD_TOKEN] * MAX_WORD_LEN for _ in range(n_pad_words))
    sent_batch.append(encoded_sent)

# show each sentence of the batch next to its encoded form
print_table(['Sentence', 'Sentence with word indices'], zip(sents_lang_batch, sent_batch))

# validate to ensure correct sizes
for sent in sent_batch:
    assert len(sent) == max_sent_len
    for word in sent:
        assert len(word) == MAX_WORD_LEN

# finally convert the sentence batch into a tensor of shape
# (len(sents_lang_batch), max_sent_len, MAX_WORD_LEN)
input_batch = torch.tensor(sent_batch)

print_h4('Sentence batch as a tensor')
print(input_batch)
print('Shape = {}'.format(input_batch.shape))

#### Character indices

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25, 'a': 26, 'b': 27, 'c': 28, 'd': 29, 'e': 30, 'f': 31, 'g': 32, 'h': 33, 'i': 34, 'j': 35, 'k': 36, 'l': 37, 'm': 38, 'n': 39, 'o': 40, 'p': 41, 'q': 42, 'r': 43, 's': 44, 't': 45, 'u': 46, 'v': 47, 'w': 48, 'x': 49, 'y': 50, 'z': 51, '0': 52, '1': 53, '2': 54, '3': 55, '4': 56, '5': 57, '6': 58, '7': 59, '8': 60, '9': 61, '<pad>': 62, '<unk>': 63}


#### Random batch of sentences

|Sentence | # words|
|--|--|
| Programming is fun | 3 |
| Pluto, Makemake, and Ceres are dwarf planets | 7 |
| To be or not to be | 6 |
| Light at the end of the tunnel. | 7 |

|Sentence | Sentence with word indices|
|--|--|
| Programming is fun | [[15, 43, 40, 32, 43, 26, 38, 38, 34, 39, 32, 62, 62, 62, 62], [34, 44, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [31, 46, 39, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62]] |
| Pluto, Makemake, and Ceres are dwarf planets | [[15, 37, 46, 45, 40, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62], [12, 26, 36, 30, 38, 26, 36, 30, 63, 62, 62, 62, 62, 62, 62], [26, 39, 29, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [2, 30, 43, 30, 44, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [26, 43, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [29, 48, 26, 43, 31, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [41, 37, 26, 39, 30, 45, 44, 62, 62, 62, 62, 62, 62, 62, 62]] |
| To be or not to be | [[19, 40, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [27, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [40, 43, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [39, 40, 45, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 40, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [27, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62]] |
| Light at the end of the tunnel. | [[11, 34, 32, 33, 45, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [26, 45, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 33, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [30, 39, 29, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [40, 31, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 33, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 46, 39, 39, 30, 37, 63, 62, 62, 62, 62, 62, 62, 62, 62]] |

#### Sentence batch as a tensor

tensor([[[15, 43, 40, 32, 43, 26, 38, 38, 34, 39, 32, 62, 62, 62, 62],
         [34, 44, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [31, 46, 39, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62]],

        [[15, 37, 46, 45, 40, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [12, 26, 36, 30, 38, 26, 36, 30, 63, 62, 62, 62, 62, 62, 62],
         [26, 39, 29, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [ 2, 30, 43, 30, 44, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [26, 43, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [29, 48, 26, 43, 31, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62],
         [41, 37, 26, 39, 30, 45, 44, 62, 62, 62, 62, 62, 62, 62, 62]],

  

In [3]:
# setup a convolution layer to compute word embeddings
class CharBasedCNN(nn.Module):
    """
    Computes word embeddings from character embeddings.

    Each word is given as a fixed-length sequence of character indices. The characters
    are embedded, a 1-D convolution followed by ReLU is run along the characters of
    each word, and the result is max-pooled over all character positions, leaving one
    vector of size conv_output_channels per word.
    """

    def __init__(self, char_embed_size, n_char_embeddings,
                 conv_output_channels, conv_kernel_size,
                 use_maxpool_1d=False):
        """
        char_embed_size: size of character embedding vector (hyperparameter)
        n_char_embeddings: no.of character embeddings = no.of chars in the alphabet + 2 (<pad> & <unk>)
        conv_output_channels: no.of output channels from convolution layer (hyperparameter);
                              this is also the size of the resulting word embedding
        conv_kernel_size: convolution kernel size (hyperparameter)
        use_maxpool_1d: only used to compare max pooling via max_pool1d and torch.max;
                        both code paths produce the same result
        """
        super().__init__()
        self.use_maxpool_1d = use_maxpool_1d
        self.embedding = nn.Embedding(num_embeddings=n_char_embeddings,
                                      embedding_dim=char_embed_size)
        self.conv1d = nn.Conv1d(in_channels=char_embed_size, out_channels=conv_output_channels,
                                kernel_size=conv_kernel_size)

    def forward(self, input):
        """
        Input is a tensor of character indices of dimension (batch_size, sentence_length, word_length)
        batch_size = no.of sentences in the batch
        sentence_length = no.of words in the sentence
        word_length = no.of chars in the word

        Returns a tensor of dimension (batch_size, sentence_length, conv_output_channels),
        i.e. one embedding vector per word. The intermediate shapes are printed for
        illustration.
        """
        print('input shape = {}'.format(input.shape))

        # lookup character embeddings
        # shape after resolving embeddings will be a 4-d tensor of shape
        # (batch_size, sentence_length, word_length, char_embed_size)
        input_char_embed_4d = self.embedding(input)
        print('char embeded input 4d shape = {}'.format(input_char_embed_4d.shape))

        batch_size = input_char_embed_4d.shape[0]
        sentence_length = input_char_embed_4d.shape[1]

        # Input to nn.Conv1d must be a 3d tensor
        # merge 1st 2 dimensions into a single dimension
        # resulting tensor is a 3-d tensor of shape
        # (word_count, word_length, char_embed_size)
        # word_count = batch_size * sentence_length
        input_char_embed_3d = input_char_embed_4d.view(batch_size * sentence_length,
                                                       input_char_embed_4d.shape[2],
                                                       input_char_embed_4d.shape[3])
        print('char embeded input 3d shape = {}'.format(input_char_embed_3d.shape))

        # nn.Conv1d convolves along the last dimension
        # We need the convolution to run on characters of a word -- the 2nd dimension
        # Transpose the tensor to get it into required shape
        # (word_count, char_embed_size, word_length)
        conv_input = input_char_embed_3d.transpose(dim0=1, dim1=2)
        print('conv1d input shape = {}'.format(conv_input.shape))

        # shape: (word_count, conv_output_channels, word_length - conv_kernel_size + 1)
        conv_out = F.relu_(self.conv1d(conv_input))
        print('conv1d output shape = {}'.format(conv_out.shape))

        if self.use_maxpool_1d:
            # a pooling window as wide as the conv output leaves exactly one value per channel
            maxpool_out = F.max_pool1d(conv_out, kernel_size=conv_out.shape[-1], stride=1)
            print('maxpool out shape = {}'.format(maxpool_out.shape))

            # drop only the pooled dimension; a bare squeeze() would also drop the
            # word_count/channel dimension whenever one of them happens to be 1
            maxpool_out = maxpool_out.squeeze(dim=-1)
            print('maxpool out squeeze shape = {}'.format(maxpool_out.shape))
        else:
            maxpool_out, _ = conv_out.max(dim=2)
            print('maxpool out shape = {}'.format(maxpool_out.shape))

        # split word_count back into (batch_size, sentence_length); the channel count is
        # passed explicitly because -1 cannot be inferred for an empty batch
        maxpool_out_reshape = maxpool_out.view(batch_size, sentence_length, conv_out.shape[1])
        print('maxpool out final shape = {}\n'.format(maxpool_out_reshape.shape))

        return maxpool_out_reshape

In [4]:
ch_cnn = CharBasedCNN(char_embed_size=CHAR_EMBED_SIZE,
                      n_char_embeddings=len(char2index),
                      conv_output_channels=CONV_1D_OUTPUT_FILTERS,
                      conv_kernel_size=CONV_1D_KERNEL_SIZE)

# first pass: pooling via torch.max (the default code path)
print_h4('Computing embeddings. Using torch.max() instead of nn.MaxPool1d')
embeddings_1 = ch_cnn(input_batch)

# second pass: same weights, pooling via max-pool layer
print_h4('Computing embeddings. Using nn.MaxPool1d instead of torch.max()')
ch_cnn.use_maxpool_1d = True
embeddings_2 = ch_cnn(input_batch)

# the two pooling variants must agree element for element
assert torch.equal(embeddings_1, embeddings_2)

# cross-check: running embedding -> conv1d -> relu -> max by hand on one sentence
# at a time must reproduce the corresponding slice of the batched result
for sent_chars, batch_emb_max, batch_emb_pool in zip(input_batch, embeddings_1, embeddings_2):
    # (sentence_length, word_length) -> (sentence_length, word_length, CHAR_EMBED_SIZE),
    # then transposed so that conv1d runs along the characters of each word
    per_sent_conv_in = ch_cnn.embedding(sent_chars).transpose(1, 2)
    per_sent_conv_out = F.relu(ch_cnn.conv1d(per_sent_conv_in))
    # max over all character positions -> one vector per word
    per_sent_embed = per_sent_conv_out.max(dim=2).values
    assert torch.equal(batch_emb_max, per_sent_embed) and torch.equal(batch_emb_pool, per_sent_embed)

#### Computing embeddings. Using torch.max() instead of nn.MaxPool1d

input shape = torch.Size([4, 7, 15])
char embeded input 4d shape = torch.Size([4, 7, 15, 5])
char embeded input 3d shape = torch.Size([28, 15, 5])
conv1d input shape = torch.Size([28, 5, 15])
conv1d output shape = torch.Size([28, 5, 12])
maxpool out shape = torch.Size([28, 5])
maxpool out final shape = torch.Size([4, 7, 5])



#### Computing embeddings. Using nn.MaxPool1d instead of torch.max()

input shape = torch.Size([4, 7, 15])
char embeded input 4d shape = torch.Size([4, 7, 15, 5])
char embeded input 3d shape = torch.Size([28, 15, 5])
conv1d input shape = torch.Size([28, 5, 15])
conv1d output shape = torch.Size([28, 5, 12])
maxpool out shape = torch.Size([28, 5, 1])
maxpool out squeeze shape = torch.Size([28, 5])
maxpool out final shape = torch.Size([4, 7, 5])

