# An illustration of character-based embedding lookup using a convolution layer

In many NLP tasks, words are resolved to embedding vectors via a simple lookup on an embedding layer. The embedding layer may be initialized from pre-trained word vectors such as word2vec, or it may be randomly initialized, in which case the word embeddings are learnt during the training phase.

In this illustration, word embedding vectors are composed from character-level embeddings.
Sentences are first pre-processed so that every sentence has the same number of words and every word has the same number of characters. The process is then as follows:
1. For each word, lookup embeddings for each character of the word
2. Apply 1-D convolution layer to create features
3. Apply max pooling to get concise feature representation, which will serve as the word's embedding

The computed word embeddings can be fed into larger, deeper networks for NLP tasks such as neural machine translation (NMT).

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from ipython_utils import *
import random

## Setup

In [32]:
# constants
BATCH_SIZE = 4
MAX_WORD_LEN = 15  # every word is padded (if shorter) or truncated (if longer) to exactly this many characters

# Miniature character set, sufficient for the purpose of illustration
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

# lookup table from a character to its index, i.e. its position in the alphabet
char2index = dict(zip(alphabet, range(len(alphabet))))

# the two special tokens take the next free indices after the alphabet:
# <pad> is the padding character, <unk> stands for any out-of-alphabet character
PAD_TOKEN = len(char2index)
char2index['<pad>'] = PAD_TOKEN
UNK_TOKEN = len(char2index)
char2index['<unk>'] = UNK_TOKEN

# show the character -> index lookup table
print_h4('Character indices')
print(char2index)

# a handful of example sentences
sents_lang = [
    'To be or not to be',
    'Programming is fun',
    'Light at the end of the tunnel.',
    'Aurora Borealis is a spectacle!',
    'Mercury, Venus, Earth and Mars are rocky planets',
    'Jupiter and Saturn are gas giants',
    'Uranus and Neptune are ice giants',
    'Pluto, Makemake, and Ceres are dwarf planets'
]

# drop leading/trailing whitespace, if any
sents_lang = list(map(str.strip, sents_lang))

# shuffle the sentences in place and take the first BATCH_SIZE of them as the batch
random.shuffle(sents_lang)
sents_lang_batch = sents_lang[:BATCH_SIZE]

# show each selected sentence together with its whitespace-delimited word count
print_h4('Random batch of sentences')
print_table(['Sentence', '# words'], [(sentence, len(sentence.split())) for sentence in sents_lang_batch])

# split every sentence of the batch into its whitespace-delimited words
sents_lang_batch_words = [sent.split() for sent in sents_lang_batch]

# convert each word to a list of character indices; characters missing from char2index map to <unk>
sent_batch = [[[char2index.get(ch, UNK_TOKEN) for ch in word] for word in sent] for sent in sents_lang_batch_words]

# make every sentence max_sent_len words long and every word MAX_WORD_LEN characters long,
# inserting <pad> characters where necessary
max_sent_len = max((len(sent) for sent in sent_batch), default=0)
for sent in sent_batch:
    for w, word in enumerate(sent):
        # truncate longer words first, then right-pad shorter ones with <pad>
        word = word[:MAX_WORD_LEN]
        sent[w] = word + [PAD_TOKEN] * (MAX_WORD_LEN - len(word))

    # pad the sentence with all-<pad> words. Each padding word is built as a fresh list:
    # `[[PAD_TOKEN] * MAX_WORD_LEN] * k` would repeat ONE list object k times, so a later
    # in-place change to one padding word would silently change all of them.
    sent.extend([PAD_TOKEN] * MAX_WORD_LEN for _ in range(max_sent_len - len(sent)))


# pair each row with the sentence that is actually in the batch (not the full sentence list)
print_table(['Sentence', 'Sentence with word indices'], zip(sents_lang_batch, sent_batch))

# sanity check: the batch must now be rectangular (sentences x max_sent_len x MAX_WORD_LEN)
for sent in sent_batch:
    assert len(sent) == max_sent_len
    for word in sent:
        assert len(word) == MAX_WORD_LEN

#### Character indices

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25, 'a': 26, 'b': 27, 'c': 28, 'd': 29, 'e': 30, 'f': 31, 'g': 32, 'h': 33, 'i': 34, 'j': 35, 'k': 36, 'l': 37, 'm': 38, 'n': 39, 'o': 40, 'p': 41, 'q': 42, 'r': 43, 's': 44, 't': 45, 'u': 46, 'v': 47, 'w': 48, 'x': 49, 'y': 50, 'z': 51, '0': 52, '1': 53, '2': 54, '3': 55, '4': 56, '5': 57, '6': 58, '7': 59, '8': 60, '9': 61, '<pad>': 62, '<unk>': 63}


#### Random batch of sentences

|Sentence | # words|
|--|--|
| Pluto, Makemake, and Ceres are dwarf planets | 7 |
| Aurora Borealis is a spectacle! | 5 |
| To be or not to be | 6 |
| Light at the end of the tunnel. | 7 |

|Sentence | Sentence with word indices|
|--|--|
| Pluto, Makemake, and Ceres are dwarf planets | [[15, 37, 46, 45, 40, 63, 62, 62, 62, 62, 62, 62, 62, 62, 62], [12, 26, 36, 30, 38, 26, 36, 30, 63, 62, 62, 62, 62, 62, 62], [26, 39, 29, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [2, 30, 43, 30, 44, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [26, 43, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [29, 48, 26, 43, 31, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [41, 37, 26, 39, 30, 45, 44, 62, 62, 62, 62, 62, 62, 62, 62]] |
| Aurora Borealis is a spectacle! | [[0, 46, 43, 40, 43, 26, 62, 62, 62, 62, 62, 62, 62, 62, 62], [1, 40, 43, 30, 26, 37, 34, 44, 62, 62, 62, 62, 62, 62, 62], [34, 44, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [26, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [44, 41, 30, 28, 45, 26, 28, 37, 30, 63, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62]] |
| To be or not to be | [[19, 40, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [27, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [40, 43, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [39, 40, 45, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 40, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [27, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62]] |
| Light at the end of the tunnel. | [[11, 34, 32, 33, 45, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [26, 45, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 33, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [30, 39, 29, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [40, 31, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 33, 30, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62], [45, 46, 39, 39, 30, 37, 63, 62, 62, 62, 62, 62, 62, 62, 62]] |