# Tokenization Basics

In [1]:
# Load IPython's autoreload extension and select mode 2, so that imported
# modules are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

## Install `tfutils`

In [2]:
# Editable install of the local tfutils checkout. %pip (instead of !pip)
# installs into the environment of the running kernel rather than whatever
# `pip` happens to be first on the shell PATH.
# NOTE(review): the path looks like a Colab Google Drive mount -- confirm that
# Drive is mounted before running this cell.
%pip install -e /content/drive/MyDrive/projects/tfutils

Obtaining file:///content/drive/MyDrive/projects/tfutils
Installing collected packages: tfutils
  Running setup.py develop for tfutils
Successfully installed tfutils-0.0.1


## Basic Tokenization


* Think of a good tokenization rule (preferably custom)
* Use stopwords appropriately (make a custom list)
* Spell check!
* Handle contractions like `haven't` -> `have not`
* After tokenization, check:
    * the number of OOV tokens on the test set (see whether an additional rule could convert them into in-vocabulary tokens)
    * which tokens were stripped from the train set (some of them may be tokens we want to keep)
    * the word frequencies: visualize the most common and least common words


In [59]:
def sequences_to_text_tokens(sequences, w2id, zero_token='<PAD>'):
    """Translate sequences of token ids into lists of token strings.

    Args:
        sequences: Collection of id sequences (e.g. a list of lists of ints).
        w2id: Dict mapping each word to its integer id; it is inverted
            internally to look words up by id.
        zero_token: String emitted for every id equal to 0.

    Returns:
        A list holding one list of tokens per input sequence, in the same
        order. An id that is neither 0 nor a value of ``w2id`` yields ``None``.
    """
    index_to_word = {idx: word for word, idx in w2id.items()}

    # Id 0 is never looked up in the vocabulary: it is always rendered as
    # `zero_token`. Every other id goes through a dict lookup that falls back
    # to None when the id is unknown.
    return [
        [
            index_to_word.get(token_id) if token_id != 0 else zero_token
            for token_id in seq
        ]
        for seq in sequences
    ]


In [62]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Training corpus: variations on one phrase with mixed casing and punctuation
sentences = [
    'I love tensorflow',
    'i love tensorflow!!',
    'you love TenSorflow',
    'we all LOVE Tensorflow :)',
]

# Fit the tokenizer on the training corpus; '<OOV>' is the out-of-vocabulary token
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

# Vocabulary lookups in both directions: word -> id and id -> word
w2id = tokenizer.word_index
id2w = {idx: word for word, idx in w2id.items()}

# Encode the training sentences as id sequences, then map the ids back to tokens
sents_token_ids = tokenizer.texts_to_sequences(sentences)
sents_tokens = sequences_to_text_tokens(sents_token_ids, w2id)

# Length of the longest training sequence; used as the pad/truncate length
max_seq_len = max(len(ids) for ids in sents_token_ids)

# Pad and truncate at the end ('post') so every row holds max_seq_len ids
sents_token_ids_padded = pad_sequences(
    sents_token_ids,
    maxlen=max_seq_len,
    padding='post',
    truncating='post',
)


In [63]:
# Show both vocabulary mappings, separated by a dashed rule
divider = '-'*10
print('Word to Index: ', w2id)
print('\n', divider, '\n')
print('Index to Word: ', id2w)

Word to Index:  {'<OOV>': 1, 'love': 2, 'tensorflow': 3, 'i': 4, 'you': 5, 'we': 6, 'all': 7}

 ---------- 

Index to Word:  {1: '<OOV>', 2: 'love', 3: 'tensorflow', 4: 'i', 5: 'you', 6: 'we', 7: 'all'}


In [74]:
# Walk the parallel views (raw text, tokens, ids, padded ids) in lockstep and
# print one labelled line per view for every sentence.
for sent, sent_tokens, sent_token_ids, sent_token_ids_padded in zip(
    sentences,
    sents_tokens,
    sents_token_ids,
    sents_token_ids_padded,
):
    for label, value in (
        ('original: ', sent),
        ('tokenized text: ', sent_tokens),
        ('tokenized ids: ', sent_token_ids),
        ('tokenized ids (trunc+pad): ', sent_token_ids_padded),
    ):
        print(label, value)
    print('-'*10)

original:  I love tensorflow
tokenized text:  ['<OOV>', 'i', '<OOV>', 'love', 'tensorflow']
tokenized ids:  [1, 4, 1, 2, 3]
tokenized ids (trunc+pad):  [1 4 1 2]
----------
original:  i love tensorflow!!
tokenized text:  ['i', '<OOV>', 'tensorflow']
tokenized ids:  [4, 1, 3]
tokenized ids (trunc+pad):  [4 1 3 0]
----------


In [72]:
# Unseen sentences, encoded with the tokenizer fitted on the training corpus.
# NOTE: this cell rebinds sents_token_ids, sents_tokens and
# sents_token_ids_padded, which held the training-set results until now.
new_sents = [
    'Hello, I too love tensorflow!',
    'i like tensorflow',
]
sents_token_ids = tokenizer.texts_to_sequences(new_sents)
sents_tokens = sequences_to_text_tokens(sents_token_ids, w2id)
sents_decoded = tokenizer.sequences_to_texts(sents_token_ids)
# Same target length and 'post' padding/truncation as for the training sequences
sents_token_ids_padded = pad_sequences(
    sents_token_ids,
    maxlen=max_seq_len,
    padding='post',
    truncating='post',
)

In [76]:
# Decode the padded id rows back into text. In the recorded output the
# trailing padding id 0 of the second row is rendered as '<OOV>'.
tokenizer.sequences_to_texts(sents_token_ids_padded)

['<OOV> i <OOV> love', 'i <OOV> tensorflow <OOV>']

In [77]:
# For every new sentence, print the raw text next to its tokens, ids,
# padded/truncated ids and the text decoded from the unpadded ids.
for sent, sent_tokens, sent_token_ids, sent_token_ids_padded, sent_decoded in zip(
    new_sents,
    sents_tokens,
    sents_token_ids,
    sents_token_ids_padded,
    sents_decoded,
):
    for label, value in (
        ('original: ', sent),
        ('tokenized text: ', sent_tokens),
        ('tokenized ids: ', sent_token_ids),
        ('tokenized ids (trunc+pad): ', sent_token_ids_padded),
        ('decoded: ', sent_decoded),
    ):
        print(label, value)
    print('-'*10)

original:  Hello, I too love tensorflow!
tokenized text:  ['<OOV>', 'i', '<OOV>', 'love', 'tensorflow']
tokenized ids:  [1, 4, 1, 2, 3]
tokenized ids (trunc+pad):  [1 4 1 2]
decoded:  <OOV> i <OOV> love tensorflow
----------
original:  i like tensorflow
tokenized text:  ['i', '<OOV>', 'tensorflow']
tokenized ids:  [4, 1, 3]
tokenized ids (trunc+pad):  [4 1 3 0]
decoded:  i <OOV> tensorflow
----------
