# Natural Language Processing - Tokenizers

## Imports 
There are many tokenizers out there. In this notebook we'll try just a few.

In [None]:
# General imports
import torchtext
import nltk
import numpy as np
import torch

# Tokenizer
from nltk.tokenize import word_tokenize
# word_tokenize depends on the Punkt tokenizer models, which ship as a separate
# NLTK data package just like the corpora downloaded further down in this cell.
# Older NLTK releases look for 'punkt', newer ones (3.9+) for 'punkt_tab', so
# request both to keep the notebook runnable on a fresh environment.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

from torchtext.data import get_tokenizer

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

# Punctuation
import string

# Lemmatize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet', quiet=True)

# Stem
from nltk.stem import SnowballStemmer

# Word2vec
import gensim.downloader

print('Libraries imported and NLTK data downloaded.')

## Load Data
For the dataset example, we'll be using some lyrics from Jimi Hendrix's song The Wind Cries Mary.

In [2]:
# Read the lyrics into a list with one entry per line (trailing newlines are kept).
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('the_wind_cries_mary.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [3]:
lines

['After all the jacks are in their boxes\n',
 'And the clowns have all gone to bed\n',
 'You can hear happiness staggering on down the street\n',
 'Footprints dressed in red\n',
 'And the wind whispers Mary \n',
 'A broom is drearily sweeping\n',
 'Up the broken pieces of yesterdays life\n',
 'Somewhere a queen is weeping\n',
 'Somewhere a king has no wife\n',
 'And the wind, it cries Mary \n',
 'The traffic lights they all true blue tomorrow\n',
 'And shine their emptiness down on my bed\n',
 'The tiny island sags downstream\n',
 'Cause the life that lived is, is dead\n',
 'And the wind screams Mary \n',
 'Will the wind ever remember\n',
 'The names it has blown in the past\n',
 "And with his crutch, its old age, and it's wisdom\n",
 'It whispers no, this will be the last\n',
 'And the wind cries Mary \n']

In [4]:
sample_lines = lines[5:10]

In [5]:
sample_lines

['A broom is drearily sweeping\n',
 'Up the broken pieces of yesterdays life\n',
 'Somewhere a queen is weeping\n',
 'Somewhere a king has no wife\n',
 'And the wind, it cries Mary \n']

## Tokenize

### Tokenize with NLTK
https://www.nltk.org/api/nltk.tokenize.html

In [6]:
sample_lines_tokenized = [word_tokenize(line) for line in sample_lines]
sample_lines_tokenized

[['A', 'broom', 'is', 'drearily', 'sweeping'],
 ['Up', 'the', 'broken', 'pieces', 'of', 'yesterdays', 'life'],
 ['Somewhere', 'a', 'queen', 'is', 'weeping'],
 ['Somewhere', 'a', 'king', 'has', 'no', 'wife'],
 ['And', 'the', 'wind', ',', 'it', 'cries', 'Mary']]

### Tokenize with PyTorch (torchtext)
https://pytorch.org/text/stable/_modules/torchtext/data/utils.html#get_tokenizer

In [7]:
pytorch_tokenizer = get_tokenizer("basic_english")
pytorch_tokens = [pytorch_tokenizer(line) for line in sample_lines]
pytorch_tokens

[['a', 'broom', 'is', 'drearily', 'sweeping'],
 ['up', 'the', 'broken', 'pieces', 'of', 'yesterdays', 'life'],
 ['somewhere', 'a', 'queen', 'is', 'weeping'],
 ['somewhere', 'a', 'king', 'has', 'no', 'wife'],
 ['and', 'the', 'wind', ',', 'it', 'cries', 'mary']]

In [8]:
# NLTK's Toktok tokenizer, accessed through torchtext
pytorch_tokenizer_toktok = get_tokenizer('toktok')
pytorch_tokens_toktok = [pytorch_tokenizer_toktok(line) for line in sample_lines]
pytorch_tokens_toktok

[['A', 'broom', 'is', 'drearily', 'sweeping'],
 ['Up', 'the', 'broken', 'pieces', 'of', 'yesterdays', 'life'],
 ['Somewhere', 'a', 'queen', 'is', 'weeping'],
 ['Somewhere', 'a', 'king', 'has', 'no', 'wife'],
 ['And', 'the', 'wind', ',', 'it', 'cries', 'Mary']]

### Tokenize with spaCy
https://spacy.io/api/tokenizer

In [9]:
nlp = English()
spacy_tokenizer = Tokenizer(nlp.vocab)
spacy_tokens = [spacy_tokenizer(line) for line in sample_lines]
spacy_tokens

[A broom is drearily sweeping,
 Up the broken pieces of yesterdays life,
 Somewhere a queen is weeping,
 Somewhere a king has no wife,
 And the wind, it cries Mary ]

In [10]:
# spaCy returns a Doc object, not just a list of words. See more here: https://spacy.io/api/doc
type(spacy_tokens[0])

spacy.tokens.doc.Doc

In [11]:
[[token.text for token in line] for line in spacy_tokens]

[['A', 'broom', 'is', 'drearily', 'sweeping', '\n'],
 ['Up', 'the', 'broken', 'pieces', 'of', 'yesterdays', 'life', '\n'],
 ['Somewhere', 'a', 'queen', 'is', 'weeping', '\n'],
 ['Somewhere', 'a', 'king', 'has', 'no', 'wife', '\n'],
 ['And', 'the', 'wind,', 'it', 'cries', 'Mary', '\n']]

## Stopwords

In [12]:
# View stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
# Find all stopwords and highlight with "~"
# Build the lookup set once so the stopword list is not rebuilt and scanned
# linearly for every single token.
english_stopwords = set(stopwords.words('english'))
mod_tokens = [
    # Matching is case-insensitive, but the token keeps its original casing
    [f'~{token}~' if token.lower() in english_stopwords else token for token in line]
    for line in sample_lines_tokenized
]
mod_tokens
            

[['~A~', 'broom', '~is~', 'drearily', 'sweeping'],
 ['~Up~', '~the~', 'broken', 'pieces', '~of~', 'yesterdays', 'life'],
 ['Somewhere', '~a~', 'queen', '~is~', 'weeping'],
 ['Somewhere', '~a~', 'king', '~has~', '~no~', 'wife'],
 ['~And~', '~the~', 'wind', ',', '~it~', 'cries', 'Mary']]

In [14]:
def remove_stopwords(input_text):
    """Return the tokens of ``input_text`` that are not English stopwords.

    Matching is case-insensitive: each token is lower-cased before the lookup.
    Surviving tokens are returned unchanged and in their original order.

    Args:
        input_text: Iterable of string tokens (e.g. one tokenized line).

    Returns:
        list: Tokens whose lower-cased form is not in NLTK's English stopword list.
    """
    # Fetch the stopword list once per call and use a set for constant-time
    # membership checks instead of re-reading and scanning the list per token.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in input_text if token.lower() not in english_stopwords]

In [15]:
tokens_without_stopwords = [remove_stopwords(line) for line in sample_lines_tokenized]
tokens_without_stopwords

[['broom', 'drearily', 'sweeping'],
 ['broken', 'pieces', 'yesterdays', 'life'],
 ['Somewhere', 'queen', 'weeping'],
 ['Somewhere', 'king', 'wife'],
 ['wind', ',', 'cries', 'Mary']]

## Remove punctuation

In [16]:
punctuation_characters = set(string.punctuation)
list(punctuation_characters)[:5]

[')', '(', '>', '|', '!']

In [17]:
def remove_punctuation(input_text):
    """Return the tokens of ``input_text`` that are not a punctuation character.

    A token is dropped only when it is exactly one character from
    ``string.punctuation``. Tokens that merely contain punctuation
    (``'wind,'``) or are made of several punctuation characters (``'...'``)
    are kept as-is.

    Args:
        input_text: Iterable of string tokens (e.g. one tokenized line).

    Returns:
        list: The remaining tokens, in their original order.
    """
    # Build the lookup set once per call rather than once per token
    punctuation = frozenset(string.punctuation)
    return [token for token in input_text if token not in punctuation]

In [18]:
tokens_without_punctuation = [remove_punctuation(line) for line in tokens_without_stopwords]
tokens_without_punctuation

[['broom', 'drearily', 'sweeping'],
 ['broken', 'pieces', 'yesterdays', 'life'],
 ['Somewhere', 'queen', 'weeping'],
 ['Somewhere', 'king', 'wife'],
 ['wind', 'cries', 'Mary']]

## Lemmatize
https://www.nltk.org/api/nltk.stem.wordnet.html

In [19]:
lem = WordNetLemmatizer()
lem.lemmatize('cries')

'cry'

In [20]:
def lemmatize(input_text):
    """Lemmatize every token under each WordNet part-of-speech tag in turn.

    The tokens are run through ``WordNetLemmatizer.lemmatize`` once per tag in
    the order 'n', 'v', 'a', 'r', 's'. The output of one pass, lower-cased,
    is the input of the next.

    Args:
        input_text: Iterable of string tokens.

    Returns:
        list: The lower-cased tokens after all passes.
    """
    wordnet = WordNetLemmatizer()
    tokens = input_text
    for pos_tag in ('n', 'v', 'a', 'r', 's'):
        # Each pass consumes the result of the previous one
        reduced = []
        for word in tokens:
            reduced.append(wordnet.lemmatize(word, pos_tag).lower())
        tokens = reduced
    return tokens

In [21]:
tokens_lemmatized = [lemmatize(line) for line in tokens_without_punctuation]
tokens_lemmatized

[['broom', 'drearily', 'sweep'],
 ['break', 'piece', 'yesterday', 'life'],
 ['somewhere', 'queen', 'weep'],
 ['somewhere', 'king', 'wife'],
 ['wind', 'cry', 'mary']]

## Stem
https://www.nltk.org/api/nltk.stem.html

In [22]:
stemmer = SnowballStemmer('english')
stemmer.stem('sweeping')

'sweep'

In [23]:
def stem(input_text):
    """Stem each token with the English Snowball stemmer.

    Args:
        input_text: Iterable of string tokens.

    Returns:
        list: One stemmed token per input token, in the same order.
    """
    snowball = SnowballStemmer('english')
    return list(map(snowball.stem, input_text))

In [24]:
tokens_stemmed = [stem(line) for line in tokens_without_punctuation]
tokens_stemmed

[['broom', 'drearili', 'sweep'],
 ['broken', 'piec', 'yesterday', 'life'],
 ['somewher', 'queen', 'weep'],
 ['somewher', 'king', 'wife'],
 ['wind', 'cri', 'mari']]

## Put it all together

In [25]:
def clean_list_of_text(
        input_text, 
        enable_stopword_removal=True,
        enable_punctuation_removal=True,
        enable_lemmatization=True,
        enable_stemming=False
    ):
    """Tokenize each line and run it through the enabled clean-up steps.

    Every line is always tokenized with ``word_tokenize``. The optional steps
    then run in a fixed order: stopword removal, punctuation removal,
    lemmatization, stemming. The number of steps in use (tokenization
    included) is printed before processing starts.

    Args:
        input_text: Iterable of raw text lines.
        enable_stopword_removal: Apply ``remove_stopwords`` when truthy.
        enable_punctuation_removal: Apply ``remove_punctuation`` when truthy.
        enable_lemmatization: Apply ``lemmatize`` when truthy.
        enable_stemming: Apply ``stem`` when truthy.

    Returns:
        list: One list of processed tokens per input line.
    """
    # Tokenization is mandatory; the remaining steps are opt-in
    pipeline = [word_tokenize]
    if enable_stopword_removal:
        pipeline += [remove_stopwords]
    if enable_punctuation_removal:
        pipeline += [remove_punctuation]
    if enable_lemmatization:
        pipeline += [lemmatize]
    if enable_stemming:
        pipeline += [stem]
    print(f'Enabled Operations: {len(pipeline)}')

    # Apply one step to every line before moving on to the next step
    processed_lines = input_text
    for step in pipeline:
        processed_lines = list(map(step, processed_lines))

    return processed_lines

In [26]:
clean_list_of_text(sample_lines, enable_stopword_removal=True, enable_punctuation_removal=True, enable_lemmatization=True)

Enabled Operations: 4


[['broom', 'drearily', 'sweep'],
 ['break', 'piece', 'yesterday', 'life'],
 ['somewhere', 'queen', 'weep'],
 ['somewhere', 'king', 'wife'],
 ['wind', 'cry', 'mary']]

## Word2vec and GloVe Embeddings
https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/word2vec.py

In [27]:
# Load pretrained gensim model
glove_model = gensim.downloader.load("glove-wiki-gigaword-100") 

In [28]:
# Show default most similar words given a word
glove_model.most_similar('queen')

[('princess', 0.7947244644165039),
 ('king', 0.7507690191268921),
 ('elizabeth', 0.7355712056159973),
 ('royal', 0.7065026164054871),
 ('lady', 0.7044796943664551),
 ('victoria', 0.6853757500648499),
 ('monarch', 0.6683257222175598),
 ('crown', 0.6680562496185303),
 ('prince', 0.6640505194664001),
 ('consort', 0.6570538282394409)]

In [29]:
# Show embed size
glove_model['broom'].size

100

In [30]:
np.set_printoptions(suppress=True)

# Sample of embedding vector for a word
glove_model['broom']

array([ 0.20009   ,  0.32409   , -0.23066   , -0.61079   , -0.42757   ,
        0.0020605 , -0.45512   ,  0.56479   , -0.55531   , -0.25579   ,
       -0.72523   ,  0.55213   , -0.19549   ,  0.96065   , -0.55447   ,
        0.68811   ,  0.039949  ,  0.47085   , -0.45799   , -0.74935   ,
       -0.39437   ,  0.25289   ,  1.0068    , -0.66637   ,  0.63259   ,
        1.0547    , -0.14611   ,  0.35851   , -0.25193   , -0.023974  ,
        0.26526   ,  0.056152  ,  0.27812   ,  0.14538   ,  0.12781   ,
        0.30503   ,  0.024989  , -0.47947   ,  1.2966    ,  0.032496  ,
       -0.25516   , -0.39946   ,  0.22301   , -0.74436   , -0.46208   ,
        0.20526   ,  0.14991   , -0.36987   ,  0.27937   , -0.014941  ,
       -0.85951   , -0.24261   , -0.33566   ,  0.71803   , -0.86      ,
       -0.60147   , -0.84878   ,  0.12036   ,  0.21095   , -0.54984   ,
        0.32912   , -0.24656   ,  0.13614   ,  0.52457   ,  0.31397   ,
       -0.26055   ,  0.29517   , -0.15317   ,  0.33613   , -0.06

In [32]:
# Clean the sample lines, then look up the embedding of every remaining token
text_to_convert = clean_list_of_text(
    sample_lines,
    enable_stopword_removal=True,
    enable_punctuation_removal=True,
    enable_lemmatization=True,
)
vectors = [
    [glove_model[word] for word in cleaned_line]
    for cleaned_line in text_to_convert
]

Enabled Operations: 4


In [33]:
# Turn each line's embeddings into a tensor, then pad all lines to a common length,
# giving one tensor of shape (num_examples, sequence_length, embed_dim)
line_tensors = [torch.FloatTensor(line_vectors) for line_vectors in vectors]
torch_padded_tensor = torch.nn.utils.rnn.pad_sequence(line_tensors, batch_first=True)
torch_padded_tensor.shape

torch.Size([5, 4, 100])

In [34]:
# Defaults to zero for padded values
torch_padded_tensor[0][3]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])

## PyTorch GloVe

In [35]:
vec = torchtext.vocab.GloVe(name='6B', dim=50)

.vector_cache\glove.6B.zip: 862MB [04:24, 3.26MB/s]                           
100%|█████████▉| 399999/400000 [00:11<00:00, 34874.24it/s]


In [36]:
vec.get_vecs_by_tokens(['broom'], lower_case_backup=True)

tensor([[-0.2070, -0.3793, -0.8425, -0.2447,  0.4185,  0.6902,  0.4083, -0.0413,
          0.7953, -0.4586, -0.4328,  0.7068, -0.4015,  0.4556,  0.0159,  0.4423,
          0.7442,  0.5907, -0.4703, -1.1006,  0.7969,  0.0286,  0.4297,  0.2820,
         -0.1548, -0.5966,  0.2890,  0.5291,  0.6385, -0.0024, -0.3283, -0.6784,
         -1.1181,  0.7954,  0.2005,  0.2453, -0.1501, -0.1612,  0.7281, -0.1558,
          0.1464, -0.3415, -0.1887,  0.6934,  0.8386,  0.1495,  0.6169, -1.2661,
         -0.0847, -0.5917]])