# Building your vocabulary with a tokenizer

### Building one-hot vectors

Let's represent the following text as one-hot vectors.

In [1]:
# Example sentence reused by the cells below.
sentence = "Thomas Jefferson began building Monticello at the age of 26."
# Split on whitespace only; the trailing period stays attached to the last token ('26.').
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [2]:
# Same whitespace split, this time calling the method through the str class.
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [3]:
import numpy as np
# Keep the whitespace-delimited tokens; every later one-hot step starts from this list.
token_sequence = sentence.split()
token_sequence

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [4]:
# Distinct tokens in sorted order.
sorted(set(token_sequence))

['26.',
 'Jefferson',
 'Monticello',
 'Thomas',
 'age',
 'at',
 'began',
 'building',
 'of',
 'the']

In [5]:
# The vocabulary: the sorted list of distinct tokens. A word's position in this
# list is used as its column index in the one-hot matrix.
vocab = sorted(set(token_sequence))   

**Q. What is a vocabulary?**

In [6]:
# Number of tokens in the sentence, and number of distinct words in the vocabulary.
num_tokens, vocab_size = len(token_sequence), len(vocab)
print(num_tokens)
print(vocab_size)

10
10


### Incidence matrix

In [7]:
# One row per token of the sentence, one column per vocabulary word.
onehot_vectors = np.zeros((num_tokens, vocab_size), int)
# Look up each word's column in a dict built once, instead of calling
# vocab.index(word) (a linear scan of the vocabulary) for every token.
column_of = {word: col for col, word in enumerate(vocab)}
for i, word in enumerate(token_sequence):
    # Row i gets a single 1, in the column that belongs to the i-th token.
    onehot_vectors[i, column_of[word]] = 1
onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [8]:
import pandas as pd
# Label each one-hot column with its vocabulary word; rows follow token order in the sentence.
pd.DataFrame(onehot_vectors, columns=vocab)

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


One nice feature of this vector representation of words and tabular representation of documents is that no information is lost. 

What does it mean?

### Building bag-of-words vectors

In [9]:
# Reminder of the sentence that is about to be turned into a bag of words.
print(sentence)

Thomas Jefferson began building Monticello at the age of 26.


In [10]:
# Mark every whitespace-delimited token as present (value 1).
sentence_bow = dict()
for token in sentence.split():
    sentence_bow.setdefault(token, 1)
print(sentence_bow)

{'Thomas': 1, 'Jefferson': 1, 'began': 1, 'building': 1, 'Monticello': 1, 'at': 1, 'the': 1, 'age': 1, 'of': 1, '26.': 1}


There is one concern about the code above: it sets every entry to 1, whereas a bag-of-words (BOW) should record how many times each word occurs.

In [11]:
# Rich display of the same dictionary (keys in insertion order, i.e. sentence order).
sentence_bow

{'Thomas': 1,
 'Jefferson': 1,
 'began': 1,
 'building': 1,
 'Monticello': 1,
 'at': 1,
 'the': 1,
 'age': 1,
 'of': 1,
 '26.': 1}

In [12]:
# Same bag of words with its keys in sorted order.
dict(sorted(sentence_bow.items()))

{'26.': 1,
 'Jefferson': 1,
 'Monticello': 1,
 'Thomas': 1,
 'age': 1,
 'at': 1,
 'began': 1,
 'building': 1,
 'of': 1,
 'the': 1}

In [13]:
# One-row frame: the sorted bag of words supplies the columns, 'sent1' labels the row.
bow_by_word = dict(sorted(sentence_bow.items()))
df = pd.DataFrame.from_dict(bow_by_word, orient='index', columns=['sent1']).transpose()
print(df)

       26.  Jefferson  Monticello  Thomas  age  at  began  building  of  the
sent1    1          1           1       1    1   1      1         1   1    1


Let's compare it with one-hot vector representations

In [14]:
# One-hot table again for comparison: one row per token, versus the single row of the bag of words.
pd.DataFrame(onehot_vectors, columns=vocab)

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


Note that the BOW vector is not necessarily binary. 

Let's use a slightly longer document.

In [15]:
# Four-sentence document; sentences are separated by newlines so they can be split apart later.
sentences = "\n".join([
    "Thomas Jefferson began building Monticello at the age of 26.",
    "Construction was done mostly by local masons and carpenters.",
    "He moved into the South Pavilion in 1770.",
    "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession.",
])
sentences

"Thomas Jefferson began building Monticello at the age of 26.\nConstruction was done mostly by local masons and carpenters.\nHe moved into the South Pavilion in 1770.\nTurning Monticello into a neoclassical masterpiece was Jefferson's obsession."

In [16]:
# One binary bag of words per sentence, keyed 'sent0', 'sent1', ...
corpus = {}
for i, sent in enumerate(sentences.split('\n')):
    corpus['sent' + str(i)] = {tok: 1 for tok in sent.split()}
# Words missing from a sentence come out as NaN; turn them into 0 and put sentences on the rows.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).transpose()
df.head()

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.,...,South,Pavilion,in,1770.,Turning,a,neoclassical,masterpiece,Jefferson's,obsession.
sent0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
sent1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,1,0,0,0,...,1,1,1,1,0,0,0,0,0,0
sent3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1


### Dot product: Measuring bag-of-words overlap

In [18]:
# this type of matrix is called term-document matrix or word-document matrix:
# one row per token, one column per sentence.
# It is rebuilt from `corpus` rather than written as `df = df.T`, so re-running
# this cell cannot flip the matrix back to sentence-by-token and break the
# `df.sent0` lookups in the cells that follow.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int)
df.head(15)

Unnamed: 0,sent0,sent1,sent2,sent3
Thomas,1,0,0,0
Jefferson,1,0,0,0
began,1,0,0,0
building,1,0,0,0
Monticello,1,0,0,1
at,1,0,0,0
the,1,0,1,0
age,1,0,0,0
of,1,0,0,0
26.,1,0,0,0


You can see the overlaps.

In [59]:
# Dot product of two 0/1 columns counts the tokens that sent0 and sent1 share.
df.sent0.dot(df.sent1)

0

In [60]:
# Number of tokens shared by sent0 and sent2.
df.sent0.dot(df.sent2)

1

In [61]:
# Number of tokens shared by sent0 and sent3.
df.sent0.dot(df.sent3)

1

**Q. What does the resulting 1 imply?**

In [19]:
# Show the words used in both sentences
print([(k, v) for (k, v) in (df.sent0 & df.sent1).items() if v])
print([(k, v) for (k, v) in (df.sent0 & df.sent2).items() if v])
print([(k, v) for (k, v) in (df.sent0 & df.sent3).items() if v])

[]
[('the', 1)]
[('Monticello', 1)]


### A token improvement
### Regular expressions

In [20]:
# Baseline for this section: the plain whitespace split, which leaves the period attached to '26.'.
sentence = "Thomas Jefferson began building Monticello at the age of 26."
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [21]:
import re
# Split on runs of whitespace and common punctuation. The sentence ends with '.',
# so the split leaves an empty string as the last element.
tokens = re.split(r'[-\s.,;!?]+', sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

You can compile your regex patterns

In [22]:
# Compile once; the capture group makes split() also return the separator it matched.
pattern = re.compile(r"([-\s.,;!?])+")
separators = '- \t\n.,;!?'
# why this line is necessary?
tokens = [piece for piece in pattern.split(sentence) if piece and piece not in separators]
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

Try out some other tokenizers

In [23]:
from nltk.tokenize import RegexpTokenizer

In [24]:
# Three alternatives, tried in order: a run of word characters, a dollar amount,
# or any run of non-whitespace.
# The dollar sign must be escaped: a bare `$` is the end-of-string anchor, so the
# previous `$[0-9.]+` alternative could never match an amount such as "$3.88".
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

An even better tokenizer is the Treebank Word Tokenizer from the NLTK package. It incorporates a variety of common rules for English word tokenization.

In [26]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.casual import casual_tokenize

Observe the difference among the tokenizers

In [27]:
# Sentence containing a contraction, used to compare how the tokenizers split "wasn't".
sentence_ex = """Monticello wasn't designated as UNESCO World Heritage Site until 1987."""

In [28]:
# Word characters, or a dollar amount, or any run of non-whitespace.
# `\$` is escaped because a bare `$` is the end-of-string anchor and would make
# the dollar-amount alternative impossible to match.
RegexpTokenizer(r'\w+|\$[0-9.]+|\S+').tokenize(sentence_ex)

['Monticello',
 'wasn',
 "'t",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

In [29]:
# The Treebank tokenizer splits the contraction as 'was' + "n't".
TreebankWordTokenizer().tokenize(sentence_ex)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

In [30]:
# Tweet-style text with a handle, elongated words and an emoticon.
message = """RT @TJMonticello Best day everrrrrrr at Monticello. Awesommmmmmeeeeeeee day :*)"""

In [31]:
# Word characters, or a dollar amount, or any run of non-whitespace.
# `\$` is escaped because a bare `$` is the end-of-string anchor and would make
# the dollar-amount alternative impossible to match.
RegexpTokenizer(r'\w+|\$[0-9.]+|\S+').tokenize(message)

['RT',
 '@TJMonticello',
 'Best',
 'day',
 'everrrrrrr',
 'at',
 'Monticello',
 '.',
 'Awesommmmmmeeeeeeee',
 'day',
 ':*)']

In [32]:
# On the tweet, the Treebank tokenizer breaks the handle and the emoticon apart
# and leaves the period attached to 'Monticello.'.
TreebankWordTokenizer().tokenize(message)

['RT',
 '@',
 'TJMonticello',
 'Best',
 'day',
 'everrrrrrr',
 'at',
 'Monticello.',
 'Awesommmmmmeeeeeeee',
 'day',
 ':',
 '*',
 ')']

In [33]:
# casual_tokenize keeps the @handle and the emoticon ':*)' as single tokens.
casual_tokenize(message)

['RT',
 '@TJMonticello',
 'Best',
 'day',
 'everrrrrrr',
 'at',
 'Monticello',
 '.',
 'Awesommmmmmeeeeeeee',
 'day',
 ':*)']

In [34]:
# reduce_len shortens runs of a repeated character to three ('everrr');
# strip_handles removes the @handle token.
casual_tokenize(message, reduce_len=True, strip_handles=True)

['RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '.',
 'Awesommmeee',
 'day',
 ':*)']

In [35]:
from nltk.tokenize import TweetTokenizer
# TweetTokenizer with default settings; the tokens match casual_tokenize(message).
tk = TweetTokenizer()
tk.tokenize(message)

['RT',
 '@TJMonticello',
 'Best',
 'day',
 'everrrrrrr',
 'at',
 'Monticello',
 '.',
 'Awesommmmmmeeeeeeee',
 'day',
 ':*)']

In [36]:
# Same options as the casual_tokenize call with reduce_len/strip_handles: handles dropped, elongations shortened.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
tknzr.tokenize(message)

['RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '.',
 'Awesommmeee',
 'day',
 ':*)']

In [37]:
# List of stop words
import nltk
# Fetch the NLTK stop-word corpus into the local nltk_data directory.
nltk.download('stopwords')
# English stop-word list.
stop_words = nltk.corpus.stopwords.words('english')
# How many stop words the list contains.
len(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yeabinmoon/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


179

In [38]:
# Peek at the first seven stop words.
stop_words[:7]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [39]:
# Stop words that are a single character long.
[sw for sw in stop_words if len(sw) == 1]

['i', 'a', 's', 't', 'd', 'm', 'o', 'y']

## Sentiment Primer

2.3.2 Naive Bayes

In [None]:
!pip install nlpia

In [42]:
from nlpia.data.loaders import get_data

ModuleNotFoundError: No module named 'nlpia'

In [2]:
# Load the 'hutto_movies' dataset through nlpia's get_data loader.
movies = get_data('hutto_movies')

In [3]:
# First rows, with numeric values rounded to two decimals.
movies.head().round(2)

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.27,The Rock is destined to be the 21st Century's ...
2,3.53,The gorgeously elaborate continuation of ''The...
3,-0.6,Effective but too tepid biopic
4,1.47,If you sometimes like to go to the movies to h...
5,1.73,"Emerges as something rare, an issue movie that..."


In [4]:
# Summary statistics of the sentiment score, rounded to two decimals.
movies.describe().round(2)

Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


In [15]:
from collections import Counter 

In [6]:
# Start with an empty collection; one token-count bag per review is added next.
bags_of_words = list()

In [11]:
# Build one Counter of token frequencies per movie review. The list is rebuilt
# from scratch here, so re-running this cell cannot append a second copy of
# every review's counts to an already-filled `bags_of_words`.
bags_of_words = [Counter(casual_tokenize(text)) for text in movies.text]

In [16]:
# One row per review, one column per distinct token; tokens absent from a review count as 0.
df_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)
df_bows.shape

(10605, 20756)

In [17]:
# First five reviews as token-count rows.
df_bows.head()

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Restrict the view to the tokens that occur in the first review.
first_review_tokens = list(bags_of_words[0])
df_bows.head()[first_review_tokens]

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Schwarzenegger,",",Jean,Claud,Van,Damme,or,Steven,Segal,.
0,1,1,1,1,2,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [19]:
from sklearn.naive_bayes import MultinomialNB 
# Fit a multinomial Naive Bayes classifier on the token counts; the target is
# the boolean "sentiment is positive". fit() returns the estimator itself.
nb = MultinomialNB().fit(df_bows, movies.sentiment > 0)

In [20]:
# Map the predicted class to a score: a positive prediction becomes +4 and a
# negative one -4 (1 * 8 - 4 and 0 * 8 - 4).
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
# Absolute gap between that coarse score and the labelled sentiment.
movies['error'] = (movies['predicted_sentiment'] - movies['sentiment']).abs()
round(movies['error'].mean(), 1)

2.4

In [21]:
# 0/1 polarity flags for the labelled and the predicted sentiment.
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)
movies['predicted_ispos'] = movies['predicted_sentiment'].gt(0).astype(int)
comparison_columns = 'sentiment predicted_sentiment sentiment_ispositive predicted_ispos'.split()
movies[comparison_columns].head(8)

Unnamed: 0_level_0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.266667,4,1,1
2,3.533333,4,1,1
3,-0.6,-4,0,0
4,1.466667,4,1,1
5,1.733333,4,1,1
6,2.533333,4,1,1
7,2.466667,4,1,1
8,1.266667,-4,1,0


In [22]:
# Fraction of reviews whose predicted polarity matches the labelled polarity.
# Note: this is measured on the same rows the model was fitted on.
(movies.predicted_ispos == movies.sentiment_ispositive).sum() / len(movies)

0.9344648750589345

In [23]:
# Load the 'hutto_products' dataset through the same loader and preview it.
products = get_data('hutto_products')
products.head()

Unnamed: 0,id,sentiment,text
0,1_1,-0.9,troubleshooting ad-2500 and ad-2600 no picture...
1,1_2,-0.15,"repost from january 13, 2004 with a better fit..."
2,1_3,-0.2,does your apex dvd player only play dvd audio ...
3,1_4,-0.1,or does it play audio and video but scrolling ...
4,1_5,-0.5,before you try to return the player or waste h...


Explain to yourself what the following code does.

In [24]:
# Token counts for every product review, built the same way as for the movie reviews.
bags_of_words = []
for text in products['text']:
    bags_of_words += [Counter(casual_tokenize(text))]
# Tokens absent from a review come out as NaN; store them as integer 0.
df_product_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)

In [25]:
# Stack the movie rows on top of the product rows. The columns become the union
# of both vocabularies; a cell is NaN where a frame had no column for that token.
df_all_bows = pd.concat([df_bows,df_product_bows])
df_all_bows

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,sligtly,owner,81,defectively,warrranty,expire,expired,voids,baghdad,harddisk
0,1.0,1.0,1,1.0,2,1,1,1,1.0,1,...,,,,,,,,,,
1,2.0,0.0,1,0.0,0,0,1,0,0.0,0,...,,,,,,,,,,
2,0.0,0.0,0,0.0,0,0,0,0,0.0,0,...,,,,,,,,,,
3,0.0,0.0,1,0.0,4,0,1,0,0.0,0,...,,,,,,,,,,
4,0.0,0.0,0,0.0,0,0,0,0,0.0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541,,,0,,1,0,1,0,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3542,,,0,,0,0,0,0,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3543,,,0,,0,0,2,0,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3544,,,0,,0,0,0,0,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Combined vocabulary: movie tokens first, then tokens seen only in product reviews.
df_all_bows.columns

Index(['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'Century's',
       'new',
       ...
       'sligtly', 'owner', '81', 'defectively', 'warrranty', 'expire',
       'expired', 'voids', 'baghdad', 'harddisk'],
      dtype='object', length=23302)

In [27]:
# Keep only the product rows (everything after the movie rows) and only the
# columns of the movie vocabulary, in the order the classifier was fitted with.
n_movie_rows = len(movies)
product_rows = df_all_bows.iloc[n_movie_rows:]
df_product_bows = product_rows[df_bows.columns]
df_product_bows.shape

(3546, 20756)

In [28]:
# NaN marks movie-vocabulary tokens that never occur in the product reviews; count them as 0.
df_product_bows = df_product_bows.fillna(0).astype(int)
# Shape of the movie frame, for comparison with the product frame's column count above.
df_bows.shape

(10605, 20756)

In [29]:
# Labelled polarity of each product review as 0/1.
products['ispos'] = products['sentiment'].gt(0).astype(int)
# Polarity predicted by the classifier that was fitted on movie reviews.
products['pred'] = nb.predict(df_product_bows.to_numpy()).astype(int)



In [30]:
# Preview with the new 'ispos' (label) and 'pred' (prediction) columns.
products.head()

Unnamed: 0,id,sentiment,text,ispos,pred
0,1_1,-0.9,troubleshooting ad-2500 and ad-2600 no picture...,0,0
1,1_2,-0.15,"repost from january 13, 2004 with a better fit...",0,0
2,1_3,-0.2,does your apex dvd player only play dvd audio ...,0,0
3,1_4,-0.1,or does it play audio and video but scrolling ...,0,0
4,1_5,-0.5,before you try to return the player or waste h...,0,0


In [31]:
# Fraction of product reviews whose predicted polarity matches the label.
(products.pred == products.ispos).sum() / len(products)

0.5572476029328821

**How can we interpret this result?**

## Smoothing

In [36]:
# Reload the movie reviews and rebuild their token-count matrix from scratch.
movies = get_data('hutto_movies')
bags_of_words = []
for text in movies['text']:
    bags_of_words += [Counter(casual_tokenize(text))]
# Tokens absent from a review come out as NaN; store them as integer 0.
df_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)
df_bows.shape

(10605, 20756)

In [40]:
# Same classifier with a larger smoothing parameter (alpha=20); fit() returns the estimator itself.
nb = MultinomialNB(alpha=20).fit(df_bows, movies.sentiment > 0)
# A positive prediction maps to +4 and a negative one to -4.
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
movies['error'] = (movies['predicted_sentiment'] - movies['sentiment']).abs()
round(movies['error'].mean(), 1)

2.7

In [41]:
# 0/1 polarity flags, then the share of reviews where prediction and label agree.
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)
movies['predicted_ispos'] = movies['predicted_sentiment'].gt(0).astype(int)
(movies['predicted_ispos'] == movies['sentiment_ispositive']).sum() / len(movies)

0.8273455917020274

In [42]:
# Rebuild the token counts for the product reviews.
products = get_data('hutto_products')
bags_of_words = []
for text in products['text']:
    bags_of_words += [Counter(casual_tokenize(text))]
df_product_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)

# Align the product counts with the movie vocabulary the classifier was fitted on:
# stack both frames, keep the product rows and the movie columns, and turn the
# NaN of tokens that never occur in product reviews into 0.
df_all_bows = pd.concat([df_bows, df_product_bows])
n_movie_rows = len(movies)
df_product_bows = df_all_bows.iloc[n_movie_rows:][df_bows.columns]
df_product_bows = df_product_bows.fillna(0).astype(int)
# Labelled vs. predicted polarity, and the share of reviews where they agree.
products['ispos'] = products['sentiment'].gt(0).astype(int)
products['pred'] = nb.predict(df_product_bows.to_numpy()).astype(int)
(products['pred'] == products['ispos']).sum() / len(products)



0.5789622109419064