In [214]:
import re
import numpy as np
import pandas as pd

from collections import Counter

from nltk.tokenize import casual_tokenize, RegexpTokenizer, TreebankWordTokenizer
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.naive_bayes import MultinomialNB

from nlpia.data.loaders import get_data

## Basic tokenization of text

In [3]:
sentence = 'Thomas Jefferson began building Monticello at the age of 26.'

In [7]:
# \W will match one non-word character.
re.split('\W+', sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

## OneHot Vectors

In [10]:
token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))
', '.join(vocab)

'26., Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [17]:
num_tokens = len(token_sequence)
vocab_size = len(vocab)
onehot_vectors = np.zeros((num_tokens,
                           vocab_size), int)

for i, word in enumerate(token_sequence):
    onehot_vectors[i, vocab.index(word)] = 1
print(sentence)
print(', '.join(vocab))
onehot_vectors

Thomas Jefferson began building Monticello at the age of 26.
26., Jefferson, Monticello, Thomas, age, at, began, building, of, the


array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [21]:
df = pd.DataFrame(onehot_vectors, columns=vocab)
df[df == 0] = ''
df

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,,,,1.0,,,,,,
1,,1.0,,,,,,,,
2,,,,,,,1.0,,,
3,,,,,,,,1.0,,
4,,,1.0,,,,,,,
5,,,,,,1.0,,,,
6,,,,,,,,,,1.0
7,,,,,1.0,,,,,
8,,,,,,,,,1.0,
9,1.0,,,,,,,,,


## Reducing storage with dict

In [25]:
sentence_bow = {}
for token in sentence.split(' '):
    sentence_bow[token] = 1
sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [31]:
series = pd.Series(dict([(token, 1) 
                         for token in sentence.split(' ')]))
df = pd.DataFrame(series, columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [36]:
sentence = '''Thomas Jefferson began building Monticello at the age of 26.
Construction was done mostly by local masons and carpenters.
He moved into South Pavilion in 1770.
Turning Monticello into a neoclassical masterpiece was Jefferson's obsession.
'''
corpus = {}
for i, sentence in enumerate(sentence.strip().split('\n')):
    corpus[f'sent{i}'] = dict([(token, 1)
                               for token in sentence.strip().split(' ')])
df = pd.DataFrame.from_records(corpus).fillna(0).astype(np.int64).T
df

Unnamed: 0,1770.,26.,Construction,He,Jefferson,Jefferson's,Monticello,Pavilion,South,Thomas,...,local,masons,masterpiece,mostly,moved,neoclassical,obsession.,of,the,was
sent0,0,1,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,1,1,0
sent1,0,0,1,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,1
sent2,1,0,0,1,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
sent3,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,1,1,0,0,1


## Dot Product

In [43]:
v = np.array([1,2,3])
w = np.array([2,3,4])

v.dot(w), v @ w, (v * w).sum()

(20, 20, 20)

## Measuring bag-of-words overlap.

In [50]:
df = df.T

In [64]:
df.sent0 @ df.sent3 # One word is used for both sent0 and sent3.

1

In [69]:
[(k, v) 
 for k, v in (df.sent0 & df.sent3).items() 
 if v == 1]

[('Monticello', 1)]

## NLTK RegexpTokenizer

In [74]:
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Turning',
 'Monticello',
 'into',
 'a',
 'neoclassical',
 'masterpiece',
 'was',
 'Jefferson',
 "'s",
 'obsession',
 '.']

In [79]:
# Contains rules for English contractions (don't -> [do, n't]).
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Turning',
 'Monticello',
 'into',
 'a',
 'neoclassical',
 'masterpiece',
 'was',
 'Jefferson',
 "'s",
 'obsession',
 '.']

In [81]:
tokenizer.tokenize("Monticello wasn't designated as UNESCO World Heritage Site until 1987.")

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

In [94]:
sentence = 'Thomas Jefferson began building Monticello at the age of 26.'
# pattern = re.compile('\W+')
pattern = re.compile(r'([-\s.,;!?])+')
tokens = pattern.split(sentence)
tokens = [word for word in tokens if word and word not in '- \t\n.,;!?']
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [95]:
bigrams = list(ngrams(tokens, 2))
bigrams = [' '.join(pair) for pair in bigrams]
bigrams

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [96]:
trigrams = list(ngrams(tokens, 3))
trigrams = [' '.join(triplets) for triplets in trigrams]
trigrams

['Thomas Jefferson began',
 'Jefferson began building',
 'began building Monticello',
 'building Monticello at',
 'Monticello at the',
 'at the age',
 'the age of',
 'age of 26']

## Stopwords

In [120]:
stop_words = frozenset(stopwords.words('english'))
list(stop_words)[:5], len(stop_words)

(['he', 'here', 'to', 'her', 'where'], 179)

In [105]:
len(sklearn_stop_words)

318

In [121]:
list(sklearn_stop_words - stop_words)[:10]

['twelve',
 'much',
 'everything',
 'thick',
 'whenever',
 'whose',
 'anyone',
 'system',
 'beside',
 'nine']

In [125]:
print('union:', len(sklearn_stop_words & stop_words))
print('intersection:', len(sklearn_stop_words - stop_words))

union: 119
intersection: 199


## Stemming

In [129]:
stemmer = PorterStemmer()
sentence = "dish washer's washed dishes"
[stemmer.stem(word) for word in sentence.split(' ')]

['dish', "washer'", 'wash', 'dish']

## Lemmatizing

In [141]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('better')

'better'

In [140]:
lemmatizer.lemmatize('better', pos='a') # pos='a' indicates the adjective part of speech.

'good'

In [137]:
lemmatizer.lemmatize('goods', pos='n') # pos='a' indicates the adjective part of speech.

'good'

In [138]:
lemmatizer.lemmatize('best', pos='a') # pos='a' indicates the adjective part of speech.

'best'

## Sentiment Analysis

In [146]:
sa = SentimentIntensityAnalyzer()
list(sa.lexicon.items())[:5]

[('$:', -1.5), ('%)', -0.4), ('%-)', -1.5), ('&-:', -0.4), ('&:', -0.7)]

In [179]:
emojis = [':)', # Smile
          ':('] # Sad...

for emoji in emojis:
    score = sa.polarity_scores(emoji)['compound']
    print(f'{score:+.3f} - {emoji}')

+0.459 - :)
-0.440 - :(


In [180]:
corpus = ['Python is a very readable language and it is great for NLP!',
          'Python sucks!',
          'Python is not bad']

for doc in corpus:
    score = sa.polarity_scores(doc)['compound']
    print(f'{score:+.3f} - {doc}')

+0.659 - Python is a very readable language and it is great for NLP!
-0.420 - Python sucks!
+0.431 - Python is not bad


## Naive Bayes

In [201]:
movies = get_data('hutto_movies')
movies.head().round(2)

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.27,The Rock is destined to be the 21st Century's ...
2,3.53,The gorgeously elaborate continuation of ''The...
3,-0.6,Effective but too tepid biopic
4,1.47,If you sometimes like to go to the movies to h...
5,1.73,"Emerges as something rare, an issue movie that..."


In [202]:
movies.describe().round(2)

Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


In [206]:
pd.set_option('display.width', 75)
bag_of_words = []

for text in movies.text:
    bag_of_words.append(Counter(casual_tokenize(text)))

In [212]:
df_bows = pd.DataFrame.from_records(bag_of_words)
df_bows = df_bows.fillna(0).astype(np.int64)
df_bows.shape

(10605, 20756)

In [213]:
df_bows.head()

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,zips,zombie,zombies,zone,zoning,zzzzzzzzz,½,élan,–,’
0,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [215]:
nb = MultinomialNB()
nb.fit(df_bows, movies.sentiment > 0)

# Convert the binary classification variable (0 or 1) to -4 or 4 so you compare to the ground truth.
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4

In [216]:
movies['error'] = (movies.predicted_sentiment - movies.sentiment).abs()
movies.error.mean().round(1)

2.4

In [218]:
movies['sentiment_ispositive'] = (movies.sentiment > 0).astype(np.int64)
movies['predicted_ispositive'] = (movies.predicted_sentiment > 0).astype(np.int64)
movies[['sentiment', 'predicted_sentiment', 'sentiment_ispositive', 'predicted_ispositive']].head()

Unnamed: 0_level_0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.266667,4,1,1
2,3.533333,4,1,1
3,-0.6,-4,0,0
4,1.466667,4,1,1
5,1.733333,4,1,1


In [220]:
(movies['predicted_ispositive'] == movies['sentiment_ispositive']).sum() / len(movies)

0.9344648750589345