# Natural Language Processing in Action

## 2. Word Tokenization

In [44]:
# Example sentence
sentence = "Thomas Jefferson began building Monticello at the age of 26."

In [45]:
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [46]:
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

### One-hot vector を作ってみる

In [47]:
import numpy as np

In [48]:
!poetry add pandas 

The following packages are already present in the pyproject.toml and will be skipped:

  • [36mpandas[0m

If you want to update it to the latest compatible version, you can use `poetry update package`.
If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.

Nothing to add.


In [49]:
import pandas as pd

In [50]:
token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))
num_tokens = len(token_sequence)
vocab_size = len(vocab)

In [51]:
onehot_vectors = np.zeros((num_tokens, vocab_size), int)

In [53]:
for i, word in enumerate(token_sequence):
    onehot_vectors[i, vocab.index(word)] = 1

onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [54]:
df = pd.DataFrame(onehot_vectors, columns=vocab)

In [55]:
df

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


### Bag of Words を作ってみる

In [56]:
sentences = (
"Thomas Jefferson began building Monticello at the age of 26.\n"
"Construction was done mostly by local masons and carpenters.\n"
"He moved into the South Pavilion in 1770.\n"
"Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."
)

In [57]:
corpus = {}

In [58]:
for i, sent in enumerate(sentences.split('\n')):
    corpus[f"sent{i}"] = dict((tok, 1) for tok in sent.split())

In [59]:
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T

In [60]:
df[df.columns[:10]]

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent0,1,1,1,1,1,1,1,1,1,1
sent1,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,1,0,0,0
sent3,0,0,0,0,1,0,0,0,0,0


In [61]:
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.,...,South,Pavilion,in,1770.,Turning,a,neoclassical,masterpiece,Jefferson's,obsession.
sent0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
sent1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,1,0,0,0,...,1,1,1,1,0,0,0,0,0,0
sent3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1


In [62]:
df = df.T

In [63]:
df.sent0.dot(df.sent1)

0

In [64]:
df.sent0.dot(df.sent2)

1

In [65]:
df.sent0.dot(df.sent3)

1

### 正規表現でトークナイズを改善する

In [66]:
import re
pattern = re.compile(r"[-\s.,;!?]+")
tokens = pattern.split(sentence)
tokens = [x for x in tokens if x and x not in "- \t\n.,;!?"]
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

### ライブラリを使ってトークナイズする

各ライブラリの特徴は次の通り。
- spaCy: Accurate , flexible, fast, Python
- Stanford CoreNLP: More accurate, less flexible, fast, depends on Java 8
- NLTK: Standard used by many NLP contests and comparisons, popular, Python

#### spaCyを使ってみる

In [67]:
import spacy
# 英語用のトークナイザ等を読み込む
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
[token for token in doc]

[Thomas, Jefferson, began, building, Monticello, at, the, age, of, 26, .]

#### NLTKを使ってみる

In [68]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"\w+|$[0-9.]+|\S+")
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

## NLTK の Treebank Tokenizer を使ってみる
wasn't のような短縮形を was n't のように分割してくれる。

In [69]:
from nltk.tokenize import TreebankWordTokenizer
sentence2 = """Monticello wasn't designed as UNESCO World Heritage Site untill 1987."""
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence2)

['Monticello',
 'was',
 "n't",
 'designed',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'untill',
 '1987',
 '.']

## n-gramで語彙を広げる

"ice cream" などのように複数単語で成り立っている言葉は、単語でトークナイズしてしまうとそのままでは文書中の意図した意味にならない。こういった言葉をn-gramを使って扱えるようにする。

In [72]:
from nltk.util import ngrams

In [74]:
list(ngrams(tokens, 2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [75]:
list(ngrams(tokens, 3))

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monticello'),
 ('building', 'Monticello', 'at'),
 ('Monticello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

### Stop words

文章の本質的な意味に寄与せず、かつ多くの文書で出てくる言葉をStop Wordsという。
例えば、以下のようなものがある。
- a, an
- the, this
- and, or
- of, on
歴史的に計算負荷をへらすことを目的などにしてStop wordsは除去されることが多いが、わずかながらにも文書の意味に寄与することがある。例えば、以下のような文では、Stop wordsの有無で文意が変わってしまう。
- Mark reported to the CEO.
- Suzanne reported as the CEO to the board.


In [76]:
# NLTKのStop Wordsには次のようなものが含まれている。
import nltk
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words("english")
len(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akitanak/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


179

In [77]:
stop_words[:7]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

## 語彙を正規化する

文章中には、単語の活用等によって異なるスペリングでも同じ意味を持つことがある。このようなものを一つの単語に統一するためのテクニックとして、以下のようなものがある。
- Case Folding
- Stemming
- Lemmatization

### Case Folding
Capitalizatinを修正して単語のスペルを揃える。すべてをlower caseにしてしまうと、upper caseであることに意味があるものが失われてしまうので注意。文頭の単語のみ小文字にする等の工夫が必要。

In [79]:
print(tokens)
normalized_tokens = [x.lower() for x in tokens]
print(normalized_tokens)

['Thomas', 'Jefferson', 'began', 'building', 'Monticello', 'at', 'the', 'age', 'of', '26']
['thomas', 'jefferson', 'began', 'building', 'monticello', 'at', 'the', 'age', 'of', '26']


### Stemming

複数形や所有格等の活用形における共通的な単語の stem を特定する手法である。たとえば、 "house" と "houses"、"house's" を "house" に揃える。スペリングを元に機械的に stem に変換するため、文中の意味を失うこともある。
検索エンジンなどで利用すると、対象となる文書が増えるためrecallは改善するが、その分余計な文書も含まれるようになりprecisionに悪影響を与えることになる。

In [81]:
def stem(words):
    return " ".join([re.findall('^(.*ss|.*?)(s)?$', word)[0][0].strip("'") for word in words.lower().split()])

print(stem("houses"))
print(stem("Doctor House's calls"))

house
doctor house call


ポピュラーなstemmingアルゴリズムとして、Porter stemmer と Snowball stemmer がある。どちらもMartine Porterによって考案されたアルゴリズムで Snowball stemmer は Porter stemmer の拡張版である。
[ここ](https://github.com/jedijulia/porter-stemmer/blob/ master/stemmer.py)にPorter stemmerのpython実装がある。
nltk が Porter stemmer の実装を提供している。

In [87]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
" ".join([stemmer.stem(w).strip("'") for w in "dish washer's washed dished".split()])

'dish washer wash dish'

### Lemmatization

意味的に同じ幹に正規化する手法である。Lemmatizer は Part of Speech(POS)(品詞)も単語と一緒に受け取り処理を行う。Lemmatization は Stemming よりも精度が高くなることが期待できる。

In [91]:
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("good", pos="a"))
print(lemmatizer.lemmatize("goods", pos="a"))
print(lemmatizer.lemmatize("goods", pos="n"))
print(lemmatizer.lemmatize("goodness", pos="n"))
print(lemmatizer.lemmatize("goodness", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))

better
good
good
goods
good
goodness
goodness
best


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akitanak/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 感情分析

感情分析の方法には２つのアプローチがある。
- 人によって構成されたルールベースのアルゴリズム
  - キーワードごとに感情スコアを人間が設定し、文書中に登場するキーワードを元にスコアリングする。
- 予めラベル付けされたデータによる機械学習モデル

### VADER - ルールベースのアルゴリズム

In [95]:
!poetry add vadersentiment

Using version [1m^3.3.2[0m for [36mvaderSentiment[0m

[34mUpdating dependencies[0m
[2K[34mResolving dependencies...[0m [39;2m(1.1s)[0m

[34mWriting lock file[0m

[1mPackage operations[0m: [34m1[0m install, [34m0[0m updates, [34m0[0m removals

  [34;1m•[0m [39mInstalling [0m[36mvadersentiment[0m[39m ([0m[39;1m3.3.2[0m[39m)[0m: [34mPending...[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mvadersentiment[0m[39m ([0m[39;1m3.3.2[0m[39m)[0m: [34mDownloading...[0m [1m0%[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mvadersentiment[0m[39m ([0m[39;1m3.3.2[0m[39m)[0m: [34mDownloading...[0m [1m100%[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mvadersentiment[0m[39m ([0m[39;1m3.3.2[0m[39m)[0m: [34mDownloading...[0m [1m100%[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mvadersentiment[0m[39m ([0m[39;1m3.3.2[0m[39m)[0m: [34mInstalling...[0m
[1A[0J  [32;1m•[0m [39mInstalling [0m[36mvadersentiment[0m

In [97]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [101]:
sa.polarity_scores(text="Python is very readable and it's great for NLP.")

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

In [102]:
sa.polarity_scores(text="Python is not a bad choice for most application.")

{'neg': 0.0, 'neu': 0.737, 'pos': 0.263, 'compound': 0.431}

In [105]:
corpus = [
    "Absolutely perfect! Love it! :-) :-) :-)",
    "Horrible! Completely useless. :(",
    "It was OK. Some good and some bad things."
]

for doc in corpus:
    scores = sa.polarity_scores(doc)
    print(f"{scores['compound']}: {doc}")

0.9428: Absolutely perfect! Love it! :-) :-) :-)
-0.8768: Horrible! Completely useless. :(
-0.1531: It was OK. Some good and some bad things.


### Naive Bayes

Naive Bayesモデルは対象のドキュメントの集合から目的変数となるキーワードを見つける。
下記サンプルのデータは[ここ](https://github.com/totalgood/nlpia/raw/master/src/nlpia/data/hutto_ICWSM_2014/movieReviewSnippets_GroundTruth.csv.gz)からダウンロードした。

In [110]:
from pathlib import Path
path = Path("../data/movieReviewSnippets_GroundTruth.csv")

In [111]:
import pandas as pd

In [171]:
movies = pd.read_csv(path)

In [172]:
movies.head().round(2)

Unnamed: 0,id,sentiment,text
0,1,2.27,The Rock is destined to be the 21st Century's ...
1,2,3.53,The gorgeously elaborate continuation of ''The...
2,3,-0.6,Effective but too tepid biopic
3,4,1.47,If you sometimes like to go to the movies to h...
4,5,1.73,"Emerges as something rare, an issue movie that..."


In [173]:
movies.describe().round(2)

Unnamed: 0,id,sentiment
count,10605.0,10605.0
mean,5303.0,0.0
std,3061.54,1.92
min,1.0,-3.88
25%,2652.0,-1.77
50%,5303.0,-0.08
75%,7954.0,1.83
max,10605.0,3.94


In [174]:
from nltk.tokenize import casual_tokenize
from collections import Counter

pd.set_option('display.width', 75)
bags_of_words = [Counter(casual_tokenize(text)) for text in movies.text]
df_bows = pd.DataFrame.from_records(bags_of_words)
df_bows = df_bows.fillna(0).astype(int)
df_bows.shape

(10605, 20756)

In [119]:
df_bows.head()

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
# case folding で normalization
bags_of_words_w_cf = [Counter(casual_tokenize(text.lower())) for text in movies.text]
df_bows_cf = pd.DataFrame.from_records(bags_of_words_w_cf)
df_bows_cf = df_bows_cf.fillna(0).astype(int)

In [143]:
df_bows_cf.shape

(10605, 18541)

In [144]:
df_bows_cf.head()

Unnamed: 0,the,rock,is,destined,to,be,21st,century's,new,',...,drudgery,snubbing,degenerates,hogwash,slummer,rashomon,dipsticks,’,ve,muttering
0,2,1,1,1,2,1,1,1,1,4,...,0,0,0,0,0,0,0,0,0,0
1,3,0,1,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [145]:
# stemming で normalization
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stemming(word):
    return stemmer.stem(word).strip("'")

bags_of_words_w_stemmer = [Counter([stemming(word) for word in casual_tokenize(text)]) for text in movies.text]
df_bows_stemmer = pd.DataFrame.from_records(bags_of_words_w_stemmer)
df_bows_stemmer = df_bows_stemmer.fillna(0).astype(int)

In [146]:
df_bows_stemmer.shape

(10605, 12527)

In [147]:
df_bows_stemmer.head()

Unnamed: 0,the,rock,is,destin,to,be,21st,century,new,Unnamed: 10,...,ame,drudgeri,snub,hogwash,slummer,rashomon,dipstick,’,ve,mutter
0,2,1,1,1,2,1,1,1,1,4,...,0,0,0,0,0,0,0,0,0,0
1,3,0,1,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [149]:
# stop words を使って次元を削減する
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words("english")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akitanak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [153]:
bags_of_words_stopwords = [Counter([stemming(word) for word in casual_tokenize(text) if word not in stop_words]) for text in movies.text]
df_bows_stopwords = pd.DataFrame.from_records(bags_of_words_stopwords)
df_bows_stopwords = df_bows_stopwords.fillna(0).astype(int)

In [154]:
df_bows_stopwords.shape

(10605, 12505)

In [158]:
# Naive Bayes を使って、sentimentを推論してみる
!poetry add scikit-learn

Using version [1m^0.24.2[0m for [36mscikit-learn[0m

[34mUpdating dependencies[0m
[2K[34mResolving dependencies...[0m [39;2m(0.6s)[0m

[34mWriting lock file[0m

[1mPackage operations[0m: [34m3[0m installs, [34m0[0m updates, [34m0[0m removals

  [34;1m•[0m [39mInstalling [0m[36mscipy[0m[39m ([0m[39;1m1.6.1[0m[39m)[0m: [34mPending...[0m
  [34;1m•[0m [39mInstalling [0m[36mthreadpoolctl[0m[39m ([0m[39;1m2.1.0[0m[39m)[0m: [34mPending...[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mthreadpoolctl[0m[39m ([0m[39;1m2.1.0[0m[39m)[0m: [34mInstalling...[0m
[2A[0J  [34;1m•[0m [39mInstalling [0m[36mthreadpoolctl[0m[39m ([0m[39;1m2.1.0[0m[39m)[0m: [34mInstalling...[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mscipy[0m[39m ([0m[39;1m1.6.1[0m[39m)[0m: [34mInstalling...[0m
  [34;1m•[0m [39mInstalling [0m[36mthreadpoolctl[0m[39m ([0m[39;1m2.1.0[0m[39m)[0m: [34mInstalling...[0m
[1A[0J  [32;1m•[

In [199]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb = nb.fit(df_bows, movies.sentiment > 0)
movies["predicted_sentiment"] = (nb.predict_proba(df_bows) * 8 - 4)[:, 1]
movies["error"] = (movies.predicted_sentiment - movies.sentiment).abs()
movies.error.mean().round(1)

1.9

In [201]:
movies["sentiment_ispositive"] = (movies.sentiment > 0).astype(int)
movies["predicted_ispositive"] = (movies.predicted_sentiment > 0).astype(int)
movies["sentiment predicted_sentiment sentiment_ispositive predicted_ispositive".split()].head(8)

Unnamed: 0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
0,2.266667,2.511515,1,1
1,3.533333,3.999904,1,1
2,-0.6,-3.655976,0,0
3,1.466667,1.940954,1,1
4,1.733333,3.910373,1,1
5,2.533333,3.995188,1,1
6,2.466667,3.960466,1,1
7,1.266667,-1.918701,1,0


In [202]:
(movies.predicted_ispositive == movies.sentiment_ispositive).sum() / len(movies)

0.9344648750589345