## 使用词袋法对示例文本进行特征向量化

In [1]:
# Two short example sentences that serve as the corpus for every cell below.
sent1 = "The cat is walking in the bedroom."
sent2 = "A dog was running across the kitchen."

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words: CountVectorizer learns a vocabulary from the corpus and counts
# how often each vocabulary term occurs in every document.
sentences = [sent1, sent2]
count_vec = CountVectorizer()

# Fit on the two sentences, then print the dense document-term count matrix
# (one row per sentence, one column per vocabulary term).
bow_counts = count_vec.fit_transform(sentences)
print(bow_counts.toarray())

[[0 1 1 0 1 1 0 0 2 1 0]
 [1 0 0 1 0 0 1 1 1 0 1]]


In [3]:
# Print the vocabulary term that each column of the count matrix stands for.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` converts it back to a plain list of str and the printed output
# keeps the same format as before.
print(count_vec.get_feature_names_out().tolist())

['across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running', 'the', 'walking', 'was']


## 使用 NLTK 对示例文本进行语言学分析

In [6]:
import nltk

# Split the first sentence into tokens. As the printed result shows, the
# original casing is kept and the final period becomes a token of its own.
tokens_1 = nltk.word_tokenize(sent1)
print(tokens_1)

['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']


In [7]:
# Tokenize the second sentence with NLTK's word tokenizer.
tokens_2 = nltk.word_tokenize(sent2)
print(tokens_2)

['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen', '.']


In [8]:
# Contractions are split as well: "I'm" is tokenized into "I" and "'m".
tokens_3 = nltk.word_tokenize("I'm a boy.")
print(tokens_3)

['I', "'m", 'a', 'boy', '.']


In [9]:
# Build the vocabulary of the first sentence: drop duplicate tokens, then sort
# by code point (punctuation first, then uppercase, then lowercase).
vocab_1 = list(set(tokens_1))
vocab_1.sort()

print(vocab_1)

['.', 'The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']


In [10]:
# Build the vocabulary of the second sentence: unique tokens sorted by code
# point (punctuation first, then uppercase, then lowercase).
vocab_2 = list(set(tokens_2))
vocab_2.sort()

print(vocab_2)

['.', 'A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']


In [13]:
# Create a Porter stemmer and reduce every token of the first sentence to its
# stem (e.g. "walking" -> "walk").
stemmer = nltk.stem.PorterStemmer()
stem_1 = list(map(stemmer.stem, tokens_1))
print(stem_1)

['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']


In [14]:
# Stem the tokens of the second sentence. A stem is not always a dictionary
# word: the Porter algorithm strips suffixes by rule, so "was" becomes "wa".
stem_2 = list(map(stemmer.stem, tokens_2))
print(stem_2)

['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']


In [17]:
# Label every token of the first sentence with its part-of-speech tag; the
# result is a list of (token, tag) pairs.
pos_tag_1 = nltk.pos_tag(tokens_1)
print(pos_tag_1)

[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]


In [18]:
# Label every token of the second sentence with its part-of-speech tag; the
# result is a list of (token, tag) pairs.
pos_tag_2 = nltk.pos_tag(tokens_2)
print(pos_tag_2)

[('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]
