In [1]:
import nltk

In [31]:
# Download the NLTK resources this notebook relies on.
# Calling nltk.download() with no arguments opens the interactive downloader,
# which blocks an unattended "Restart Kernel -> Run All"; passing the package
# ids explicitly fetches exactly what is needed without any prompt.
#
# Corpora: brown, wordnet, stopwords
# Models:  porter_test, punkt, averaged_perceptron_tagger
#
# Additionally required (install separately):
# pip3 install jieba
nltk.download([
    "brown",
    "wordnet",
    "stopwords",
    "porter_test",
    "punkt",
    "averaged_perceptron_tagger",
])

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## 语料库

In [6]:
from nltk.corpus import brown

In [10]:
# Print the README text that describes the Brown corpus.
brown_readme = brown.readme()
print(brown_readme)

BROWN CORPUS

A Standard Corpus of Present-Day Edited American
English, for use with Digital Computers.

by W. N. Francis and H. Kucera (1964)
Department of Linguistics, Brown University
Providence, Rhode Island, USA

Revised 1971, Revised and Amplified 1979

http://www.hit.uib.no/icame/brown/bcm.html

Distributed with the permission of the copyright holder,
redistribution permitted.



In [11]:
# List the categories the Brown corpus is divided into.
brown_categories = brown.categories()
print(brown_categories)

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [18]:
# Report how many sentences and how many words the corpus contains,
# one number per line (sentences first, then words).
print(len(brown.sents()), len(brown.words()), sep="\n")

57340
1161192


## 1. 分词

### 1. 英文分词 （punkt分词模型）

In [24]:
import nltk

In [20]:
# The English text to be processed.
text = "Python is a high-level programming language, and i like it!"

In [30]:
# Tokenize with NLTK (the punkt tokenizer model must be installed beforehand).
# The result is a list of tokens.

# For comparison, a plain whitespace split:
#print(text.split())
seg_list = nltk.word_tokenize(text)
print(seg_list)

['Python', 'is', 'a', 'high-level', 'programming', 'language', ',', 'and', 'i', 'like', 'it', '!']


### 2. 中文分词 （jieba分词）

In [32]:
import jieba

In [33]:
# Sample Chinese sentence used by the jieba segmentation cells that follow.
text = "习近平在参加党的十九大贵州省代表团讨论时强调：全党全国各族人民万众一心，开拓进取，把新时代中国特色社会主义推向前进。"

In [40]:
# jieba full mode (cut_all=True): lists every fragment that could form a word,
# so the results overlap (typically used for content statistics such as word frequency).
# jieba.cut's result is materialised with list() before printing.
seg_list = jieba.cut(text, cut_all=True)
print(list(seg_list))

['习近平', '在', '参加', '党', '的', '十九', '九大', '贵州', '贵州省', '代表', '代表团', '讨论', '时', '强调', '', '', '全党', '全党全国', '全国', '各族', '各族人民', '族人', '人民', '万众', '万众一心', '一心', '', '', '开拓', '开拓进取', '进取', '', '', '把', '新', '时代', '中国', '国特', '特色', '社会', '社会主义', '会主', '主义', '推向', '向前', '前进', '', '']


In [41]:
# jieba precise mode (cut_all=False): segments the sentence as closely as
# possible to its meaning, without overlapping words (used for text analysis).
seg_list = jieba.cut(text, cut_all=False)
print(list(seg_list))

['习近平', '在', '参加', '党', '的', '十九', '大', '贵州省', '代表团', '讨论', '时', '强调', '：', '全党全国', '各族人民', '万众一心', '，', '开拓进取', '，', '把', '新', '时代', '中国', '特色', '社会主义', '推向', '前进', '。']


## 2.处理词性

### 1. 词干提取

#### 1. PorterStemmer （早期的一款词干提取算法）

In [42]:
# 波特　词干提取算法
from nltk.stem.porter import PorterStemmer

In [43]:
# Create a Porter stemmer instance.
# NOTE(review): the variable name is spelled "steammer"; the next cell relies on
# this exact spelling, so rename both together if it is ever corrected.
porter_steammer = PorterStemmer()

In [46]:
# "look", "looked" and "looking" all share the stem "look";
# "run" and "running" are stemmed the same way for comparison.
for word in ("look", "looked", "looking", "run", "running"):
    print(porter_steammer.stem(word))

look
look
look
run
run


#### 2. SnowballStemmer  (可以支持多个语言，并兼容porter）

In [48]:
from nltk.stem.snowball import SnowballStemmer

In [49]:
# Show the languages available to SnowballStemmer.
print(SnowballStemmer.languages)

('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [50]:
# Create a Snowball stemmer configured for English.
snowball_stemmer = SnowballStemmer("english")

In [51]:
# Stem three inflections of "look" with the Snowball stemmer.
for word in ("look", "looked", "looking"):
    print(snowball_stemmer.stem(word))

look
look
look


#### 3. LancasterStemmer （速度较快，常用于英文词干提取）

In [52]:
from nltk.stem.lancaster import LancasterStemmer

In [53]:
# Create a Lancaster stemmer instance.
lancaster_stemmer = LancasterStemmer()

In [54]:
# Stem three inflections of "look" with the Lancaster stemmer.
for word in ("look", "looked", "looking"):
    print(lancaster_stemmer.stem(word))

look
look
look


### 2. 词形归并

In [56]:
# 词形归并：将单词的各种词形归并为统一的词形
from nltk.stem import WordNetLemmatizer

In [57]:
# Create a WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

In [62]:
# Without a pos argument every word is lemmatized as a noun.
# "went" is ambiguous (the verb "go", or a noun used as a name), so under the
# noun default it is returned unchanged, as is "are".
for word in ("dogs", "cats", "are", "went"):
    print(lemmatizer.lemmatize(word))

dog
cat
are
went


In [63]:
# Pass pos="v" to lemmatize the words as verbs instead of nouns.
for word in ("are", "went"):
    print(lemmatizer.lemmatize(word, pos="v"))

be
go


### 3. 词性标注 (分词的同时，标注单词的词性）

In [64]:
# Requires the averaged_perceptron_tagger model to be downloaded beforehand
import nltk

In [65]:
# The English text to be processed.
text = "Python is a high-level programming language, and i like it!"

In [66]:
#1. Tokenize first: POS tagging works on a list of tokens.
seg_list = nltk.word_tokenize(text)
print(seg_list)

['Python', 'is', 'a', 'high-level', 'programming', 'language', ',', 'and', 'i', 'like', 'it', '!']


In [74]:
#2. POS tagging: the argument is the token list produced by the tokenizer.
# Returns a list of (word, tag) pairs covering every token.
pos_list = nltk.pos_tag(seg_list)
print(pos_list)

[('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), ('programming', 'NN'), ('language', 'NN'), (',', ','), ('and', 'CC'), ('i', 'VBP'), ('like', 'IN'), ('it', 'PRP'), ('!', '.')]


In [77]:
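# pos_tag_sents tags a batch of sentences and expects a list of token lists;
# given a flat token list it would treat each token as a sentence and tag its characters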
#print(nltk.pos_tag_sents(seg_list))

## 4. 去停用词

In [79]:
# 停用词：文本里经常出现，但是又没有特殊的含义的词语。为了节省空间提高统计分析速度，会去除停用词（过滤）

In [82]:
# 导入nltk的停用词语料库，需要事先下载安装 stopwords
import nltk
from nltk.corpus import stopwords

In [81]:
# The English text to be processed.
text = "Python is a high-level programming language, and i like it!"

In [89]:
# Tokenize first; stop words are filtered from the token list afterwards.
seg_list = nltk.word_tokenize(text)
print(seg_list)

['Python', 'is', 'a', 'high-level', 'programming', 'language', ',', 'and', 'i', 'like', 'it', '!']


In [88]:
# Show NLTK's English stop-word list.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [90]:
# Build the token list with stop words removed.
# The stop-word list is loaded once into a set: calling stopwords.words()
# inside the comprehension would rebuild the list for every token and scan it
# linearly, whereas set membership is a constant-time lookup.
english_stopwords = set(stopwords.words("english"))
filtered_list = [seg for seg in seg_list if seg not in english_stopwords]
print(filtered_list)

['Python', 'high-level', 'programming', 'language', ',', 'like', '!']


## 5. 常用的文本预处理流程

In [106]:
# 分词
import nltk
# 词形归并
from nltk.stem import WordNetLemmatizer
# 去停用词
from nltk.corpus import stopwords

In [107]:
# The text to be processed.
# (Meaning: life is like a box of chocolates; you never know which one you will get next.)
# NOTE: the literal contains a space inside "you' re", so the tokenizer output
# below shows "'" and "re" as separate tokens.
text = "Life was like a box of chocolates, you never know what you\' re gonna get." 

In [108]:
## 1. Tokenize: returns the list of tokens for the text.
seg_list = nltk.word_tokenize(text)
print(seg_list)

['Life', 'was', 'like', 'a', 'box', 'of', 'chocolates', ',', 'you', 'never', 'know', 'what', 'you', "'", 're', 'gon', 'na', 'get', '.']


In [100]:
## 2. Lemmatize every token.
# No pos is supplied, so each token is treated as a noun; that is why the
# recorded output turns "was" into "wa" while "chocolates" becomes "chocolate".
lemmatizer = WordNetLemmatizer()
word_list = list(map(lemmatizer.lemmatize, seg_list))
print(word_list)

['Life', 'wa', 'like', 'a', 'box', 'of', 'chocolate', ',', 'you', 'never', 'know', 'what', 'you', "'", 're', 'gon', 'na', 'get', '.']


In [109]:
### 3. Remove stop words.
# Load the stop-word list once into a set instead of calling
# stopwords.words() for every token inside the comprehension.
# Matching is case-sensitive and the NLTK list is lowercase, so a capitalised
# token such as "Life" is kept.
english_stopwords = set(stopwords.words("english"))
filtered_list = [word for word in word_list if word not in english_stopwords]
# Print the result so the cell actually produces the list shown as its output.
print(filtered_list)

['Life', 'wa', 'like', 'box', 'chocolate', ',', 'never', 'know', "'", 'gon', 'na', 'get', '.']


In [112]:
### 4. Compare the original text with the preprocessed result.
# The first line prints the raw text; the second joins the filtered tokens
# with single spaces.
# NOTE(review): the recorded output under this cell shows a slightly different
# label than the second format string, so it appears to come from an earlier run.
print("原文本内容: {}".format(text))
print("预处理后的文本：{}".format(" ".join(filtered_list)))

原文本内容: Life was like a box of chocolates, you never know what you' re gonna get.
处理后的文本：Life wa like box chocolate , never know ' gon na get .
