In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Tokenization 

### 基本的分句 ：

    sent_tokenize 是PunktSentenceTokenizer的一个instance。它其实是一个已经训练好的模型。以.pickle的存储，需要时load

In [1]:
import nltk

text = "Welcome readers. I hope you find it interesting. Please do reply."
from nltk.tokenize import sent_tokenize

print(sent_tokenize(text))

['Welcome readers.', 'I hope you find it interesting.', 'Please do reply.']


### 若有大量句子需要 tokenized，则 load PunktSentenceTokenizer ：

In [2]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = "Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
print(tokenizer.tokenize(text))

['Hello everyone.', 'Hope all are fine and doing well.', 'Hope you find the book interesting']


### 其他语言的Tokenization
    从 tokenizers/punkt 中加载相应语言的 .pickle文件 ——作为参数传入tokenizer()

In [3]:
tokenizer = nltk.data.load('tokenizers/punkt/french.pickle') # 法语tokenization
french_text = 'Deux agressions en quelques jours,\
voilà ce qui a motivé hier matin le débrayage  collège franco-britanniquedeLevallois-Perret.\
Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage  Levallois.\
L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression,\
janvier , d’un professeur d’histoire. L’équipe pédagogique de ce collège de 750 élèves avait \
déjà été choquée par l’agression, mercredi , d’un professeur d’histoire'
for no,sent in enumerate(tokenizer.tokenize(french_text)):
    print(no,'\n',sent)
    print('-----')

0 
 Deux agressions en quelques jours,voilà ce qui a motivé hier matin le débrayage  collège franco-britanniquedeLevallois-Perret.Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage  Levallois.L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression,janvier , d’un professeur d’histoire.
-----
1 
 L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , d’un professeur d’histoire
-----


### 单个句子分词
    word_tokenize 是TreebankWordTokenizer 的一个 instance，基于空格和标点分词

In [4]:
text = 'PierreVinken, 59 years old, will join as a nonexecutive director non Nov. 29 .'
tokens = nltk.word_tokenize(text)
print(tokens)

['PierreVinken', ',', '59', 'years', 'old', ',', 'will', 'join', 'as', 'a', 'nonexecutive', 'director', 'non', 'Nov.', '29', '.']


## Tokenization using TreebankWordTokenizer

In [5]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
text = 'Have a nice day. I hope you find the book interesting'
print(tokenizer.tokenize(text))

['Have', 'a', 'nice', 'day.', 'I', 'hope', 'you', 'find', 'the', 'book', 'interesting']


In [6]:
# 依循Penn Treebank 语料库的惯例，以缩略词 分词
tokens = nltk.word_tokenize("Don't hesitate to ask questions")
print(tokens)

['Do', "n't", 'hesitate', 'to', 'ask', 'questions']


In [21]:
# WordPunctTokenizer 以标点分词，并将标点作为new token
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize("Don't hesitate to ask question"))

['Don', "'", 't', 'hesitate', 'to', 'ask', 'question']


## Tokenization using regular expressions

In [1]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")
print(tokenizer.tokenize("Don't hesitate to ask question"))

["Don't", 'hesitate', 'to', 'ask', 'question']


In [2]:
# 类的instantiating 之外，另一种方法是使用function：
from nltk.tokenize import regexp_tokenize
sent = "Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+'))

['Don', "'t", 'hesitate', 'to', 'ask', 'questions']


In [25]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('\s+', gaps=True) # 非空字符，以空格分割
print(tokenizer.tokenize("Don't hesitate to ask question"))

["Don't", 'hesitate', 'to', 'ask', 'question']


In [26]:
sent = " She secured 90.56 % in class X . She is a meritorious student"
capt = RegexpTokenizer('[A-Z]\w+') # 首字母大写的词
print(capt.tokenize(sent))

['She', 'She']


In [22]:
from nltk.tokenize import BlanklineTokenizer #RegexpTokenizer 的子类，内含已定义好的pattern
tokenizer = BlanklineTokenizer()  # 以空白行（换行符）为分隔符
sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
print(sent)
print(tokenizer.tokenize(sent))  

 She secured 90.56 % in class X 
. She is a meritorious student

[' She secured 90.56 % in class X \n. She is a meritorious student\n']


In [6]:
from nltk.tokenize import WhitespaceTokenizer # 通常应该用split()，以空格、制表符、换行符作为分隔符
tokenizer = WhitespaceTokenizer()
sent = " She secured 90.56 % in class X . She is a meritorious student"
print(tokenizer.tokenize(sent))

['She', 'secured', '90.56', '%', 'in', 'class', 'X', '.', 'She', 'is', 'a', 'meritorious', 'student']


In [29]:
sent = "She secured 90.56 % in class X. She is a meritorious student"
print(sent.split())
print(sent.split(' '))

print('-------\n')
sent = "She secured 90.56 % in class X \n. She is a meritorious student\n"
print('\\n: ',sent.split('\n'))
#print("\'\': ",sent.split(''))
print("\n'空格': ",sent.split(' '))
print('\nNone: ',sent.split())

['She', 'secured', '90.56', '%', 'in', 'class', 'X.', 'She', 'is', 'a', 'meritorious', 'student']
['She', 'secured', '90.56', '%', 'in', 'class', 'X.', 'She', 'is', 'a', 'meritorious', 'student']
-------

\n:  ['She secured 90.56 % in class X ', '. She is a meritorious student', '']

'空格':  ['She', 'secured', '90.56', '%', 'in', 'class', 'X', '\n.', 'She', 'is', 'a', 'meritorious', 'student\n']

None:  ['She', 'secured', '90.56', '%', 'in', 'class', 'X', '.', 'She', 'is', 'a', 'meritorious', 'student']


In [31]:
sent = "She secured 90.56 % in class X \n. She is a meritorious student\n"
from nltk.tokenize import BlanklineTokenizer
Blanklinetokenizer = BlanklineTokenizer()
print('Blankline: ',Blanklinetokenizer.tokenize(sent))

from nltk.tokenize import LineTokenizer # 分行器，类似 sent.split('\n') 
print("blankline='keep': ",LineTokenizer(blanklines='keep').tokenize(sent)) #??
print("blankline='discard': ",LineTokenizer(blanklines='discard').tokenize(sent))

Blankline:  ['She secured 90.56 % in class X \n. She is a meritorious student\n']
blankline='keep':  ['She secured 90.56 % in class X ', '. She is a meritorious student']
blankline='discard':  ['She secured 90.56 % in class X ', '. She is a meritorious student']


['She secured 90.56 % in class X ', '. She is a meritorious student']

['She secured 90.56 % in class X ', '. She is a meritorious student']

In [1]:
import nltk
sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
from nltk.tokenize import SpaceTokenizer  #  similar to sent.split('')
tokenizer = SpaceTokenizer()
print(tokenizer.tokenize(sent))

['', 'She', 'secured', '90.56', '%', 'in', 'class', 'X', '\n.', 'She', 'is', 'a', 'meritorious', 'student\n']


In [40]:
from nltk.tokenize import WhitespaceTokenizer
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
list(WhitespaceTokenizer().span_tokenize(sent))  # 计算tokens 在句子中的offsets ??

[(1, 4),
 (5, 12),
 (13, 18),
 (19, 20),
 (21, 23),
 (24, 29),
 (30, 31),
 (33, 34),
 (35, 38),
 (39, 41),
 (42, 43),
 (44, 55),
 (56, 63)]

In [44]:
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.util import spans_to_relative
sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))) #??

[(1, 3),
 (1, 7),
 (1, 5),
 (1, 1),
 (1, 2),
 (1, 5),
 (1, 1),
 (2, 1),
 (1, 3),
 (1, 2),
 (1, 1),
 (1, 11),
 (1, 7)]

In [34]:
from nltk.tokenize.util import string_span_tokenize  # ??
sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(string_span_tokenize(sent, " ")))

[(1, 4), (5, 12), (13, 18), (19, 20), (21, 23), (24, 29), (30, 31), (32, 34), (35, 38), (39, 41), (42, 43), (44, 55), (56, 64)]


## Normalization
包括消除标点、大小写转换、阿拉伯数字转换成单词、缩写扩展等等

### 消除标点

In [16]:
# 含有标点
text=[" It is a pleasant evening.",
      "Guests, who came from US arrived at the venue",
      "Food was tasty."]
from nltk.tokenize import word_tokenize
tokenized_doc = [word_tokenize(doc) for doc in text]
tokenized_doc

[['It', 'is', 'a', 'pleasant', 'evening', '.'],
 ['Guests', ',', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'],
 ['Food', 'was', 'tasty', '.']]

In [19]:
# 去除每句话中的标点
import re
import string
text = [" It is a pleasant evening.",
        "Guests, who came from US arrived at the venue",
        "Food was tasty."]
from nltk.tokenize import word_tokenize
tokenized_docs = [word_tokenize(doc) for doc in text]
x = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = x.sub(u'',token)  # ??
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)

[['It', 'is', 'a', 'pleasant', 'evening'], ['Guests', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty']]


### 大小写转换

In [20]:
text="HARdWork IS KEy to SUCCESS"
print(text.lower())
print(text.upper())

hardwork is key to success
HARDWORK IS KEY TO SUCCESS


### 停用词
这些词对句子的整体意义没有多大贡献。

许多搜索引擎通过删除停止词来减少搜索空间。

NLTK有多种语言的停用词列表。

In [23]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english')) # .words() function。 参数fileid，此处是english；若不指定fileid，则指所有语言的停用词
words = ["Don't", "hesitate", "to", "ask", "questions"]
print([word for word in words if word not in stops])

["Don't", 'hesitate', 'ask', 'questions']


In [25]:
from nltk.corpus import stopwords
stopwords.fileids()   # 查看有哪些语言的停用词列表可指定

['danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'kazakh',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']

### 计算非停用词占比

In [2]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))
def para_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    para = [w for w in text if w.lower() not in stopwords]
    return len(para) / len(text)

print(para_fraction(nltk.corpus.reuters.words()))
print(para_fraction(nltk.corpus.inaugural.words()))

0.735240435097661
0.5228599855902837


### 替换和纠正tokens
比如：
* 数字转换： 1 -> one
* 缩写扩展：can't -> cannot

#### 用正则表达式进行单词替换操作
写一个 replacers.py 存于nltkdata 文件夹以调用

### 缩写扩展

In [2]:
from replacers import RegexpReplacer
replacer = RegexpReplacer()
replacer.replace("Don't hesitate to ask quesiton")
print(replacer.replace("She must've gone to the market but she didn't go"))

'Do not hesitate to ask quesiton'

She must have gone to the market but she did not go


分词前先进行缩写扩展

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer = RegexpReplacer()
text = "Don't hesitate to ask questions"
tokens = word_tokenize(text)
tokens
print(word_tokenize(replacer.replace(text)))

['Do', 'not', 'hesitate', 'to', 'ask', 'questions']


处理重复字符

In [1]:
from replacers import RepeatReplacer
replacer = RepeatReplacer()
print(replacer.replace('lottttt'))
print(replacer.replace('ohhhhh'))
print(replacer.replace('ooohhhhh'))

lot
oh
oh


In [None]:
from replacers import RepeatReplacer
replacer = RepeatReplacer()
print(replacer.replace('happy'))

In [None]:
from replacers import WordReplacer
replacer = WordReplacer({'congrats': 'congratulations'})
print(replacer.replace('congrats'))
print(replacer.replace('maths'))

In [11]:
from nltk.metrics import *
training = 'PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing = 'PERSON OTHER OTHER OTHER OTHER OTHER'.split()
print('accuracy: ',accuracy(training, testing))
trainset = set(training)
testset = set(testing)
print('precision: ',precision(trainset, testset))
print('recall: ',recall(trainset, testset))
print('f_measure: ',f_measure(trainset, testset))

accuracy:  0.6666666666666666
precision:  1.0
recall:  0.6666666666666666
f_measure:  0.8


In [12]:
from nltk.metrics import *
print(edit_distance('relate','relation'))
print(edit_distance('suggestion','calculation'))

3
7


In [17]:
from nltk.metrics import *
X = set([10, 20, 30, 40])
Y = set([20, 30, 60])
print(jaccard_distance(X,Y))

0.6


In [19]:
from nltk.metrics import *
X = set([10, 20, 30, 40])
Y = set([30, 50, 70])
print(binary_distance(X,Y))

1.0


In [22]:
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
trigrams_tokens=ngrams(alpino.words(),3)
for i in trigrams_tokens:
    print(i)

['De', 'verzekeringsmaatschappijen', 'verhelen', ...]
('De', 'verzekeringsmaatschappijen', 'verhelen')
('verzekeringsmaatschappijen', 'verhelen', 'niet')
('verhelen', 'niet', 'dat')
('niet', 'dat', 'ook')
('dat', 'ook', 'de')
('ook', 'de', 'rentegrondslag')
('de', 'rentegrondslag', 'van')
('rentegrondslag', 'van', 'vier')
('van', 'vier', 'procent')
('vier', 'procent', 'nog')
('procent', 'nog', 'een')
('nog', 'een', 'ruime')
('een', 'ruime', 'marge')
('ruime', 'marge', 'laat')
('marge', 'laat', 'ten')
('laat', 'ten', 'opzichte')
('ten', 'opzichte', 'van')
('opzichte', 'van', 'de')
('van', 'de', 'thans')
('de', 'thans', 'geldende')
('thans', 'geldende', 'rentestand')
('geldende', 'rentestand', '.')
('rentestand', '.', 'Gezien')
('.', 'Gezien', 'de')
('Gezien', 'de', 'lange')
('de', 'lange', 'duur')
('lange', 'duur', 'van')
('duur', 'van', 'vele')
('van', 'vele', 'verzekeringscontracten')
('vele', 'verzekeringscontracten', 'is')
('verzekeringscontracten', 'is', 'dit')
('is', 'dit', 'onver