In [41]:
import spacy
nlpz = spacy.load('zh_core_web_sm')

# doc3 = nlpz("我爱北京天安门")
# for token in doc3:
#     print(token.text, token.pos_, token.dep_)

import pathlib
doc4 = nlpz((pathlib.Path("./data/text_zh.txt").read_text(encoding='utf8')))

# 获取token(文本中每个词（词组）为一个token)，打印token的文本，token的词性，token的依存关系
print("\n ###### 获取token，打印token的文本，在文本中的起始位置，token的词性，token的依存关系，token是否为停止词， token是否为字母， token是否为标点符号 ###")
for token in doc4:
    print(token.text,token.idx, token.pos_, token.dep_, token.is_stop, token.is_alpha, token.is_punct)


 ###### 获取token，打印token的文本，在文本中的起始位置，token的词性，token的依存关系，token是否为停止词， token是否为字母， token是否为标点符号 ###
我 0 PRON nsubj True True False
是 1 VERB cop True True False
一个 2 NUM dep True True False
小 4 ADJ amod True True False
老虎 5 NOUN ROOT False True False
， 7 PUNCT punct True False True
天天 8 ADV advmod False True False
开心 10 VERB conj False True False
。 12 PUNCT punct True False True

 13 SPACE dep False False False
他 14 PRON nsubj True True False
是 15 VERB cop True True False
一个 16 X dep True True False
【 18 PUNCT punct True False True
大笨 19 NOUN dep False True False
猪】 21 NOUN ROOT False False False
， 23 PUNCT punct True False True
走 24 VERB conj False True False
在 25 ADP case True True False
大路 26 NOUN nmod:prep False True False
。 28 PUNCT punct True False True

 29 SPACE dep False False False
哈哈哈 30 PROPN nsubj False True False
， 33 PUNCT punct True False True
这是 34 VERB dep False True False
一个 36 NUM dep True True False
美妙 38 ADJ amod False True False
的 40 PART case True True False
清晨 4

In [42]:
# 获取句子，打印句子的前3个 token
print("\n ###### 获取句子，打印句子的前10个 token，而不是词 ###")
for sent in doc4.sents:
    print(sent[:10])

# 自定义句子分隔符，进行句子检测
print("\n ##### 使用自定义分词器 ####")
from spacy.language import Language


@Language.component("my_sentencizer")
def my_sentencizer(doc):
    for token in doc[:-1]:
        if token.text == '...' and not token.is_punct and not token.is_sent_start:
            doc[token.i+1].is_sent_start = True
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('my_sentencizer', before='parser')
doc5 = nlp((pathlib.Path("./data/text.txt").read_text(encoding='utf8')))
for sent in doc5.sents:
    print(sent)


 ###### 获取句子，打印句子的前10个 token，而不是词 ###
我是一个小老虎，天天开心。


他是一个【大笨猪】，走在大路


哈哈哈，这是一个美妙的清晨，一切都
走在张家界的山谷里，都是空气
啦啦啦啦。
快快走吧。

快快乐乐吧...，一切都是快快乐乐...的


快去吧，都是好好的。

 ##### 使用自定义分词器 ####
who are you.

give me... a token.

this is... a notebook.

hi give me five.


In [47]:
# 自定义分词器，将特殊字符，如：&、@等，能够分词出来，比如：new@year 可以分为两个token而不是一个

custom_about_text = (
    "Gus Proto is a 【Python】 developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural&Language&Processing. 1234569. abced"
)

print([token.text for token in nlp(custom_about_text)[:-1]])

['Gus', 'Proto', 'is', 'a', '【', 'Python', '】', 'developer', 'currently', 'working', 'for', 'a', 'London@based', 'Fintech', 'company', '.', 'He', 'is', 'interested', 'in', 'learning', 'Natural&Language&Processing', '.', '1234569', '.']


In [46]:
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")
# 处理前面的标点符号（如左括号）的函数。
prefix_re = spacy.util.compile_prefix_regex(
    custom_nlp.Defaults.prefixes + ["【"]
)
# 处理后续标点符号（如右括号）的函数。
suffix_re = spacy.util.compile_suffix_regex(
    custom_nlp.Defaults.suffixes + ['】']
)

custom_infixes = ["@", "&"]

# 处理非空格分隔符（如连字符）的函数。
infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

# 自定义 tokenizer，其中token_match 用于匹配不应拆分的字符串。它会覆盖前面的规则，对 URL 或数字号码，等不应该拆分的实体很有用。
custom_nlp.tokenizer = Tokenizer(
    custom_nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

custom_tokenizer_about_doc = custom_nlp(custom_about_text)

print([token.text for token in custom_tokenizer_about_doc[:-1]])

for token in custom_tokenizer_about_doc:
    print(token.text, token.idx)

['Gus', 'Proto', 'is', 'a', '【', 'Python', '】', 'developer', 'currently', 'working', 'for', 'a', 'London', '@', 'based', 'Fintech', 'company', '.', 'He', 'is', 'interested', 'in', 'learning', 'Natural', '&', 'Language', '&', 'Processing']
Gus 0
Proto 4
is 10
a 13
【 15
Python 16
】 22
developer 24
currently 34
working 44
for 52
a 56
London 58
@ 64
based 65
Fintech 71
company 79
. 86
He 88
is 91
interested 94
in 105
learning 108
Natural 117
& 124
Language 125
& 133
Processing 134
. 144


In [50]:
# stop_word 停用词，停用词通常定义为语言中最常见的词。在英语中，停用词的一些示例是 the、are、but 和 they。大多数句子需要包含停用词才能成为具有语法意义的完整句子。
# 使用 NLP 时，停用词通常会被删除，因为它们并不重要，并且它们会严重扭曲任何词频分析。spaCy 存储英语的停用词列表：

import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords)[:14]:
    print(stop_word)


before
each
eleven
herself
wherever
ca
few
anywhere
nevertheless
with
whereafter
either
might
him


In [52]:
# 使用 is_stop属性删除停用词
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
nlp = spacy.load("en_core_web_sm")
about_doc = nlp(custom_about_text)
print([token for token in about_doc if not token.is_stop])

[Gus, Proto, Python, developer, currently, working, London, -, based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


In [55]:
# 词性还原， 将 过去式 还原成 现在式
# 词形还原是必需的，因为它可以帮助您减少单词的变形形式，以便可以将它们作为单个项目进行分析。它还可以帮助您规范化文本。

# 不对文本进行词形还原，那么 organize 和 organizing 将被计为不同的标记，词形还原可帮助避免可能在概念上重叠的重复单词。进行词频统计。
import spacy
nlp = spacy.load("en_core_web_sm")
conference_help_text = (
    "Gus is helping organize a developer"
    " conference on Applications of Natural Language"
    " Processing. He keeps organizing local Python meetups"
    " and several internal talks at his workplace."
)
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20} : {str(token.lemma_)}")

                  is : be
                  He : he
               keeps : keep
          organizing : organize
             meetups : meetup
               talks : talk


In [56]:
# 词频统计， 统计文本中出现最多得词汇，初步分析文本主旨，此过程可能会丢失信息

import spacy
from collections import Counter
nlp = spacy.load("en_core_web_sm")
complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)
complete_doc = nlp(complete_text)

words = [
    token.text
    for token in complete_doc
    if not token.is_stop and not token.is_punct
]

print(Counter(words).most_common(5))

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


In [1]:
# 词性标记，根据每个词标记在句子中的用法为每个标记分配 POS 标签
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
about_doc = nlp(about_text)
for token in about_doc:
    print(
        f"""
TOKEN: {str(token)}
=====
TAG: {str(token.tag_):10} POS: {token.pos_}
EXPLANATION: {spacy.explain(token.tag_)}"""
    )


TOKEN: Gus
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: Proto
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: is
=====
TAG: VBZ        POS: AUX
EXPLANATION: verb, 3rd person singular present

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: Python
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: developer
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: currently
=====
TAG: RB         POS: ADV
EXPLANATION: adverb

TOKEN: working
=====
TAG: VBG        POS: VERB
EXPLANATION: verb, gerund or present participle

TOKEN: for
=====
TAG: IN         POS: ADP
EXPLANATION: conjunction, subordinating or preposition

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: London
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: -
=====
TAG: HYPH       POS: PUNCT
EXPLANATION: punctuation mark, hyphen

TOKEN: based
=====
TAG