In [1]:
import spacy

## 先加载英文模型
这里用的是 en_core_web_sm

In [2]:
# Load the small English pipeline and run it once on a short sentence.
nlp = spacy.load('en_core_web_sm') # the default 'en' model is this sm version
test_doc = nlp(u"it's word tokenize test for spacy")
print(test_doc)
# Bare last expression: the notebook displays the pipeline as a list of (name, component) pairs.
nlp.pipeline

it's word tokenize test for spacy


[('tagger', <spacy.pipeline.pipes.Tagger at 0x15066f4fcf8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x150683baca8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x150683bad08>)]

In [3]:
# Iterating over the parsed document yields its tokens one by one;
# the contraction "it's" comes out as two tokens ("it" and "'s").
for token in test_doc:
    print(token)

it
's
word
tokenize
test
for
spacy


### 英文断句：
test_doc.sents 将文档拆分成句子


In [4]:
# Parse a four-sentence paragraph; the text is written as one adjacent string
# literal per sentence (joined by the parser into a single string).
test_doc = nlp(
    'Natural language processing (NLP) deals with the application of computational models to text or speech data. '
    'Application areas within NLP include automatic (machine) translation between languages; dialogue systems, which allow a human to interact with a machine using natural language; and information extraction, where the goal is to transform unstructured text into structured (database) representations that can be searched and browsed in flexible ways. '
    'NLP technologies are having a dramatic impact on the way people interact with computers, on the way people interact with each other through the use of language, and on the way people access the vast amount of linguistic data now in electronic form. '
    'From a scientific viewpoint, NLP involves fundamental questions of how to structure formal models (for example statistical models) of natural language phenomena, and of how to design algorithms that implement these models.'
)

In [5]:
# test_doc.sents yields one span per detected sentence;
# each sentence is printed followed by an empty line.
for sent in test_doc.sents:
    print(sent, end="\n\n")

Natural language processing (NLP) deals with the application of computational models to text or speech data.

Application areas within NLP include automatic (machine) translation between languages; dialogue systems, which allow a human to interact with a machine using natural language; and information extraction, where the goal is to transform unstructured text into structured (database) representations that can be searched and browsed in flexible ways.

NLP technologies are having a dramatic impact on the way people interact with computers, on the way people interact with each other through the use of language, and on the way people access the vast amount of linguistic data now in electronic form.

From a scientific viewpoint, NLP involves fundamental questions of how to structure formal models (for example statistical models) of natural language phenomena, and of how to design algorithms that implement these models.



### 词形还原（Lemmatization）：

In [6]:
# Lemmatization demo: for every token show the token itself, its lemma as
# text (lemma_) and the integer ID of that lemma (lemma).
test_doc = nlp(u"you are best. it is lemmatize test for spacy. I love these books")
for token in test_doc:
    lemma_row = (token, token.lemma_, token.lemma)
    print(lemma_row)

(you, '-PRON-', 561228191312463089)
(are, 'be', 10382539506755952630)
(best, 'good', 5711639017775284443)
(., '.', 12646065887601541794)
(it, '-PRON-', 561228191312463089)
(is, 'be', 10382539506755952630)
(lemmatize, 'lemmatize', 4507259281035238268)
(test, 'test', 1618900948208871284)
(for, 'for', 16037325823156266367)
(spacy, 'spacy', 10639093010105930009)
(., '.', 12646065887601541794)
(I, '-PRON-', 561228191312463089)
(love, 'love', 3702023516439754181)
(these, 'these', 6459564349623679250)
(books, 'book', 13814433107111459297)


### 词性标注(POS Tagging):

In [7]:
# Part-of-speech tags for the document parsed in the lemmatization cell:
# pos_ is the tag name, pos is its integer ID.
for token in test_doc:
    pos_row = (token, token.pos_, token.pos)
    print(pos_row)

(you, 'PRON', 95)
(are, 'VERB', 100)
(best, 'ADJ', 84)
(., 'PUNCT', 97)
(it, 'PRON', 95)
(is, 'VERB', 100)
(lemmatize, 'NOUN', 92)
(test, 'NOUN', 92)
(for, 'ADP', 85)
(spacy, 'NOUN', 92)
(., 'PUNCT', 97)
(I, 'PRON', 95)
(love, 'VERB', 100)
(these, 'DET', 90)
(books, 'NOUN', 92)


### 命名实体识别（NER）：
test_doc.ents 表示命名实体

In [8]:
# Named-entity demo: list each recognized entity span together with its
# label name (label_) and the integer ID of that label (label).
test_doc = nlp(u"Rami Eid is studying at Stony Brook University in New York")
for entity in test_doc.ents:
    entity_row = (entity, entity.label_, entity.label)
    print(entity_row)

(Rami Eid, 'PERSON', 380)
(Stony Brook University, 'ORG', 383)
(New York, 'GPE', 384)


### 名词短语提取：

In [9]:
# Same paragraph as in cell In [4]; it is parsed again because test_doc was
# rebound to other documents by the cells in between.
test_doc = nlp(
    'Natural language processing (NLP) deals with the application of computational models to text or speech data. '
    'Application areas within NLP include automatic (machine) translation between languages; dialogue systems, which allow a human to interact with a machine using natural language; and information extraction, where the goal is to transform unstructured text into structured (database) representations that can be searched and browsed in flexible ways. '
    'NLP technologies are having a dramatic impact on the way people interact with computers, on the way people interact with each other through the use of language, and on the way people access the vast amount of linguistic data now in electronic form. '
    'From a scientific viewpoint, NLP involves fundamental questions of how to structure formal models (for example statistical models) of natural language phenomena, and of how to design algorithms that implement these models.'
)

In [10]:
# Print every noun phrase found in the parsed paragraph.
# The loop variable is deliberately not called `np`: loop variables stay bound
# in the notebook namespace and `np` is the conventional alias for numpy.
for noun_chunk in test_doc.noun_chunks:
    print(noun_chunk)

Natural language processing
the application
computational models
text
speech data
Application areas
NLP
automatic (machine) translation
languages
dialogue systems
a human
a machine
natural language
information extraction
the goal
unstructured text
database
flexible ways
NLP technologies
a dramatic impact
the way
people
computers
the way
people
the use
language
the way
people
the vast amount
linguistic data
electronic form
a scientific viewpoint
NLP
fundamental questions
formal models
example
natural language phenomena
algorithms
these models


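### 基于词向量计算两个单词的相似度：
注意：此时加载的 en_core_web_sm 不包含真正的词向量，下面的 similarity 结果是基于上下文张量估算的，仅供参考（spaCy 也会因此给出警告）；要得到可靠的词向量相似度，需要使用带词向量的模型（如后文的 en_core_web_lg）。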

In [11]:
# Parse two short sentences whose words are compared pairwise in the following cells.
test_doc = nlp(u"Apple and orange are similar. Boots and hippos aren't.")
# Bare last expression: the notebook displays the parsed document.
test_doc

Apple and orange are similar. Boots and hippos aren't.

In [12]:
# Token at index 0 of the parsed document ("Apple").
apple = test_doc[0]
print(apple)

Apple


In [13]:
# Token at index 2 of the parsed document ("orange").
orange = test_doc[2]
print(orange)

orange


In [14]:
# Token at index 8 of the parsed document ("hippos"); punctuation counts as tokens too.
hippos = test_doc[8]
print(hippos)

hippos


In [15]:
# Token at index 6 of the parsed document ("Boots"), the first word of the second sentence.
boots = test_doc[6]
print(boots)

Boots


In [17]:
# Similarity between the two tokens picked above, using the en_core_web_sm pipeline.
# NOTE(review): the sm model ships without real word vectors, so this score is
# presumably derived from context tensors; the truncated '"__main__", mod_spec)'
# line in the recorded output looks like the tail of spaCy's "no word vectors
# loaded" warning. TODO confirm.
apple.similarity(orange)

  "__main__", mod_spec)


0.35227782

In [18]:
# Similarity of the second pair, again with the en_core_web_sm pipeline.
# NOTE(review): without real word vectors in the sm model this score is probably
# not a meaningful contrast to the apple/orange one. TODO confirm.
boots.similarity(hippos)

  "__main__", mod_spec)


0.34843963

# 现在试试 spaCy 自带的词嵌入模型
对于英文来说，词向量包含在 en_core_web_lg 模型中，需要提前下载好（例如执行 `python -m spacy download en_core_web_lg`）。lg 模型比 sm 模型大很多。

In [19]:
# Rebind nlp to the large English model, replacing the sm pipeline loaded earlier.
# The model package must already be installed (e.g. `python -m spacy download en_core_web_lg`).
nlp = spacy.load('en_core_web_lg')

### 拿"minister"测试一下

In [20]:
# Look up the vocabulary entry for "minister" and print its whole embedding vector.
print(nlp.vocab['minister'].vector)

[-3.2239e-01  3.8982e-01  6.4522e-01  8.7596e-02  4.0262e-01 -4.1253e-01
  1.3804e-01 -4.0226e-01 -3.0679e-01  3.6520e+00 -6.1745e-01 -2.4114e-01
  1.7488e-01 -2.5995e-01 -3.8512e-01 -1.3747e-01 -3.7446e-01 -8.0867e-01
  2.8081e-01  2.9735e-01  1.7901e-01 -3.4726e-02  7.2466e-02 -5.2111e-01
  7.1858e-02  1.5913e-01 -6.0877e-01  1.6604e-01 -9.3809e-02  2.0030e-01
 -5.0653e-01 -1.4978e-01  1.7742e-01  3.2996e-02 -2.1082e-01 -6.0442e-02
 -9.6639e-02 -4.6054e-01 -7.1622e-02 -1.4829e-01  5.1362e-01 -3.7840e-01
 -3.2035e-01 -1.1593e-01 -1.5887e-01  2.5999e-01  2.6821e-01  2.7429e-01
 -5.8973e-02 -1.0218e-01  6.6629e-03  3.5737e-01  3.1083e-01  4.7950e-01
  4.4323e-01  8.0089e-03  6.4577e-02 -4.1851e-01  3.2589e-01 -6.0894e-01
 -3.5944e-01  1.9116e-01  1.5910e-02 -1.8912e-01  1.2646e-01 -9.6918e-03
 -6.5529e-01 -4.1851e-01  4.0244e-01 -6.7241e-01 -7.6219e-01  2.6207e-01
 -1.5607e+00  3.2982e-01 -4.1586e-01  3.4098e-01 -1.2571e-01 -1.2014e-01
  1.3608e-02 -1.7894e-01  5.0916e-01 -2.8449e-01 -1

In [21]:
# Fetch the vocabulary entries for the four test words in one statement.
# apple and orange are rebound here: earlier they were tokens of test_doc,
# from now on they are entries of the lg model's vocabulary.
dog, cat, apple, orange = (
    nlp.vocab[word] for word in ("dog", "cat", "apple", "orange")
)

In [22]:
# Display the object to see its type (the recorded output shows a spaCy Lexeme).
cat

<spacy.lexeme.Lexeme at 0x150091ed480>

可见，cat 并不是普通字符串，而是 spaCy 的 Lexeme 对象（词汇表中的一个词条）。

In [23]:
# apple vs. orange again, this time comparing the vocabulary entries of the lg model.
apple.similarity(orange)

0.56189173

In [24]:
# cat vs. dog, using the vocabulary entries of the lg model.
cat.similarity(dog)

0.80168545

In [25]:
# cat vs. apple: an unrelated pair, for contrast with the two comparisons above.
cat.similarity(apple)

0.28213844