In [20]:
# Intro to spaCy

In [21]:
import spacy
# Load the installed pipeline package "en_core_web_sm" and bind it to `nlp`.
# The cells below call `nlp(text)` to turn a raw string into a processed Doc.
nlp = spacy.load("en_core_web_sm")

In [22]:
# Run the loaded pipeline on a short sentence to get a processed document.
doc = nlp("This is a text")

# Verbatim text of every token, in document order.
[tok.text for tok in doc]
# -> ['This', 'is', 'a', 'text']

['This', 'is', 'a', 'text']

In [23]:
# Same sentence as the previous cell, processed again.
doc = nlp("This is a text")

# Slicing a Doc with [start:end] selects the tokens start..end-1 as one span;
# here that is the tokens at indices 2 and 3.
span = doc[2:4]
# The span's text is the last expression, so the cell displays it: 'a text'
span.text

'a text'

In [24]:
doc = nlp("This is a text.")

# Coarse-grained part-of-speech tags (one tag string per token)
[token.pos_ for token in doc]
# ['PRON', 'AUX', 'DET', 'NOUN', 'PUNCT']
# NOTE(review): an earlier version of this comment listed 'DET', 'VERB' for the
# first two tokens, which did not match the output recorded for this cell;
# presumably it was copied from an older model release.

# Fine-grained part-of-speech tags (disabled; only the last expression of a
# cell is displayed, so uncomment and move to its own cell to inspect)
# [token.tag_ for token in doc]
# ['DT', 'VBZ', 'DT', 'NN', '.']   <- expected per the original note, not executed here

['PRON', 'AUX', 'DET', 'NOUN', 'PUNCT']

In [25]:
# Process a sentence that mentions a person and a company.
doc = nlp("Larry Page founded Google")

# Pair the text of each named-entity span with its label string.
[(entity.text, entity.label_) for entity in doc.ents]
# -> [('Larry Page', 'PERSON'), ('Google', 'ORG')]

[('Larry Page', 'PERSON'), ('Google', 'ORG')]

In [26]:
# Two-sentence input used to demonstrate sentence segmentation.
# (The first sentence previously read "This a sentence." -- the missing "is"
# contradicted the expected output documented below.)
doc = nlp("This is a sentence. This is another one.")
# doc.sents is a generator that yields sentence spans; the comprehension
# materialises it into a list of the sentences' text.
[sent.text for sent in doc.sents]
# ['This is a sentence.', 'This is another one.']

['This is a sentence.', 'This is another one.']

In [27]:
# Comparing similarity

In [28]:
# Rebind `nlp` to the "en_core_web_md" pipeline for the similarity cells below.
# NOTE(review): this replaces the "en_core_web_sm" pipeline bound to `nlp`
# earlier in the notebook, so any earlier cell re-run after this one will use
# this pipeline instead -- results then depend on execution order.
# Presumably the md package is used because similarity relies on word vectors;
# confirm against spaCy's "vectors & similarity" documentation.
nlp = spacy.load("en_core_web_md")

In [48]:
# Two sentences that differ only in their last word.
doc1 = nlp("I like cats")
doc2 = nlp("I like dogs")

# Compare 2 documents (whole Doc vs whole Doc)
print(doc1.similarity(doc2))

# Compare 2 tokens: index 2 of each document ("cats" vs "dogs")
print(doc1[2].similarity(doc2[2]))

# Compare a token with a span: doc1[0] ("I") vs doc2[1:3] ("like dogs")
print(doc1[0].similarity(doc2[1:3]))

0.9853113936930281
0.8753712773323059
0.35636189579963684


In [49]:
# Danish input; rough English gloss:
#   doc1: "I like building models"
#   doc2: "Construction of models is something I like"
# NOTE(review): the only pipelines loaded in the visible cells are English
# ("en_core_web_sm" / "en_core_web_md"), so this score appears to be computed
# on Danish text with an English pipeline and may not reflect semantic
# similarity. Consider a Danish pipeline (e.g. da_core_news_md) for this cell.
# TODO confirm which pipeline `nlp` was bound to when this cell ran.
doc1 = nlp("Jeg kan godt lide at bygge modeller")
doc2 = nlp("Konstruktion af modeller er noget jeg synes om")

# Compare 2 documents
doc1.similarity(doc2)

0.7272915484427941

In [31]:
# Danish input on two unrelated topics; rough English gloss:
#   doc1: "he goes to football (practice)"
#   doc2: "the baker is open today. They sell 'krokoskager'"
#         (sic -- possibly a misspelling of "kokoskager", coconut cakes)
# NOTE(review): as far as the visible cells show, `nlp` is an English pipeline
# here, so the score for this Danish text may not be meaningful -- TODO confirm
# and consider a Danish pipeline (e.g. da_core_news_md).
doc1 = nlp("han går til fodbold")
doc2 = nlp("bageren har åbent idag. De sælger krokoskager")

# Compare 2 documents
doc1.similarity(doc2)

0.24033794179199525

In [32]:
from spacy import displacy