In [1]:
import spacy

In [3]:
# Load spaCy's 'en_core_web_sm' pipeline; `nlp` is the callable used below to process text.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Tokenise: calling nlp on a string creates a document object.
# (Python 3 string literals are already unicode, so the legacy u'' prefix is optional.)
doc = nlp("Tesla is looking at buying U.S. startup for $6M")

In [11]:
# One output row per token of the document.
for token in doc:
    print(
        token.text,  # the token's text
        token.pos,   # part of speech as an integer ID
        token.pos_,  # part of speech as a readable tag (NOUN, VERB, PROPN = proper noun, ...)
        token.dep_,  # dependency label (nsubj, ROOT, ...)
    )

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM nmod
6 93 NUM pobj
M 92 NOUN pobj


In [12]:
# Calling nlp(...) actually runs a series of processing steps;
# nlp.pipeline shows them as (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x150c9b050>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x150c9af30>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x150a9cdd0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x150e48750>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x150e4a390>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x150a9d070>)]

In [13]:
# Just the component names, without the component objects.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [20]:
# Second example sentence, this time containing a contraction ("isn't").
doc2 = nlp("Tesla isn't looking into startups anymore.")

In [21]:
# Text, readable POS tag and dependency label for every token of doc2.
for token in doc2:
    print(
        token.text,
        token.pos_,
        token.dep_,
    )

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [22]:
# A document can be indexed by position; index 0 is its first token.
doc2[0]

Tesla

In [23]:
# Longer text, written as adjacent string literals that Python joins into one string.
# NOTE: "commmonly" is misspelled in the sample text; it is kept exactly as it was.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [24]:
# Slice tokens 16-29 out of doc3: the quoted sentence, including both quotation marks.
life_quote = doc3[16:30]

In [25]:
# A span of the overall document; displaying it shows the text it covers.
life_quote

"Life is what happens to us while we are making other plans"

In [26]:
# Slicing a document yields a Span object rather than another Doc.
type(life_quote)

spacy.tokens.span.Span

In [27]:
# For comparison: the full document is a Doc.
type(doc3)

spacy.tokens.doc.Doc

In [28]:
# Three short sentences (the last one has no final full stop) for the sentence-splitting demo.
doc4 = nlp("This is the first sentence. This is the second. this is le last sentunce")

In [30]:
# doc4.sents yields the sentences found in the document, one at a time.
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is the second.
this is le last sentunce


In [31]:
# Token 6 is the "This" that opens the second sentence, so it is flagged as a sentence start.
doc4[6].is_sent_start

True

In [32]:
# Token 8 ("the") sits inside the second sentence, so it is not a sentence start.
doc4[8].is_sent_start

False

In [33]:
# Tokenisation is the process of breaking the original text into component pieces called tokens.

In [34]:
# Sample text wrapped in literal double quotes, containing a contraction and an abbreviation.
my_str = "\"We're moving to L.A. in the spring!\""

In [36]:
# Echo the raw string (displayed as its repr, hence the escaped apostrophe).
my_str

'"We\'re moving to L.A. in the spring!"'

In [37]:
# Tokenise the sample string (this rebinds `doc` to a new document).
doc = nlp(my_str)

In [38]:
# One token per line: the quotes, "'re" and "!" are split off, while "L.A." stays whole.
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
in
the
spring
!
"


In [39]:
# Text containing an e-mail address and a bare domain name.
str_two = "We're here to help, send email to us at support@infotrack.com.au or visit our site infotrackgo.com.au"

In [40]:
# Echo the string to check its contents.
str_two

"We're here to help, send email to us at support@infotrack.com.au or visit our site infotrackgo.com.au"

In [41]:
# Tokenise it (this rebinds `doc2`, which previously held the Tesla sentence).
doc2 = nlp(str_two)

In [42]:
# The e-mail address and the domain name each come out as a single token.
for tok in doc2:
    print(tok.text)

We
're
here
to
help
,
send
email
to
us
at
support@infotrack.com.au
or
visit
our
site
infotrackgo.com.au


In [43]:
# Text mixing a distance, an acronym, a hyphenated word and a price.
doc3 = nlp("A 5KM NYC cab-ride costs $10.30")

In [44]:
# One token per line: "cab-ride" splits around the hyphen and "$" is separated from "10.30".
for tok in doc3:
    print(tok.text)

A
5KM
NYC
cab
-
ride
costs
$
10.30


In [45]:
# Number of tokens in doc4 (the three-sentence example defined earlier).
len(doc4)

16

In [47]:
# Size of the vocabulary object attached to doc4.
len(doc4.vocab)

836

In [51]:
# Sentence with several named entities (note that "hong kong" is written in lowercase).
doc8 = nlp("Apple to build a hong kong factory for $6 Million")

In [52]:
# Tokens first: "hong" and "kong" are separate tokens, as are "$", "6" and "Million".
for token in doc8:
    print(token.text)

Apple
to
build
a
hong
kong
factory
for
$
6
Million


In [58]:
# Named entities of doc8: the entity itself, its label, and spaCy's description of that label.
for entity in doc8.ents:
    print(
        entity,
        entity.label_,
        spacy.explain(entity.label_),  # print() stringifies this itself
    )

Apple ORG Companies, agencies, institutions, etc.
hong kong GPE Countries, cities, states
$6 Million MONEY Monetary values, including unit


In [59]:
# Sentence used to demonstrate noun chunks.
doc9 = nlp("Autonomous cars shift insurance liability towards manufacturers")

In [60]:
# Print each noun chunk of doc9 on its own line.
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [61]:
# Tokenisation Visualisation
from spacy import displacy

In [62]:
# Document to visualise with displacy.
doc = nlp("Apple is going to build a UK factory for $7 Million")

In [63]:
# Draw the dependency parse; 'dep' is displacy's default style, spelled out here for clarity.
displacy.render(doc, style='dep')  # render also accepts display options

In [64]:
# Sentence with several entities for the entity visualiser (sample text kept verbatim).
doc = nlp("Over the last quater Apple sold almost 20 thousand iPods for a profit of $2 Billion")

In [65]:
# Use the 'ent' style (named entities) and render inline in the notebook.
displacy.render(doc, jupyter=True, style="ent")

In [66]:
# Minimal document for the displacy.serve example.
doc = nlp("This is a sentence")

In [None]:
# For use outside Jupyter: displacy.serve starts a small web server that renders
# the visualisation (here the 'dep' style, on port 4000).
# NOTE(review): the recorded log shows it listening on 0.0.0.0, i.e. presumably all
# interfaces; consider passing host= to restrict it - confirm against the displacy docs.
displacy.serve(doc, port=4000, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:4000 ...



127.0.0.1 - - [20/Jul/2023 06:01:05] "GET / HTTP/1.1" 200 3394
127.0.0.1 - - [20/Jul/2023 06:01:05] "GET /favicon.ico HTTP/1.1" 200 3394
