In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# nlp model will process the text and save it to a Doc object once called.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2c68286a030>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2c6847853d0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2c68462be60>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2c6848eba10>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2c6847cb490>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2c68462bed0>)]

Tokenisation

In [3]:
# doc will contain tokens of the text
doc = nlp('"Let\'s go to N.Y.!"')
for token in doc:
    print(token.text)

"
Let
's
go
to
N.Y.
!
"


Named Entities with spacy

In [4]:
""" 
In a sentence, a word or a group of words indicates names (e.g., Name of location, person, area, country, state, monetary values, and so on). 
The primary purpose of a named entity is to identify it.
The named entities can be accessible with ents attributes of Doc object.
"""

' \nIn a sentence, a word or a group of words indicates names (e.g., Name of location, person, area, country, state, monetary values, and so on). \nThe primary purpose of a named entity is to identify it.\nThe named entities can be accessible with ents attributes of Doc object.\n'

In [15]:
# Normal Tokenisation
sentence = nlp("Yash is an Machine Learning enthusiast and events head of RoboVITics")
for token in sentence:
    print(token.text,end=' ')

print('----------------------------')

# Named entity
for ent in sentence.ents:
    print(ent.text+'-'+ent.label_+'-'+str(spacy.explain(ent.label_)))

Yash is an Machine Learning enthusiast and events head of RoboVITics ----------------------------
Yash-PERSON-People, including fictional
Machine Learning-GPE-Countries, cities, states
RoboVITics-NORP-Nationalities or religious or political groups


Visualizing named entity

In [19]:
from spacy import displacy
doc = nlp('Over last few years USA generates $6 million revenue')
displacy.render(doc,style='ent',jupyter=True)

Visualizing dependency

In [20]:
from spacy import displacy
doc = nlp('India is a beautiful country.')
displacy.render(doc,style='dep',jupyter=True,options={'distance':110})

Stemming

In [21]:
# Spacy doesnt have a Stemmer and relies on lemmatization
# We prefer PorterStemmer or Snowball Stemmer from nltk for Stemming

Lemmatization and POS Tagging with spaCy

In [22]:
doc = nlp("He is a runner in a competition because he loves to run since he ran today")

for token in doc:
    print(f'{token.text:{12}} {token.pos_:{6}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

He           PRON   PRON   1655312771067108281    he
is           AUX    AUX    10382539506755952630   be
a            DET    DET    11901859001352538922   a
runner       NOUN   NOUN   12640964157389618806   runner
in           ADP    ADP    3002984154512732771    in
a            DET    DET    11901859001352538922   a
competition  NOUN   NOUN   4661638505416061516    competition
because      SCONJ  SCONJ  16950148841647037698   because
he           PRON   PRON   1655312771067108281    he
loves        VERB   VERB   3702023516439754181    love
to           PART   PART   3791531372978436496    to
run          VERB   VERB   12767647472892411841   run
since        SCONJ  SCONJ  10066841407251338481   since
he           PRON   PRON   1655312771067108281    he
ran          VERB   VERB   12767647472892411841   run
today        NOUN   NOUN   11042482332948150395   today


Stopwords

In [23]:
print(nlp.Defaults.stop_words)

{'done', 'why', 'thru', 'enough', 'part', 'yourself', 'themselves', "'s", 'former', 'herein', 'in', 'beyond', 'really', 'latter', 'nine', "'ve", 'about', 'becoming', 'will', 'whose', 'further', 'get', 'almost', 'hers', 'who', 'third', 'and', 'they', 'indeed', 'name', 'seem', 'such', 'others', 'does', 'latterly', 'anything', 'a', 'go', 'n‘t', 'few', 'thus', 'my', '‘d', 'between', 'along', 'anyway', 'nor', 'keep', "n't", 'else', 'must', 'myself', 'all', 'ca', 'same', 'whatever', 'but', 'fifty', 'without', 'during', 'front', 'through', 'every', 'where', 'both', 'various', 'thereafter', 'give', 'yet', '‘s', 'could', 'only', 'otherwise', 'show', 'fifteen', 'whither', 'over', 'other', 'behind', 'for', 'has', "'ll", 'against', 'either', 'hereupon', 'off', 'should', 'what', 'is', 'first', 'our', 'somewhere', 'together', 'whenever', 'itself', 'several', '’d', 'those', 'someone', 'many', 'nobody', 'ours', '‘ll', 'being', 'out', 'himself', 'or', 'mine', 'not', '’ll', 'which', 'yourselves', 'hereb

In [25]:
nlp.vocab['myself'].is_stop

True

In [26]:
# Adding a word to stopwords
nlp.Defaults.stop_words.add('btw')
nlp.Defaults.stop_words.remove('hers')

Matcher with spaCy

In [27]:
# allows us to set rules or regular expressions to match with a Doc object, and it returns a list containing the found matches
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [31]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
matcher.add('SolarPower',[pattern1, pattern2, pattern3])

In [34]:
# Applying Matcher to the doc object
document = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')
found_matches = matcher(document)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [35]:
# Finding out text for the matches
for match_id, start, end in found_matches:
    # get string representation
    string_id = nlp.vocab.strings[match_id]
    # get the matched span
    span = doc[start:end]       
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 is a
8656102463236116519 SolarPower 10 11 to
8656102463236116519 SolarPower 13 16 he ran today
