In [4]:
# Import the toolkit and the full Porter Stemmer library
import nltk

from nltk.stem.porter import PorterStemmer

In [2]:
p_stemmer = PorterStemmer()

In [10]:
words = ['run','runner','running','ran','runs','easily','fairly','fairness']

In [11]:
# Show each word next to the stem produced by the Porter stemmer.
for word in words:
    print(f'{word} --> {p_stemmer.stem(word)}')
    

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli
fairness --> fair


**Snowball Stemmer (an improved version of the Porter Stemmer)**

In [7]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

In [12]:
words = ['run','runner','running','ran','runs','easily','fairly','fairness']
# words = ['generous','generation','generously','generate']

In [13]:
# Show each word next to the stem produced by the Snowball stemmer.
for word in words:
    print(word, '-->', s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair
fairness --> fair


In [15]:
words=['generous','generation','generously','generate']

In [16]:
# Same report as before, now for the current contents of `words`.
for word in words:
    print(' --> '.join((word, s_stemmer.stem(word))))

generous --> generous
generation --> generat
generously --> generous
generate --> generat


**Lemmatization**

In [17]:
# Perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')

In [18]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

# One row per token: text, POS tag, lemma hash and lemma string,
# separated by a tab padded with a space on each side.
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [19]:
def show_lemmas(text):
    """Print an aligned table with one row per token of ``text``.

    Each row holds four columns: the token text (padded to 12 characters),
    its ``pos_`` tag (padded to 6), its ``lemma`` value (left-aligned,
    padded to 22) and its ``lemma_`` string (unpadded).

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` attributes (in this notebook, the
            object returned by ``nlp(...)``).

    Returns:
        None. The table is written to standard output.
    """
    # Column layout shared by every row; widths are fixed so columns line up.
    row_template = '{:12} {:6} {:<22} {}'
    for token in text:
        print(row_template.format(token.text, token.pos_, token.lemma, token.lemma_))

In [20]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [21]:
doc3 = nlp(u"I am meeting him tomorrow at the meeting.")

show_lemmas(doc3)

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   561228191312463089     -PRON-
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


In [22]:
doc4 = nlp(u"That's an enormous automobile")

show_lemmas(doc4)

That         DET    4380130941430378203    that
's           AUX    10382539506755952630   be
an           DET    15099054000809333061   an
enormous     ADJ    17917224542039855524   enormous
automobile   NOUN   7211811266693931283    automobile


**Stop-words**

In [1]:
# Perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# Print the set of spaCy's default stop words (remember that sets are unordered):
print(nlp.Defaults.stop_words)

{'hereafter', 'thru', 'mine', 'either', 'how', 'some', '‘d', 'these', 'sometimes', 'anything', 'seemed', '‘m', 'forty', 'seems', 'whereupon', 'their', 'been', "'s", 'whatever', 'from', 'fifty', 'empty', 'nine', 'whither', 'whose', 'whereas', 'became', 'were', 'hereupon', 'now', 'front', 'meanwhile', 'very', 'moreover', 'none', 'over', 'see', 'this', 'at', 'we', 'anyone', 'just', 'that', 'more', 'anyhow', 'call', '’d', 'it', 'beyond', 'no', 'has', 'about', 'by', 'so', 'among', 'sixty', 'yet', 'everything', 'do', 'and', 'with', '‘re', 'rather', 'namely', 'does', 'third', '’m', 'several', 'few', 'whereafter', 'last', 'whole', 'yours', "'ll", 'during', '‘ve', "'m", 'me', 'is', 'everyone', 'of', 'both', 'serious', 'however', 'else', 'whence', 'themselves', 'any', 'be', 'somehow', 'per', 'twelve', 'something', '‘s', 'make', 'herein', 'part', 'take', "'ve", 'done', 'although', 'yourselves', 'nothing', 'latter', 'along', 'indeed', 'to', 'will', 'also', 'between', 'say', 'thereupon', '’ll', 'to

**To check whether a word is a stop word**

In [3]:
nlp.vocab['myself'].is_stop

True

In [4]:
nlp.vocab['mystery'].is_stop

False

**To add a stop word**

In [5]:
# Registering a stop word takes two steps. Use lowercase!
# 1) Flag the vocabulary entry (lexeme) itself as a stop word.
nlp.vocab['btw'].is_stop = True

# 2) Record the word in the default stop-word set as well.
nlp.Defaults.stop_words.add('btw')

In [6]:
len(nlp.Defaults.stop_words)

327

In [7]:
nlp.vocab['btw'].is_stop

True

**To remove a stop word**

In [8]:
# Remove the word from the set of stop words.
# `discard` is used instead of `remove` so that re-running this cell in the
# same kernel (when 'beyond' is already gone) does not raise KeyError.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [9]:
len(nlp.Defaults.stop_words)

326

In [10]:
nlp.vocab['beyond'].is_stop

False

**Vocabulary and Matching**

In [11]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [12]:
# Import the Matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [13]:
# Three token patterns for the same phrase, each keyed on lowercase token text:
pattern1 = [{'LOWER': 'solarpower'}]                                     # a single token
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]                      # two consecutive tokens
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]  # punctuation token in between

# Register all patterns under one match ID. The patterns are passed as a single
# list with no positional callback: this form is accepted from spaCy v2.2.2 on
# and is the only one supported by spaCy v3, where the legacy
# `add(name, None, *patterns)` signature was removed.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [14]:
# Sample text containing all three spellings targeted by the patterns.
# Adjacent string literals are joined by the parser into one string.
doc = nlp(
    u'The Solar Power industry continues to grow as demand '
    u'for solarpower increases. Solar-power cars are gaining popularity.'
)

In [15]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [16]:
# Report every match: numeric ID, its string name, token offsets and matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]                      # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]    # look up the name behind the hash
    print(f'{match_id} {string_id} {start} {end} {span.text}')

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power
