In [1]:
# Import the toolkit and the full Porter Stemmer library
import nltk

from nltk.stem.porter import *

In [2]:
p_stemmer = PorterStemmer()

In [8]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [4]:
# Show every word next to the stem the Porter stemmer produces for it
for word in words:
    print(f'{word}------>{p_stemmer.stem(word)}')

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fairli


In [5]:
from nltk.stem.snowball import SnowballStemmer

In [6]:
s_stemmer = SnowballStemmer(language='english')

In [9]:
# Show every word next to the stem the Snowball stemmer produces for it
for word in words:
    print(f'{word}------>{s_stemmer.stem(word)}')

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fair
fairness------>fair


In [10]:
words = ['generous', 'generation', 'generously', 'generate']

In [11]:
# Run the Snowball stemmer over the second word list and show each result
for word in words:
    print(f'{word}------>{s_stemmer.stem(word)}')

generous------>generous
generation------>generat
generously------>generous
generate------>generat


# Lemmatization in spaCy

In [12]:
# Perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")



In [14]:
# Print each token's text, part of speech, lemma number (token.lemma)
# and the actual lemma text (token.lemma_).
# The number points to a lemma in the library.

for token in doc1:
    # A single separator yields the same "value <tab> value" layout
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [15]:
# Align the printout in fixed-width columns
# and wrap it in a reusable function

def show_lemmas(text):
    """Print one aligned row per token: text, part of speech, lemma number, lemma.

    `text` is any iterable of token-like objects exposing the attributes
    `text`, `pos_`, `lemma` and `lemma_`. Nothing is returned; the rows are
    written to standard output.
    """
    for token in text:
        row = (
            format(token.text, '12'),    # padded to 12 characters
            format(token.pos_, '6'),     # padded to 6 characters
            format(token.lemma, '<22'),  # left-aligned in 22 characters
            format(token.lemma_),        # last column, no padding
        )
        print(' '.join(row))


In [16]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop Words

In [17]:
# Print the set of spaCy's default stop words (remember that sets are unordered):
print(nlp.Defaults.stop_words)

{'six', 'part', "'s", "'re", 'all', 'but', 'except', 'being', 'to', 'seemed', 'wherever', 'thereby', 'nevertheless', 'moreover', 'he', 'became', 'sometimes', 'mostly', 'thereupon', 'while', 'quite', 'go', 'do', 're', 'amount', 'top', 'call', 'front', 'bottom', 'upon', 'if', 'of', 'without', 'get', 'last', 'move', 'within', 'whereby', 'ourselves', 'five', 'either', 'throughout', 'thru', 'just', 'during', 'our', 'thereafter', 'someone', 'together', 'yourselves', 'this', 'off', 'be', 'though', 'each', 'even', 'see', '’s', 'become', 'behind', '’re', 'had', 'meanwhile', 'and', 'well', 'must', 'beforehand', 'many', 'the', 'can', 'up', 'before', 'out', 'n‘t', 'hereafter', 'put', 'from', 'twelve', 'anyone', 'until', 'doing', 'along', 'with', 'what', 'is', 'forty', 'name', 'that', 'always', 'really', 'his', 'any', 'amongst', 'toward', 'both', 'my', 'she', 'yourself', "'m", 'how', 'through', 'would', 'further', 'yet', 'down', 'then', 'which', 'them', 'us', 'say', 'hereby', 'could', 'whole', 'its

In [18]:
len(nlp.Defaults.stop_words)

326

In [20]:
nlp.vocab['is'].is_stop

True

In [21]:
nlp.vocab['mystery'].is_stop

False

In [23]:
# Add a custom word to the stop-word set (use lowercase).
# The flag on the vocabulary entry is set explicitly as well, so that
# nlp.vocab['btw'].is_stop does not depend on whether that entry was
# created before or after the set was changed.

nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [24]:
len(nlp.Defaults.stop_words)

327

In [25]:
# added btw

nlp.vocab['btw'].is_stop

True

In [26]:
# Remove a stop word from the set.
# discard() keeps this cell safe to re-run (remove() raises KeyError once the
# word is gone), and the flag on the vocabulary entry is cleared explicitly
# so nlp.vocab['beyond'].is_stop does not keep a value from an earlier lookup.

nlp.Defaults.stop_words.discard('beyond')
nlp.vocab['beyond'].is_stop = False

In [27]:
nlp.vocab['beyond'].is_stop

False