In [15]:
# Stemming

In [1]:
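# Porter's algorithm reduces a word in five phases; two example rule sets are shown below.

# 1. Simple suffix rules
#    SSES --> SS; IES --> I; SS --> SS; S --> '' (suffix dropped)
# 2. Rules that depend on the length / complexity of the remaining stem
#    ATIONAL --> ATE; EED --> EE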

In [2]:
# The Snowball stemmer is an improved version of the original Porter stemmer

In [3]:
import nltk

In [4]:
from nltk.stem.porter import PorterStemmer

In [5]:
# Create the Porter stemmer instance used by the stemming loop below.
p_stemmer = PorterStemmer()

In [7]:
# Sample words to run through the stemmers in the following cells.
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly']

In [9]:
# Print every sample word next to the stem produced by the Porter stemmer.
for word in words:
    print(f'{word} ---> {p_stemmer.stem(word)}')

run ---> run
runner ---> runner
ran ---> ran
runs ---> run
easily ---> easili
fairly ---> fairli


In [10]:
from nltk.stem.snowball import SnowballStemmer

In [11]:
# Create a Snowball stemmer configured for English.
s_stemmer = SnowballStemmer(language='english')

In [12]:
# Print every sample word next to the stem produced by the Snowball stemmer.
for word in words:
    print(f'{word} ---> {s_stemmer.stem(word)}')

run ---> run
runner ---> runner
ran ---> ran
runs ---> run
easily ---> easili
fairly ---> fair


In [13]:
# Rebind `words` to a second sample list for the next stemming loop.
# NOTE: this overwrites the earlier `words` list, so re-running an earlier
# loop cell after this one will iterate over these words instead.
words = ['generous', 'generation', 'generously', 'generate']

In [14]:
# Stem the second sample list with the Snowball stemmer and print each pair.
for word in words:
    print(f'{word} ---> {s_stemmer.stem(word)}')

generous ---> generous
generation ---> generat
generously ---> generous
generate ---> generat


In [17]:
# Lemmatization
# Unlike stemming, lemmatization looks at the surrounding text to determine a
# given word's part of speech before reducing the word to its lemma.

In [18]:
import spacy

In [20]:
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is used below to
# process text and to inspect the default stop words and vocabulary.
nlp = spacy.load('en_core_web_sm')

In [21]:
# Process a sentence containing several forms of "run" so their lemmas can be compared.
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

In [24]:
# For each token print its text, POS tag, numeric lemma ID and lemma string,
# separated by ' \t ' (the same layout as passing '\t' between the values).
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [25]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, lemma ID, lemma string.

    Args:
        text: Iterable of token-like objects exposing the attributes
            ``text``, ``pos_``, ``lemma`` and ``lemma_`` (in this notebook,
            the documents returned by ``nlp``).

    Returns:
        None. Rows are written to standard output.
    """
    # Column widths: text -> 12, POS tag -> 6, lemma ID -> 22 (left-aligned);
    # the lemma string is printed last without padding.
    row_template = '{:12}{:6}{:<22}{}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [26]:
# Same token information as the loop above, printed in aligned columns.
show_lemmas(doc1)

I           PRON  4690420944186131903   I
am          AUX   10382539506755952630  be
a           DET   11901859001352538922  a
runner      NOUN  12640964157389618806  runner
running     VERB  12767647472892411841  run
in          ADP   3002984154512732771   in
a           DET   11901859001352538922  a
race        NOUN  8048469955494714898   race
because     SCONJ 16950148841647037698  because
I           PRON  4690420944186131903   I
love        VERB  3702023516439754181   love
to          PART  3791531372978436496   to
run         VERB  12767647472892411841  run
since       SCONJ 10066841407251338481  since
I           PRON  4690420944186131903   I
ran         VERB  12767647472892411841  run
today       NOUN  11042482332948150395  today


In [27]:
# Process a second sentence containing irregular forms ("saw", "mice").
doc2 = nlp("I saw ten mice today!")

In [28]:
# Print the aligned token / POS / lemma table for the second sentence.
show_lemmas(doc2)

I           PRON  4690420944186131903   I
saw         VERB  11925638236994514241  see
ten         NUM   7970704286052693043   ten
mice        NOUN  1384165645700560590   mouse
today       NOUN  11042482332948150395  today
!           PUNCT 17494803046312582752  !


In [29]:
# Stop Words

In [30]:
# Print the pipeline's default stop-word collection.
print(nlp.Defaults.stop_words)

{'beside', 'nobody', 'empty', 'never', 'across', 'did', 'thereafter', 'our', 'am', 'becomes', 'whereafter', 'third', 'seem', 'whole', 'n’t', 'every', '’ll', 'many', 'name', 'some', "'s", 'because', 'amount', 'four', 'nor', 'either', 'since', 'myself', 'have', 'get', 'nevertheless', 'itself', 'done', 'been', "'re", 'formerly', 'whom', 'serious', 'their', 'they', 'whose', 'former', 'these', 'least', 'back', 'between', 'latter', 'yet', 'thereupon', 'three', 'on', 'hundred', 'else', 'which', 'out', 'upon', 'something', 'under', 'towards', '’ve', 'be', 'thru', 'elsewhere', 'too', 'even', 'often', 'thence', 'throughout', 'well', 'becoming', 'top', 'herself', 'again', 'most', 'down', 'hereby', 'give', '‘ll', 'afterwards', 'for', 'next', 'or', 'could', 'she', 'during', 'so', 're', 'amongst', 'toward', 'just', '‘ve', 'where', 'through', 'go', 'via', 'i', 'put', 'seeming', 'them', 'quite', '’m', 'cannot', 'take', 'whatever', 'his', 'around', 'none', 'see', 'perhaps', 'the', 'few', 'both', 'one',

In [31]:
# Number of default stop words before any modification in this notebook.
len(nlp.Defaults.stop_words)

326

In [34]:
# Check the stop-word flag on the vocabulary entry for 'is'.
nlp.vocab['is'].is_stop

True

In [35]:
# Check the stop-word flag on the vocabulary entry for 'mystery'.
nlp.vocab['mystery'].is_stop

False

In [36]:
# Add 'btw' to the default stop-word collection.
nlp.Defaults.stop_words.add('btw')

In [37]:
# Also mark the vocabulary entry for 'btw' as a stop word.
# NOTE(review): presumably this flag is not updated automatically when the
# word is added to the default stop-word collection — confirm.
nlp.vocab['btw'].is_stop = True

In [38]:
# Stop-word count after adding 'btw' (one more than before).
len(nlp.Defaults.stop_words)

327

In [39]:
# Drop 'beyond' from the default stop words. discard() is used instead of
# remove() so that re-running this cell does not raise KeyError once the
# word has already been removed.
nlp.Defaults.stop_words.discard('beyond')

In [40]:
# Clear the stop-word flag on the vocabulary entry for 'beyond'.
nlp.vocab['beyond'].is_stop = False

In [41]:
# Confirm that 'beyond' is no longer flagged as a stop word.
nlp.vocab['beyond'].is_stop

False

In [42]:
# Stop-word count after removing 'beyond' (back to the earlier total).
len(nlp.Defaults.stop_words)

326