In [2]:
# Spacy doesn't have support for stemming, 
# NLTK supports both stemming and lemmatization

import nltk
import spacy

In [3]:
from nltk.stem import PorterStemmer   # NLTK also provides a second stemmer, SnowballStemmer

# Single Porter stemmer instance, reused by the stemming cells further down.
stemmer = PorterStemmer()

In [4]:
# Sample inflected forms to run through the Porter stemmer.
words = [
    'eating', 'eats', 'eat', 'ate',
    'adjustable', 'rafting', 'ability', 'meeting',
]

# Print every word next to the stem produced for it.
for word in words:
    print(word, stemmer.stem(word), sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


In [5]:
# Load the small English pipeline; `nlp` is reused by the later cells.
nlp = spacy.load('en_core_web_sm')

doc = nlp('eating eats eat ate adjustable rafting ability meeting better')

# token.lemma_ is the lemma as text; token.lemma (no underscore) is the
# integer hash of that lemma.
for token in doc:
    print(token, token.lemma_, token.lemma, sep='  |  ')

eating  |  eat  |  9837207709914848172
eats  |  eat  |  9837207709914848172
eat  |  eat  |  9837207709914848172
ate  |  eat  |  9837207709914848172
adjustable  |  adjustable  |  6033511944150694480
rafting  |  raft  |  7154368781129989833
ability  |  ability  |  11565809527369121409
meeting  |  meeting  |  14798207169164081740
better  |  well  |  4525988469032889948


In [6]:
# Lemmatize a full sentence that contains several inflections of "talk".
doc = nlp("Mando talked for 3 hours although talking isn't his thing, he became talkative")

# One line per token: surface form, then lemma.
for token in doc:
    print(token, token.lemma_, sep='  |  ')

Mando  |  Mando
talked  |  talk
for  |  for
3  |  3
hours  |  hour
although  |  although
talking  |  talk
is  |  be
n't  |  not
his  |  his
thing  |  thing
,  |  ,
he  |  he
became  |  become
talkative  |  talkative


In [7]:
nlp.pipe_names   # component names of the loaded pipeline; attribute_ruler lets us set or override token attributes (e.g. the lemma) through match rules

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [8]:
# Sentence with slang ("Bro", "Brah") to see which lemmas the pipeline assigns.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

# One line per token: surface form, then lemma.
for token in doc:
    print(token, token.lemma_, sep='  |  ')

Bro  |  bro
,  |  ,
you  |  you
wanna  |  wanna
go  |  go
?  |  ?
Brah  |  Brah
,  |  ,
do  |  do
n't  |  not
say  |  say
no  |  no
!  |  !
I  |  I
am  |  be
exhausted  |  exhaust


In [9]:
# Inspect the first token of the sentence parsed above.
doc[0]

Bro

In [10]:
# Lemma (string form) of the first token.
doc[0].lemma_

'bro'

In [11]:
# Fetch the pipeline's attribute ruler so a custom lemma rule can be registered.
ar = nlp.get_pipe('attribute_ruler')

# Tokens whose exact text is "Bro" or "Brah" should get the lemma "Brother".
slang_patterns = [[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]]
slang_attrs = {"LEMMA": "Brother"}
ar.add(slang_patterns, slang_attrs)

# Parse the same sentence again so the new rule is applied.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

for token in doc:
    print(token, token.lemma_, sep='  |  ')

Bro  |  Brother
,  |  ,
you  |  you
wanna  |  wanna
go  |  go
?  |  ?
Brah  |  Brother
,  |  ,
do  |  do
n't  |  not
say  |  say
no  |  no
!  |  !
I  |  I
am  |  be
exhausted  |  exhaust


In [12]:
# First token of the re-parsed sentence.
doc[0]

Bro

In [13]:
# Lemma (string form) of the first token after the custom attribute rule was added.
doc[0].lemma_

'Brother'

In [14]:
# Stemming with NLTK
lst_words = [
    'running', 'painting', 'walking', 'dressing', 'likely',
    'children', 'whom', 'good', 'ate', 'fishing',
]

for word in lst_words:
    print(word, stemmer.stem(word), sep="  |  ")

# The stemmer leaves the irregular past form 'ate' unchanged.

running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  like
children  |  children
whom  |  whom
good  |  good
ate  |  ate
fishing  |  fish


In [16]:
# Lemmatization with spaCy
# NOTE(review): this sentence contains 'who', whereas the stemming word list used 'whom'.
doc = nlp("running painting walking dressing likely children who good ate fishing")

for token in doc:
    print(token, token.lemma_, sep='  |  ')

# The irregular past form 'ate' is mapped to its base form, but 'fishing' is left unchanged.

running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  likely
children  |  child
who  |  who
good  |  good
ate  |  eat
fishing  |  fishing


#### Observations

#### Words that are different in stemming and lemmatization are:

likely
children
ate
fishing

(Note: 'painting' is reduced to 'paint' by both methods in the outputs above, so it is not actually a difference.)

Because stemming obtains the base form by stripping suffixes (-ing, -ly, etc.), it reduces 'likely' to 'like' and 'fishing' to 'fish', whereas spaCy's lemmatizer leaves both of these words unchanged here.

Because lemmatization relies on dictionary knowledge of the language rather than on suffix stripping, irregular forms such as 'children' and 'ate' are correctly mapped to 'child' and 'eat', whereas the stemmer leaves them unchanged.

In [19]:
# Sample paragraph used to compare NLTK stemming with spaCy lemmatization.
# NOTE(review): several sentences have no space after the full stop
# ("girl.She", "playing.She", "too.Besides"). In the NLTK output below each of
# these stays a single token (e.g. 'girl.sh'), while the spaCy output splits them.
# The line breaks inside the literal also show up as '\n' tokens in the spaCy output.
text = """Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a 
habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.
"""

In [30]:
# Stemming with NLTK

# Step 1: split the raw text into word and punctuation tokens.
all_word_tokenize = nltk.word_tokenize(text)

# Step 2: reduce every token to its stem with the Porter stemmer.
# (The previous bare `base_form` expression in the middle of the cell was a
# no-op: only the last expression of a cell is displayed, so it was removed.)
base_form = [stemmer.stem(token) for token in all_word_tokenize]

# Step 3: join the stems back into one space-separated string and display it.
complete_string = ' '.join(base_form)
complete_string

'latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .'

In [33]:
# Lemmatization with spaCy

# Step 1: run the spaCy pipeline over the text to obtain a Doc.
doc = nlp(text)

# Step 2: collect the lemma string (`lemma_`) of every token.
base_word = [token.lemma_ for token in doc]

# Step 3: join the lemmas back into one space-separated string and display it.
complete_sentence = ' '.join(base_word)
complete_sentence

'Latha be very multi talented girl . she be good at many skill like dancing , running , singing , play . she also like eat Pav Bhagi . she have a \n habit of fishing and swim too . besides all this , she be a wonderful at cook too . \n'