# Tools

In [1]:
import nltk
import spacy

# Stemming

In [4]:
# Stemming here uses NLTK's PorterStemmer.
from nltk.stem import PorterStemmer # SnowballStemmer can be used instead

# Single stemmer instance; the demo loop in the next cell calls stemmer.stem() on each word.
stemmer = PorterStemmer()

In [7]:
# Try the stemmer on a few sample words and print "word | stem" for each one.
words = ["studying", "working", "eat", "ate", "ability"]

for word in words:
    stem = stemmer.stem(word)
    print(f"{word} | {stem}")

studying | studi
working | work
eat | eat
ate | ate
ability | abil


In [8]:
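# Notice that stemming sometimes fails: "ate" is not reduced to "eat",
# and "studying" / "ability" are cut down to the non-words "studi" / "abil".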

# Lemmatization

In [10]:
# Load spaCy's small English processing pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the same sample words through the pipeline as a single text.
doc = nlp("studying working eat ate ability")

# Print "token | lemma" for every token.
for token in doc:
    print(f"{token.text} | {token.lemma_}")

studying | study
working | work
eat | eat
ate | eat
ability | ability


In [11]:
# Lemmatization fixes the problem: "ate" is now mapped to "eat",
# and "studying" / "ability" are mapped to the real words "study" / "ability".

In [13]:
# `token.lemma` (without the trailing underscore) prints the lemma's integer hash
# instead of its text. "eat" and "ate" print the same hash because they share
# the lemma "eat".
for token in doc:
    print(f"{token.text} | {token.lemma}")

studying | 4251533498015236010
working | 10038440415813069799
eat | 9837207709914848172
ate | 9837207709914848172
ability | 11565809527369121409


# Custom rules 

In [14]:
# A sentence containing the slang words "bro" and "braw", to see how they get lemmatized.
doc = nlp("Hey bro ! you need to study braw even if you are not in the mood for it")

for token in doc:
    print(f"{token.text} | {token.lemma_}")

Hey | hey
bro | bro
! | !
you | you
need | need
to | to
study | study
braw | braw
even | even
if | if
you | you
are | be
not | not
in | in
the | the
mood | mood
for | for
it | it


You can see that the language model does not understand that "bro" and "braw" both mean the same thing, which is "brother".

In [15]:
# Let's customize the lemmatization rule.
# The pipeline's attribute ruler can be used for that; list the component
# names first to confirm that 'attribute_ruler' is part of this pipeline.

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [20]:
# Fetch the pipeline's attribute ruler so we can override lemmas for slang words.
ar = nlp.get_pipe("attribute_ruler")

# We want the lemma to be "Brother" for "bro" and "braw".
# Matching on LOWER (the lowercased token text) instead of TEXT makes the rule
# case-insensitive, so capitalised forms such as "Bro" or "Braw" are covered too.
slang_patterns = [[{"LOWER": "bro"}], [{"LOWER": "braw"}]]
ar.add(slang_patterns, {"LEMMA": "Brother"})

# Re-run the same sentence to check that the custom rule is applied.
doc = nlp("Hey bro ! you need to study braw even if you are not in the mood for it")

for token in doc:
    print(token, "|", token.lemma_)

Hey | hey
bro | Brother
! | !
you | you
need | need
to | to
study | study
braw | Brother
even | even
if | if
you | you
are | be
not | not
in | in
the | the
mood | mood
for | for
it | it


In [21]:
# That's it