In [1]:
## Stemming in NLP
# Stemming is the process of reducing a word to its base or root form.
# For example, the words "running", "runner", and "ran" can all be reduced to the root word "run".
# This is useful in natural language processing (NLP) because it allows us to treat different forms of a word as the same word.

In [None]:
words = ["running", "runner", "ran", "easily", "fairly", 
"cats", "cacti", "eating", "eats", "eaten", "ate", "eater", "eaters", "eating", "eats"
"finally", "finally", "finalize", "finalizes", "finalizing", 
"finalized", "finalizer", "finalizers"]


In [3]:
## Porter Stemmer
# The Porter Stemmer is a widely used algorithm for stemming words in English.
# It works by applying a set of rules to the word to reduce it to its root form.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
for word in words:
    print(f"{word} -> {stemmer.stem(word)}")
print("\n")

running -> run
runner -> runner
ran -> ran
easily -> easili
fairly -> fairli
cats -> cat
cacti -> cacti
eating -> eat
eats -> eat
eaten -> eaten
ate -> ate
eater -> eater
eaters -> eater
eating -> eat
eatsfinally -> eatsfin
finally -> final
finalize -> final
finalizes -> final
finalizing -> final
finalized -> final
finalizer -> final
finalizers -> final




In [4]:
stemmer.stem("congratulations")

'congratul'

In [5]:
stemmer.stem("sitting")

'sit'

In [None]:
## RegexStemmer
# The RegexStemmer is a more flexible stemming algorithm that uses regular expressions to match patterns in words.
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer("ing$|s$|e$|able$|es$|ed$|$ble", min=4)
# The min parameter specifies the minimum length of the word to be stemmed.
# The regex pattern specifies the suffixes to be removed from the word.
for word in words:
    print(f"{word} -> {stemmer.stem(word)}")
print("\n")

running -> runn
runner -> runner
ran -> ran
easily -> easily
fairly -> fairly
cats -> cat
cacti -> cacti
eating -> eat
eats -> eat
eaten -> eaten
ate -> ate
eater -> eater
eaters -> eater
eating -> eat
eatsfinally -> eatsfinally
finally -> finally
finalize -> finaliz
finalizes -> finaliz
finalizing -> finaliz
finalized -> finaliz
finalizer -> finalizer
finalizers -> finalizer




In [9]:
stemmer.stem("edible")

'edibl'