In [1]:
# Daniel Bandala @ sep 2022
from nltk import download
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# Stemming
In linguistic morphology and information retrieval, stemming is the process of reducing inflected, or sometimes derived, words to their word stem, base or root form, generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root. Algorithms for stemming have been studied in computer science since the 1960s. Many search engines treat words with the same stem as synonyms as a kind of query expansion, a process called conflation.

The Porter stemming algorithm, or ‘Porter stemmer’, is a process for removing the more common morphological and inflectional endings from English words. Its main use is as part of the term normalisation step that is usually performed when setting up information retrieval systems. The Lancaster stemmer, used below for comparison, is a more aggressive alternative that often produces shorter stems.

In [2]:
# One instance of each stemmer, shared by the cells that follow:
# `ps` is the Porter stemmer, `lc` the Lancaster stemmer.
ps, lc = PorterStemmer(), LancasterStemmer()

In [3]:
# Sample vocabulary: inflected and derived forms built on "program".
words = [
    "program",
    "programs",
    "programmer",
    "programming",
    "programmers",
]

In [4]:
# Print every sample word next to its Porter stem, one pair per line.
for w, stem in zip(words, map(ps.stem, words)):
    print(w, " : ", stem)

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm


In [5]:
# Stem the same sample words with each stemmer so the two result groups
# can be compared line by line.
sample_words = ["cats", "trouble", "troubling", "troubled"]

print("Porter Stemmer")
for sample in sample_words:
    print(ps.stem(sample))

print("\nLancaster Stemmer")
for sample in sample_words:
    # Must use the Lancaster instance (`lc`) here; calling `ps` would just
    # repeat the Porter results under the Lancaster heading.
    print(lc.stem(sample))

Porter Stemmer
cat
troubl
troubl
troubl

Lancaster Stemmer
cat
troubl
troubl
troubl


In [6]:
# Words to run through both stemmers for a side-by-side comparison.
word_list = [
    "friend", "friendship", "friends", "friendships",
    "stabil", "destabilize", "misunderstanding",
    "railroad", "moonlight", "football",
]

# Three columns, each padded to a width of 20 characters.
row_template = "{0:20}{1:20}{2:20}"

print(row_template.format("Word", "Porter Stemmer", "lancaster Stemmer"))
for word in word_list:
    print(row_template.format(word, ps.stem(word), lc.stem(word)))

Word                Porter Stemmer      lancaster Stemmer   
friend              friend              friend              
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
stabil              stabil              stabl               
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             


In [10]:
def stemSentence(sentence, stemmer=None):
    """Return a line-per-word report pairing each word with its stem.

    The sentence is split on whitespace only, so punctuation stays attached
    to the neighbouring word (e.g. "field." is handed to the stemmer as one
    token).

    Parameters
    ----------
    sentence : str
        Text to process.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every word. When omitted, the notebook-level
        Porter stemmer ``ps`` is used, which matches the original behaviour.

    Returns
    -------
    str
        One ``"<word> - <stem>"`` entry per word, each followed by a
        newline; an empty string when the sentence contains no words.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to stem, so the stemmer is never needed.
        return ""

    # Resolve the default at call time so it follows the notebook's `ps`.
    active_stemmer = ps if stemmer is None else stemmer
    return "".join(
        token + " - " + active_stemmer.stem(token) + "\n" for token in tokens
    )

In [8]:
# Sample sentence reused by the cells below.
# Passing the whole sentence to stem() treats it as a single word: the
# displayed result is only lower-cased, with no individual word reduced,
# which is why the text has to be split into words before stemming.
sentence="We have alluded to the idea that MRI is based on the interaction of nuclear spin with an external magnetic field."
ps.stem(sentence)

'we have alluded to the idea that mri is based on the interaction of nuclear spin with an external magnetic field.'

In [23]:
# Call the NLTK downloader.
# NOTE(review): with no package identifier this appears to open the
# interactive downloader (see the "showing info ..." output), which would
# stall an unattended Restart & Run All - consider passing the specific
# package identifier that is needed. No visible cell calls word_tokenize or
# sent_tokenize, so confirm whether this download is required at all.
download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [11]:
# Build the "word - stem" report for the sample sentence and print it,
# one whitespace-separated word per line.
x=stemSentence(sentence)
print(x)

We - we
have - have
alluded - allud
to - to
the - the
idea - idea
that - that
MRI - mri
is - is
based - base
on - on
the - the
interaction - interact
of - of
nuclear - nuclear
spin - spin
with - with
an - an
external - extern
magnetic - magnet
field. - field.

