In [1]:
import nltk
from nltk.stem import PorterStemmer,LancasterStemmer,SnowballStemmer,RegexpStemmer
from nltk.tokenize import word_tokenize

In [2]:
#sample text documents
documents = [
    'The cats are running and jumping in the garden.',
    'She is a beautiful runner and loves to run fast.',
    'Running help to build stamina and strength.',
    'He ran swiftyly and caught the ball'
]

In [3]:
#initialize stemmers
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer("english") # multiple language support 

#define a simple regex fro stemming (this can be customized)
regex_pattern = r'(ing|ed|es|s)$'
regex_stemmer = RegexpStemmer(regex_pattern)



In [4]:
#function to apply different semmers
def apply_stemmers(documents):
    results = {}
    
    for doc in documents:
        #tokenize the document
        tokens = word_tokenize(doc.lower())
        
        
        #apply different stemmers 
        porter_stems = [porter_stemmer.stem(token) for token in tokens]
        lancaster_stems = [lancaster_stemmer.stem(token) for token in tokens]
        snowball_stems = [snowball_stemmer.stem(token) for token in tokens]
        regex_stems = [regex_stemmer.stem(token) for token in tokens]
        
        #store results 
        results[doc] = {
            'porter' : porter_stems,
            'lancaster':lancaster_stems,
            'snowball':snowball_stems,
            'regex':regex_stems
        }
    return results



In [15]:
#apply the stemmers to the sample documents
stemmed_results = apply_stemmers(documents)

#print the result 
for original_doc, stems in stemmed_results.items():
    print(f"\nOriginal Documnets: {original_doc}" )
    for stemmer_name,stemmed_words in stems.items():
        print(f"{stemmer_name.capitalize()},stems:{stemmed_words}")


Original Documnets: The cats are running and jumping in the garden.
Porter,stems:['the', 'cat', 'are', 'run', 'and', 'jump', 'in', 'the', 'garden', '.']
Lancaster,stems:['the', 'cat', 'ar', 'run', 'and', 'jump', 'in', 'the', 'gard', '.']
Snowball,stems:['the', 'cat', 'are', 'run', 'and', 'jump', 'in', 'the', 'garden', '.']
Regex,stems:['the', 'cat', 'are', 'runn', 'and', 'jump', 'in', 'the', 'garden', '.']

Original Documnets: She is a beautiful runner and loves to run fast.
Porter,stems:['she', 'is', 'a', 'beauti', 'runner', 'and', 'love', 'to', 'run', 'fast', '.']
Lancaster,stems:['she', 'is', 'a', 'beauty', 'run', 'and', 'lov', 'to', 'run', 'fast', '.']
Snowball,stems:['she', 'is', 'a', 'beauti', 'runner', 'and', 'love', 'to', 'run', 'fast', '.']
Regex,stems:['she', 'i', 'a', 'beautiful', 'runner', 'and', 'lov', 'to', 'run', 'fast', '.']

Original Documnets: Running help to build stamina and strength.
Porter,stems:['run', 'help', 'to', 'build', 'stamina', 'and', 'strength', '.']
La

# but

In [27]:
from nltk.stem import WordNetLemmatizer

#initialze the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

#function to apply lemmatization 
def apply_lemmatization(doc):
    
    #tokenize the documents
    tokens = word_tokenize(doc)
    
    #apply lemmatization to each token 
    lemmatized_tokens = [lemmatizer.lemmatize(token,pos='v') for token in tokens]
    
    return {
        "original" : tokens,
        "lemmatized":lemmatized_tokens
    }
    
    
#process each document and print result 
for doc in documents:
    results = apply_lemmatization(doc)
    print(f"Original Document : {results['original']}")
    print(f"Lemmatized:{results['lemmatized']}")
    print("="*40)
    
          
    
    
    
    
    
    
    

Original Document : ['The', 'cats', 'are', 'running', 'and', 'jumping', 'in', 'the', 'garden', '.']
Lemmatized:['The', 'cat', 'be', 'run', 'and', 'jump', 'in', 'the', 'garden', '.']
Original Document : ['She', 'is', 'a', 'beautiful', 'runner', 'and', 'loves', 'to', 'run', 'fast', '.']
Lemmatized:['She', 'be', 'a', 'beautiful', 'runner', 'and', 'love', 'to', 'run', 'fast', '.']
Original Document : ['Running', 'help', 'to', 'build', 'stamina', 'and', 'strength', '.']
Lemmatized:['Running', 'help', 'to', 'build', 'stamina', 'and', 'strength', '.']
Original Document : ['He', 'ran', 'swiftyly', 'and', 'caught', 'the', 'ball']
Lemmatized:['He', 'run', 'swiftyly', 'and', 'catch', 'the', 'ball']
