In [1]:
! pip install nltk



# Stemming 

Stemming is a text preprocessing technique that simplifies words by stripping prefixes and suffixes, yielding base forms for effective processing and 
storage. For instance, the word “running” becomes “run” once we’ve performed stemming. However, while performing this technique, it’s important to 
note that it can result in inaccuracies and semantic loss, as we’ll get to see. Because of this, even though stemming has advantages like reducing 
vocabulary, it requires careful application.

### 1. PorterStemmer

In [6]:
# Define a word list

words=["eating","eats","eaten","writing","writes","programming","programs","history","finally","finalized"]

In [10]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

for word in words:
    print(word , '----------->' , stemmer.stem(word))

eating -----------> eat
eats -----------> eat
eaten -----------> eaten
writing -----------> write
writes -----------> write
programming -----------> program
programs -----------> program
history -----------> histori
finally -----------> final
finalized -----------> final


In [13]:
## Some issues with Stemming

print(stemmer.stem('congratulations'))



congratul
sit


### 2. RegexpStemmer

In [14]:
from nltk.stem import RegexpStemmer

# Instantiate and define
stemmer = RegexpStemmer('ing$|s$|e$|able$|' , min=4)

print(stemmer.stem('cars'))
print(stemmer.stem('mass'))
print(stemmer.stem('was'))
print(stemmer.stem('coming'))
print(stemmer.stem('agreeable'))
print(stemmer.stem('compute'))

car
mas
was
com
agree
comput


### 3. SnowballStemmer

In [4]:
from nltk.stem import SnowballStemmer

print(" ".join(SnowballStemmer.languages))

arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [19]:
# instantiate and define 

gstemmer = SnowballStemmer("german")
print(gstemmer.stem("Autobahnen"))

estemmer = SnowballStemmer("english")
print(estemmer.stem("coming"))

autobahn
come


In [20]:
for word in words:
    print(word+"---->"+estemmer.stem(word))

eating---->eat
eats---->eat
eaten---->eaten
writing---->write
writes---->write
programming---->program
programs---->program
history---->histori
finally---->final
finalized---->final


In [2]:
# Read the necessary dataset
import pandas as pd

df = pd.read_csv("C:/Users/ariji/OneDrive/Desktop/Data/reviews.csv")
df.head()

Unnamed: 0,review_id,text
0,txt145,The software had a steep learning curve at fir...
1,txt327,I'm really impressed with the user interface o...
2,txt209,The latest update to the software fixed severa...
3,txt825,I encountered a few glitches while using the s...
4,txt878,I was skeptical about trying the software init...


In [5]:
snowball_stemmer = SnowballStemmer('english')

In [6]:
df['stemmed_text'] = df['text'].apply(lambda x: ' '.join([snowball_stemmer.stem(word) for word in x.split()]))
print(df['stemmed_text'])

0     the softwar had a steep learn curv at first, b...
1     i'm realli impress with the user interfac of t...
2     the latest updat to the softwar fix sever bug ...
3     i encount a few glitch while use the software,...
4     i was skeptic about tri the softwar initially,...
5     the analyt featur have provid us with valuabl ...
6     i appreci the regular updat that the softwar r...
7     i attend a train session for the software, and...
8     the softwar document could be more comprehensi...
9     i'v recommend the softwar to colleagu due to i...
10    the softwar integr with third-parti plugin has...
11    i'm look forward to the upcom releas of the so...
12    the user communiti is activ and supportive, ma...
13    i'v been use the softwar for a while now, and ...
14    the user interfac could use some modernization...
15    i went for a run and the softwar did a good jo...
Name: stemmed_text, dtype: object


In [None]:
### 4. Comparison of multiple Stemming algorithms

In [21]:
# import and download prerequisites

import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('porter_stemmer')
nltk.download('snowball_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ariji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading porter_stemmer: Package 'porter_stemmer' not
[nltk_data]     found in index
[nltk_data] Error loading snowball_tagger: Package 'snowball_tagger'
[nltk_data]     not found in index
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ariji\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# Define a simple paragraph
paragraph = "The quick brown fox jumps over the lazy dog."

# Create a list of stemmers
stemmers = [
    nltk.PorterStemmer(),
    nltk.SnowballStemmer("english"),
]

# Tokenize the paragraph
tokens = nltk.word_tokenize(paragraph)

# Stem each token using different stemmers
stemmed_words = []
for stemmer in stemmers:
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    stemmed_words.append(stemmed_tokens)

# Print the stemmed words for each stemmer
for i, stemmed_tokens in enumerate(stemmed_words):
    print(f"Stemmed words using stemmer {i+1}: {stemmed_tokens}")

Stemmed words using stemmer 1: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']
Stemmed words using stemmer 2: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']


In [24]:
# Define a complex paragraph

paragraph = """The quick brown fox jumps over the lazy dog. The lazy dog, feeling quite put out by the fox's antics, decided to chase the fox away 
from his favorite nap spot. The fox, being the mischievous creature he was, merely laughed and continued to taunt the dog. As the sun began to set, 
the exhausted dog finally gave up the chase and retreated to his cozy den, vowing to get revenge on the pesky fox the next day."""

# Create a list of stemmers
stemmers = [
    nltk.PorterStemmer(),
    nltk.SnowballStemmer("english"),
]

# Tokenize the paragraph
tokens = nltk.word_tokenize(paragraph)

# Stem each token using different stemmers
stemmed_words = []
for stemmer in stemmers:
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    stemmed_words.append(stemmed_tokens)

# Print the stemmed words for each stemmer
for i, stemmed_tokens in enumerate(stemmed_words):
    print(f"Stemmed words using stemmer {i+1}: {stemmed_tokens}")

Stemmed words using stemmer 1: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'the', 'lazi', 'dog', ',', 'feel', 'quit', 'put', 'out', 'by', 'the', 'fox', "'s", 'antic', ',', 'decid', 'to', 'chase', 'the', 'fox', 'away', 'from', 'hi', 'favorit', 'nap', 'spot', '.', 'the', 'fox', ',', 'be', 'the', 'mischiev', 'creatur', 'he', 'wa', ',', 'mere', 'laugh', 'and', 'continu', 'to', 'taunt', 'the', 'dog', '.', 'as', 'the', 'sun', 'began', 'to', 'set', ',', 'the', 'exhaust', 'dog', 'final', 'gave', 'up', 'the', 'chase', 'and', 'retreat', 'to', 'hi', 'cozi', 'den', ',', 'vow', 'to', 'get', 'reveng', 'on', 'the', 'peski', 'fox', 'the', 'next', 'day', '.']
Stemmed words using stemmer 2: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'the', 'lazi', 'dog', ',', 'feel', 'quit', 'put', 'out', 'by', 'the', 'fox', "'s", 'antic', ',', 'decid', 'to', 'chase', 'the', 'fox', 'away', 'from', 'his', 'favorit', 'nap', 'spot', '.', 'the', 'fox', ',', 'b