## Stemming

In [10]:
# import libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [11]:
# Sample sentence used as input for every stemmer comparison below.
# Adjacent string literals are concatenated into one single-line string.
text = (
    "Very orderly and methodical he looked, with a hand on each knee, "
    "and a loud watch ticking a sonorous sermon under his flapped newly "
    "bought waist-coat, as though it pitted its gravity and longevity "
    "against the levity and evanescence of the brisk fire."
)
print(text)

Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.


In [12]:
# Lowercase the sentence, then split it into tokens; punctuation marks
# come out as separate tokens (see the printed list).
tokens = word_tokenize(text=text.lower())
print(tokens)

['very', 'orderly', 'and', 'methodical', 'he', 'looked', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'ticking', 'a', 'sonorous', 'sermon', 'under', 'his', 'flapped', 'newly', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pitted', 'its', 'gravity', 'and', 'longevity', 'against', 'the', 'levity', 'and', 'evanescence', 'of', 'the', 'brisk', 'fire', '.']


In [5]:
# Porter stemmer: stem each token, print the stems, and show the stem count
# as the cell's result.
stemmer = PorterStemmer()
porter_stemmed = list(map(stemmer.stem, tokens))
print(porter_stemmed)
len(porter_stemmed)

['veri', 'orderli', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'hi', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']


47

In [6]:
# Snowball stemmer configured for English.
# NOTE: this rebinds `stemmer`, which held the Porter stemmer in the previous cell.
stemmer = SnowballStemmer(language="english")
snowball_stemmed = [stemmer.stem(word) for word in tokens]
print(snowball_stemmed)
len(snowball_stemmed)

['veri', 'order', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'his', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']


47

In [13]:
# Import only the class this cell uses. The previous wildcard import pulled
# every public name of nltk.stem.lancaster into the notebook namespace, which
# hides where LancasterStemmer comes from and can silently shadow other names.
from nltk.stem.lancaster import LancasterStemmer

# Lancaster stemmer: stem each token, print the stems, and show the stem count
# as the cell's result.
lancasterStemmer = LancasterStemmer()
lancaster_stemmed = [lancasterStemmer.stem(token) for token in tokens]
print(lancaster_stemmed)
len(lancaster_stemmed)

['very', 'ord', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'kne', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'son', 'sermon', 'und', 'his', 'flap', 'new', 'bought', 'waist-co', ',', 'as', 'though', 'it', 'pit', 'it', 'grav', 'and', 'longev', 'against', 'the', 'lev', 'and', 'evanesc', 'of', 'the', 'brisk', 'fir', '.']


47

In [14]:
# Side-by-side comparison table: one row per token, one column per stemmer.
# The explicit column list pins the display order.
df = pd.DataFrame(
    {
        'token': tokens,
        'porter_stemmed': porter_stemmed,
        'snowball_stemmed': snowball_stemmed,
        'lancaster_stemmed': lancaster_stemmed,
    }
).loc[:, ['token', 'porter_stemmed', 'snowball_stemmed', 'lancaster_stemmed']]

In [15]:
# Keep only the rows where at least one stemmer produced something different
# from the original token.
df[
    df[['porter_stemmed', 'snowball_stemmed', 'lancaster_stemmed']]
    .ne(df['token'], axis=0)
    .any(axis=1)
]

Unnamed: 0,token,porter_stemmed,snowball_stemmed,lancaster_stemmed
0,very,veri,veri,very
1,orderly,orderli,order,ord
3,methodical,method,method,method
5,looked,look,look,look
12,knee,knee,knee,kne
18,ticking,tick,tick,tick
20,sonorous,sonor,sonor,son
22,under,under,under,und
23,his,hi,his,his
24,flapped,flap,flap,flap
