In [32]:
from nltk.tokenize import (
    sent_tokenize,
    word_tokenize,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
)
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer, LancasterStemmer

# [1.0 tokenize]()


[1.1 sent_tokenize & word_tokenize]()


In [33]:
text = """Hello Mr. Hany, how are you doing today? The weater is great and Python is awesome.
        The sky is pinhish-blue. You should not eat carboard."""

# sent_tokenize: split the text by sentence


print("Spliting Text by sentence: \n")
print("\n".join(sent_tokenize(text=text, language="english")), "\n")

# word_tokenize: split the text by Words
print("Spliting Text by Words: \n")
print(" - ".join(word_tokenize(text=text, language="english", preserve_line=True)))

Spliting Text by sentence: 

Hello Mr. Hany, how are you doing today?
The weater is great and Python is awesome.
The sky is pinhish-blue.
You should not eat carboard. 

Spliting Text by Words: 

Hello - Mr. - Hany - , - how - are - you - doing - today - ? - The - weater - is - great - and - Python - is - awesome. - The - sky - is - pinhish-blue. - You - should - not - eat - carboard - .


[1.2 RegexpTokenizer]()


In [34]:
text = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

# RegexpTokenizer: used like regular expressions, which takes a letter to split based on them

took = RegexpTokenizer(r"\w+|\$[\d\.]+|\S+[.]+")

print(took.tokenize(text))

['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']


[1.3 WhitespaceTokenizer and WordPunctTokenizer]()


In [35]:
# WordPunctTokenizer: Tokenize a text into a sequence of alphabetic and non-alphabetic characters, using the regexp \w+|[^\w\s]+.
took = WordPunctTokenizer()

print(took.tokenize(text))


# WhitespaceTokenizer : Tokenize a string on whitespace (space, tab, newline). In general, users should use the string split() method instead.
took = WhitespaceTokenizer()

print(took.tokenize(text))

['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']


# [2.0 Stop]()


[2.1 stop words]()


In [36]:
stop_words = set(stopwords.words(fileids="english"))
# Stop words are a set of commonly used words in a language, Stop words are commonly used NLP, in which they are removed from the sentance
# filter the words based on the stop words in the language
text = "This is an example showing off stop words filtration."

words = word_tokenize(text)

filtered_words = [word for word in words if not word in stop_words]


filtered_words

['This', 'example', 'showing', 'stop', 'words', 'filtration', '.']

In [37]:
stop_words = set(stopwords.words(fileids="english"))
# Stop words are a set of commonly used words in a language, Stop words are commonly used NLP, in which they are removed from the sentance
# filter the words based on the stop words in the language
text = "This is an example showing off stop words filtration."

words = word_tokenize(text)

filtered_words = [word for word in words if not word in stop_words]

print("Before filtering : ", words, "\n")
print("after filtering : ", filtered_words, "\n")

Before filtering :  ['This', 'is', 'an', 'example', 'showing', 'off', 'stop', 'words', 'filtration', '.'] 

after filtering :  ['This', 'example', 'showing', 'stop', 'words', 'filtration', '.'] 



# [3.0 stem]()

Interfaces used to remove morphological affixes from words, leaving only the word stem. Stemming algorithms aim to remove those affixes required for eg. grammatical role, tense, derivational morphology leaving only the stem of the word


[3.1 porter]()

An algorithm for suffix stripping


In [38]:
ps = PorterStemmer()

some_words = [
    "maximum",
    "presumably",
    "multiply",
    "provision",
    "owed",
    "saying",
    "meant",
    "cement",
]


for w in some_words:
    print(ps.stem(w))

print()

text = """
    It is very imporatant to be pythonly while you are pythoning with python.
    All pythoners have pythoned poorly at least ones.
"""

words = word_tokenize(text)
print("words before : ", words, "\n")


filtered_words = set()

for w in words:
    filtered_words.add(ps.stem(w))

print("words after : ", filtered_words, "\n")

maximum
presum
multipli
provis
owe
say
meant
cement

words before :  ['It', 'is', 'very', 'imporatant', 'to', 'be', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'ones', '.'] 

words after :  {'.', 'to', 'pythonli', 'veri', 'poorli', 'it', 'least', 'with', 'imporat', 'are', 'is', 'one', 'be', 'python', 'while', 'have', 'all', 'at', 'you'} 



[3.2 LancasterStemmer]()

The Lancaster Stemmer is an algorithm for stemming words in the English language, It aims to efficiently remove prefixes and suffixes from words to find their root form.


In [39]:
ps = LancasterStemmer(strip_prefix_flag=True)

some_words = [
    "maximum",
    "presumably",
    "multiply",
    "provision",
    "owed",
    "saying",
    "meant",
    "cement",
    "kilometer",
]
for w in some_words:
    print(ps.stem(w))
print()

text = """It is very imporatant to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least ones."""
words = word_tokenize(text)

print("words before : ", words, "\n")

filtered_words = set()

for w in words:
    filtered_words.add(ps.stem(w))

print("words after : ", filtered_words, "\n")

maxim
presum
multiply
provid
ow
say
meant
cem
met

words before :  ['It', 'is', 'very', 'imporatant', 'to', 'be', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'ones', '.'] 

words after :  {'hav', '.', 'at', 'you', 'al', 'to', 'poor', 'least', 'python', 'very', 'ar', 'on', 'be', 'whil', 'imp', 'with', 'is', 'it'} 

