In [12]:
import nltk
from nltk.tokenize import WhitespaceTokenizer, TreebankWordTokenizer, TweetTokenizer, MWETokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import wordpunct_tokenize,word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Sample two-sentence passage used as tokenizer input.
text = (
    "NLTK is a powerful library for natural language processing. "
    "It provides various tokenization techniques."
)

In [3]:
# Whitespace tokenization: split on whitespace only, so punctuation stays
# attached to the neighbouring word (e.g. 'processing.').
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(text)
print("Whitespcae Tokenization: ", whitespace_tokens)

Whitespcae Tokenization:  ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing.', 'It', 'provides', 'various', 'tokenization', 'techniques.']


In [7]:
# Punctuation-based tokenization: wordpunct_tokenize splits the text on both
# whitespace and punctuation, so each run of punctuation becomes its own
# token (e.g. 'thing.s' -> 'thing', '.', 's').
punc_text = 'Hope is *the only thing.s stonger than, fear! '
punc_tokens = wordpunct_tokenize(punc_text)
print("Punctuation Tokenization: ", punc_tokens)

Punctuation Tokenization:  ['Hope', 'is', '*', 'the', 'only', 'thing', '.', 's', 'stonger', 'than', ',', 'fear', '!']


In [8]:
# Treebank tokenization: the punctuation-based tokenizer breaks contractions
# apart incorrectly (e.g. "doesn't" -> "doesn", "'", "t"). The Treebank
# tokenizer contains rules for English contractions, so "don't" is split
# into "do" + "n't". The sample also includes a malformed contraction
# ("do'nt") and a decimal number ("10.5"), both of which come through as
# single tokens.
treebank_text = "What you don't want to do to yourself, do'nt do to others! 10.5"

treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(treebank_text)
print("Treebank Tokenization: ", treebank_tokens)

Treebank Tokenization:  ['What', 'you', 'do', "n't", 'want', 'to', 'do', 'to', 'yourself', ',', "do'nt", 'do', 'to', 'others', '!', '10.5']


In [11]:
# Tweet tokenization: TweetTokenizer understands social-media text, keeping
# hashtags such as '#day' intact. Options used here:
#   preserve_case=True        -> keep the original upper/lower casing
#   reduce_len=True           -> shorten long runs of a repeated character
#                                ('HEllllllo' -> 'HElllo')
#   strip_handles=True        -> drop @username mentions ('@yuvraj')
#   match_phone_numbers=False -> do not merge digit groups into one
#                                phone-number token
tweet_text = "@yuvraj HEllllllo everyone, I wish you all a good #day 421 352 658"
tweet_tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
    strip_handles=True,
    match_phone_numbers=False,
)
tweet_tokens = tweet_tokenizer.tokenize(tweet_text)
print("Tweet Tokenization: ", tweet_tokens)

Tweet Tokenization:  ['HElllo', 'everyone', ',', 'I', 'wish', 'you', 'all', 'a', 'good', '#day', '421', '352', '658']


In [15]:
# Multi-word expression (MWE) tokenization: MWETokenizer.add_mwe() registers
# a multi-word expression before the tokenizer is applied. When the
# registered words appear consecutively in an already-tokenized text, they
# are merged into a single token joined by '_' (e.g. 'MS', 'Dhoni' ->
# 'MS_Dhoni').
mwe_text = "MS Dhoni is the king of Cricket"
MWE_tokenizer = MWETokenizer()
MWE_tokenizer.add_mwe(('MS', 'Dhoni'))
# MWETokenizer works on a list of tokens, so the sentence is split first.
# NOTE(review): word_tokenize needs NLTK's Punkt tokenizer data to be
# installed; this notebook never calls nltk.download() -- confirm the data
# is available in the target environment.
MWE_tokens = MWE_tokenizer.tokenize(word_tokenize(mwe_text))
print("MWE Tokenization: ", MWE_tokens)

MWE Tokenization:  ['MS_Dhoni', 'is', 'the', 'king', 'of', 'Cricket']


In [16]:
# Stemming with PorterStemmer: each word is reduced to a stem, which is
# lower-cased and need not be a dictionary word (e.g. 'Dancing' -> 'danc').
porter_stemmer = PorterStemmer()
porter_list = ['Singing', 'Dancing', 'University', 'Connections', 'programming']
porter_stemmer_words = list(map(porter_stemmer.stem, porter_list))
print("PorterStemmer : ", porter_stemmer_words)

PorterStemmer :  ['sing', 'danc', 'univers', 'connect', 'program']


In [17]:
# Stemming with SnowballStemmer, configured for English and run on the same
# words as the Porter example so the two sets of stems can be compared.
snowball_stemmer = SnowballStemmer("english")
snowball_list = ['Singing', 'Dancing', 'University', 'Connections', 'programming']
snowball_stemmer_words = list(map(snowball_stemmer.stem, snowball_list))
print("SnowballStemmer: ", snowball_stemmer_words)

SnowballStemmer:  ['sing', 'danc', 'univers', 'connect', 'program']


In [19]:
# Lemmatization with WordNet: every word is looked up as a verb (pos="v"),
# so verb forms are reduced ('connecting' -> 'connect') while words with no
# verb reading, such as 'connection', come back unchanged.
wordnet = WordNetLemmatizer()
example_words = [
    "programmer",
    "programming",
    "programs",
    "connection",
    "connecting",
    "connector",
]

# Two left-aligned columns, each padded to 20 characters.
row_template = "{0:20}{1:20}"
print(row_template.format("--Word--", "--Lemma--"))
for word in example_words:
    lemma = wordnet.lemmatize(word, pos="v")
    print(row_template.format(word, lemma))

--Word--            --Lemma--           
programmer          programmer          
programming         program             
programs            program             
connection          connection          
connecting          connect             
connector           connector           
