In [59]:
import nltk
from nltk.tokenize import TweetTokenizer,MWETokenizer,RegexpTokenizer,WhitespaceTokenizer,WordPunctTokenizer
from nltk.stem import RegexpStemmer,PorterStemmer

# 15. Tokenizing text using various Tokenizers

In [2]:
# Sample tweet-like text used by every tokenizer demo below. It deliberately mixes
# punctuation, @mentions, #hashtags, an e-mail address and an emoticon.
sentence = (
    'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, '
    'New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official '
    '@indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'
)
print(sentence)

Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official @indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"


### Using Tweet tokenizer

In [4]:
# NLTK's TweetTokenizer, constructed with its default options.
tweet_tokenizer = TweetTokenizer()

In [6]:
# Tokenize the sample text and report the token count followed by the tokens.
# In the recorded output, mentions ('@india_official'), hashtags ('#India'), the
# e-mail address and the ':)' emoticon each remain a single token.
tweet_tokens = tweet_tokenizer.tokenize(sentence)
print('Number of tokens ',len(tweet_tokens))
print(tweet_tokens)

Number of tokens  38
['Sunil', 'tweeted', ',', '"', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', ',', 'New', 'Delhi', '.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', '!', 'Awesome', 'airshow', '!', '@india_official', '@indian_army', '#India', '#70thRepublic_Day', '.', 'For', 'more', 'photos', 'ping', 'me', 'sunil@photoking.com', ':)', '"']


### Using MWE tokenizer

In [28]:
# Multi-word-expression tokenizer: each tuple lists consecutive tokens that should be
# merged into one token (the recorded output joins them with '_', e.g. 'Republic_Day').
mwe = MWETokenizer([('Republic', 'Day'),('Indian','Army')])

In [29]:
# The tokenizer is fed a list of whitespace-split words, so punctuation stays glued
# to its word: the text contains 'Army!' rather than 'Army', and the recorded output
# shows ('Indian', 'Army') is not merged on this pass.
mwe_tokens = mwe.tokenize(sentence.split())
print('Number of tokens ',len(mwe_tokens))
print(mwe_tokens)

Number of tokens  29
['Sunil', 'tweeted,', '"Witnessing', '70th', 'Republic_Day', 'of', 'India', 'from', 'Rajpath,', 'New', 'Delhi.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army!', 'Awesome', 'airshow!', '@india_official', '@indian_army', '#India', '#70thRepublic_Day.', 'For', 'more', 'photos', 'ping', 'me', 'sunil@photoking.com', ':)"']


We can see in the above output that Republic Day is a single token whereas Indian Army is not. That's because in the sentence it appears as "Army!", i.e. there is an unwanted "!". Let's remove it and repeat the process.

In [30]:
# Second pass: drop every '!' before splitting so that 'Army!' becomes 'Army' and the
# ('Indian', 'Army') expression can match. Rebinds mwe_tokens from the previous cell.
mwe_tokens = mwe.tokenize(sentence.replace("!",'').split())
print('Number of tokens ',len(mwe_tokens))
print(mwe_tokens)

Number of tokens  28
['Sunil', 'tweeted,', '"Witnessing', '70th', 'Republic_Day', 'of', 'India', 'from', 'Rajpath,', 'New', 'Delhi.', 'Mesmerizing', 'performance', 'by', 'Indian_Army', 'Awesome', 'airshow', '@india_official', '@indian_army', '#India', '#70thRepublic_Day.', 'For', 'more', 'photos', 'ping', 'me', 'sunil@photoking.com', ':)"']


Now, after removing '!', we have Indian_Army as a single token.

### Using Regular expression tokenizer

In [34]:
# Regex tokenizer. The alternatives are tried left to right:
#   \w+        a run of word characters,
#   \$[\d\.]+  a dollar sign followed by digits/dots (e.g. a price),
#   \S+        any other run of non-whitespace characters.
# The pattern is written as a raw string so the backslashes reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences, which
# newer Python versions warn about.
reg_exp = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_exp_tokens = reg_exp.tokenize(sentence)
print('Number of Tokens are -',len(reg_exp_tokens))
print(reg_exp_tokens)

Number of Tokens are - 36
['Sunil', 'tweeted', ',', '"Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', ',', 'New', 'Delhi', '.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', '!', 'Awesome', 'airshow', '!', '@india_official', '@indian_army', '#India', '#70thRepublic_Day.', 'For', 'more', 'photos', 'ping', 'me', 'sunil', '@photoking.com', ':)"']


### Using Whitespace tokenizer

In [38]:
# NLTK's WhitespaceTokenizer, constructed with no arguments.
ws_tokenizer = WhitespaceTokenizer()

In [40]:
# Tokenize the sample text and report the token count followed by the tokens.
# In the recorded output, punctuation remains attached to the neighbouring word
# (e.g. 'tweeted,', 'Army!', '#70thRepublic_Day.').
ws_tokens = ws_tokenizer.tokenize(sentence)
print('Number of tokens are ',len(ws_tokens))
print(ws_tokens)

Number of tokens are  30
['Sunil', 'tweeted,', '"Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath,', 'New', 'Delhi.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army!', 'Awesome', 'airshow!', '@india_official', '@indian_army', '#India', '#70thRepublic_Day.', 'For', 'more', 'photos', 'ping', 'me', 'sunil@photoking.com', ':)"']


### Using WordPunct tokenizer

In [42]:
# NOTE: despite the variable name, this is a WordPunctTokenizer (word / punctuation
# splitting), not NLTK's Punkt sentence tokenizer.
punkt_tokenizer = WordPunctTokenizer()

In [43]:
# Tokenize the sample text and report the token count followed by the tokens.
# In the recorded output, punctuation is split away from words: '@india_official'
# becomes '@', 'india_official' and the e-mail address is broken into
# 'sunil', '@', 'photoking', '.', 'com', while the trailing ':)"' stays together.
punkt_tokens = punkt_tokenizer.tokenize(sentence)
print('Number of tokens ',len(punkt_tokens))
print(punkt_tokens)

Number of tokens  45
['Sunil', 'tweeted', ',', '"', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', ',', 'New', 'Delhi', '.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', '!', 'Awesome', 'airshow', '!', '@', 'india_official', '@', 'indian_army', '#', 'India', '#', '70thRepublic_Day', '.', 'For', 'more', 'photos', 'ping', 'me', 'sunil', '@', 'photoking', '.', 'com', ':)"']


# 16. Stemming using Regular expression

In [46]:
# Sample sentence for the regex-stemmer demo; rebinds `sentence` from the tokenizer section.
sentence = 'I love playing football and cooking meals'
print(sentence)

I love playing football and cooking meals


In [51]:
# Regex-based stemmer that strips a trailing "ing" (pattern 'ing$').
# NOTE(review): `min` is presumably the minimum word length eligible for stemming —
# confirm against the NLTK RegexpStemmer documentation.
reg_exp_stemmer = RegexpStemmer('ing$',min = 4)

In [56]:
# Stem each whitespace-separated word and glue the results back into one string.
" ".join(map(reg_exp_stemmer.stem, sentence.split()))

'I love play football and cook meals'

# 17. Porter Stemmer

In [58]:
# Sample sentence for the Porter stemmer demo; rebinds `sentence` once more.
sentence = "Before eating, it would be nice to sanitize your hands with a sanitizer"
# It's November 2020, and we are using sanitizers for much more than what is mentioned here.
print(sentence)

Before eating, it would be nice to sanitize your hands with a sanitizer


In [61]:
# Porter stemming applied to whitespace-split words. As the recorded output shows,
# stems come back lower-cased ('Before' -> 'befor'), and 'eating,' is returned
# unchanged — presumably because the comma is still attached to the word.
porter_stemmer = PorterStemmer()
" ".join(porter_stemmer.stem(token) for token in sentence.split())

'befor eating, it would be nice to sanit your hand with a sanit'