In [37]:
import re
from nltk import word_tokenize
from nltk import ngrams
from textblob import TextBlob
from keras.preprocessing.text import text_to_word_sequence

# 12. Text Cleaning and Tokenization

In [3]:
# Sample tweet used for the cleaning/tokenization demo. It deliberately mixes
# quotes, punctuation, @mentions, #hashtags, an e-mail address and an emoticon.
# Adjacent string literals are concatenated, so the value is one single line.
sentence = (
    'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, '
    'New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official '
    '@indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'
)
print(sentence)

Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official @indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"


In [9]:
# Replace every run of "noise" characters with a single space. The pattern
# matches anything that is neither whitespace nor a word character, plus '_'
# explicitly (since \w would otherwise keep underscores).
re.compile(r'([^\s\w]|_)+').sub(' ', sentence)

'Sunil tweeted   Witnessing 70th Republic Day of India from Rajpath  New Delhi  Mesmerizing performance by Indian Army  Awesome airshow   india official  indian army  India  70thRepublic Day  For more photos ping me sunil photoking com  '

Using the above regular expression, we have substituted every run of characters that are neither word characters nor whitespace (or that are underscores) with a single space.

In [11]:
# Apply the same cleaning regex, then split on whitespace to obtain the tokens.
print(re.compile(r'([^\s\w]|_)+').sub(' ', sentence).split())

['Sunil', 'tweeted', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', 'New', 'Delhi', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', 'Awesome', 'airshow', 'india', 'official', 'indian', 'army', 'India', '70thRepublic', 'Day', 'For', 'more', 'photos', 'ping', 'me', 'sunil', 'photoking', 'com']


# 13. Extracting N-grams

In [16]:
# Short example sentence (nine words) used by all the n-gram cells in this section.
sentence = "The cute little boy is playing with the kitten"

In [17]:
def n_gram_extractor(sentence: str, n: int) -> None:
    """Print every n-gram of ``sentence``, one per line, as a list of tokens.

    The sentence is first cleaned by replacing each run of characters that are
    neither whitespace nor word characters (or that are underscores) with a
    space, and is then split on whitespace to obtain the tokens.

    Args:
        sentence: Text to extract n-grams from.
        n: Number of consecutive tokens in each n-gram; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 prints a series of empty lists and a
        # negative n prints arbitrary slices; neither is a valid n-gram.
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    tokens = re.sub(r'([^\s\w]|_)+', ' ', sentence).split()

    # When n exceeds the number of tokens the range is empty: nothing is printed.
    for start in range(len(tokens) - n + 1):
        print(tokens[start:start + n])

In [18]:
# Bigrams: print every pair of consecutive tokens.
n_gram_extractor(sentence, n=2)

['The', 'cute']
['cute', 'little']
['little', 'boy']
['boy', 'is']
['is', 'playing']
['playing', 'with']
['with', 'the']
['the', 'kitten']


In [19]:
# Trigrams: print every run of three consecutive tokens.
n_gram_extractor(sentence, n=3)

['The', 'cute', 'little']
['cute', 'little', 'boy']
['little', 'boy', 'is']
['boy', 'is', 'playing']
['is', 'playing', 'with']
['playing', 'with', 'the']
['with', 'the', 'kitten']


### Using nltk library

In [25]:
# Tokenize with NLTK; `tokens` is reused by the trigram cell that follows.
tokens = word_tokenize(sentence)
# bigram: list() materialises the n-grams so the notebook displays them as tuples
list(ngrams(tokens,2))

[('The', 'cute'),
 ('cute', 'little'),
 ('little', 'boy'),
 ('boy', 'is'),
 ('is', 'playing'),
 ('playing', 'with'),
 ('with', 'the'),
 ('the', 'kitten')]

In [26]:
# trigram: same `tokens` as the bigram cell, with a window of three
list(ngrams(tokens,3))

[('The', 'cute', 'little'),
 ('cute', 'little', 'boy'),
 ('little', 'boy', 'is'),
 ('boy', 'is', 'playing'),
 ('is', 'playing', 'with'),
 ('playing', 'with', 'the'),
 ('with', 'the', 'kitten')]

### Using textblob

In [29]:
# Wrap the example sentence in a TextBlob; the following cells call blob.ngrams() on it.
blob = TextBlob(sentence)

In [32]:
# bigram: each entry in the result is a WordList of two consecutive words
blob.ngrams(2)

[WordList(['The', 'cute']),
 WordList(['cute', 'little']),
 WordList(['little', 'boy']),
 WordList(['boy', 'is']),
 WordList(['is', 'playing']),
 WordList(['playing', 'with']),
 WordList(['with', 'the']),
 WordList(['the', 'kitten'])]

In [33]:
# trigram: each entry in the result is a WordList of three consecutive words
blob.ngrams(3)

[WordList(['The', 'cute', 'little']),
 WordList(['cute', 'little', 'boy']),
 WordList(['little', 'boy', 'is']),
 WordList(['boy', 'is', 'playing']),
 WordList(['is', 'playing', 'with']),
 WordList(['playing', 'with', 'the']),
 WordList(['with', 'the', 'kitten'])]

# 14. Tokenizing text with different packages - Keras and textblob

In [38]:
# Re-create the sample tweet: `sentence` was reassigned to a shorter example
# in the n-gram section. Adjacent literals are concatenated into one line.
sentence = (
    'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, '
    'New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official '
    '@indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'
)
print(sentence)

Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official @indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"


In [40]:
# using keras: in the recorded output the tokens come back lowercased, and
# punctuation such as '_', '@', '#' and '.' acts as a separator
# (e.g. 'india', 'official' and 'photoking', 'com').
print(text_to_word_sequence(sentence))

['sunil', 'tweeted', 'witnessing', '70th', 'republic', 'day', 'of', 'india', 'from', 'rajpath', 'new', 'delhi', 'mesmerizing', 'performance', 'by', 'indian', 'army', 'awesome', 'airshow', 'india', 'official', 'indian', 'army', 'india', '70threpublic', 'day', 'for', 'more', 'photos', 'ping', 'me', 'sunil', 'photoking', 'com']


In [41]:
# using textblob: in the recorded output the original casing and underscores
# are kept ('india_official', '70thRepublic_Day') and 'photoking.com' stays
# a single token, while the surrounding punctuation is dropped.
blob = TextBlob(sentence)
blob.words

WordList(['Sunil', 'tweeted', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', 'New', 'Delhi', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', 'Awesome', 'airshow', 'india_official', 'indian_army', 'India', '70thRepublic_Day', 'For', 'more', 'photos', 'ping', 'me', 'sunil', 'photoking.com'])