**NLP Tokenization** Code snippets from **Text Analytics from Python** and **NLTK Book (FREE)**

Repo: https://github.com/dipanjanS/text-analytics-with-python

NLTK Book: https://www.nltk.org/book/

Reference: https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

### NLTK Basic Tokenization and Stop word Removal

In [1]:
import nltk
import re
import string
from pprint import pprint

corpus = ["The brown fox wasn't that quick and he couldn't win the race",
          "Hey that's a great deal! I just bought a phone for $199",
          "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@"]


In [2]:
def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences] 
    return word_tokens
    
token_list = [tokenize_text(text) 
              for text in corpus]
pprint(token_list)


[[['The',
   'brown',
   'fox',
   'was',
   "n't",
   'that',
   'quick',
   'and',
   'he',
   'could',
   "n't",
   'win',
   'the',
   'race']],
 [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
 [['@',
   '@',
   'You',
   "'ll",
   '(',
   'learn',
   ')',
   'a',
   '**lot**',
   'in',
   'the',
   'book',
   '.'],
  ['Python', 'is', 'an', 'amazing', 'language', '!'],
  ['@', '@']]]


In [3]:
def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens
    
def remove_characters_before_tokenization(sentence,
                                          keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|$|&|*|%|@|(|)|~]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    else:
        PATTERN = r'[^a-zA-Z0-9 ]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    return filtered_sentence

In [4]:
cleaned_corpus = [remove_characters_before_tokenization(sentence, keep_apostrophes=True) 
                  for sentence in corpus]
print cleaned_corpus


["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for 199", "You'll learn a lot in the book. Python is an amazing language!"]


In [5]:
from contractions import CONTRACTION_MAP

def expand_contractions(sentence, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence

In [6]:

expanded_corpus = [expand_contractions(sentence, CONTRACTION_MAP) 
                    for sentence in cleaned_corpus]    
print expanded_corpus
print 


['The brown fox was not that quick and he could not win the race', 'Hey that is a great deal! I just bought a phone for 199', 'You will learn a lot in the book. Python is an amazing language!']



In [7]:
def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens


In [8]:
expanded_corpus_tokens = [tokenize_text(text)
                          for text in expanded_corpus]    
filtered_list_3 =  [[remove_stopwords(tokens) 
                        for tokens in sentence_tokens] 
                        for sentence_tokens in expanded_corpus_tokens]
print filtered_list_3


[[['The', 'brown', 'fox', 'quick', 'could', 'win', 'race']], [['Hey', 'great', 'deal', '!'], ['I', 'bought', 'phone', '199']], [['You', 'learn', 'lot', 'book', '.'], ['Python', 'amazing', 'language', '!']]]


In [9]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))
print(len(stopWords))
print(stopWords)

179
set([u'all', u'just', u"don't", u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'don', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u"should've", u"haven't", u'do', u'them', u'his', u'very', u"you've", u'they', u'not', u'during', u'now', u'him', u'nor', u"wasn't", u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u"won't", u'where', u"mustn't", u"isn't", u'few', u'because', u"you'd", u'doing', u'some', u'hasn', u"hasn't", u'are', u'our', u'ourselves', u'out', u'what', u'for', u"needn't", u'below', u're', u'does', u"shouldn't", u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u"mightn't", u"doesn't", u'were', u'here', u'shouldn', u'hers', u"aren't", u'by', u'on', u'about', u'couldn', u'of', u"wouldn't", u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u"hadn't", u'mightn', u"couldn't", u'wasn', u'your', u"you're", u'from', u'her', u'their', u'aren', u"it

**Kinds of sentence tokenizers**

In [10]:
sentence = "The brown fox wasn't that quick and he couldn't win the race"

# default word tokenizer
default_wt = nltk.word_tokenize
words = default_wt(sentence)
print words       

# treebank word tokenizer
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sentence)
print words

wordpunkt_wt = nltk.WordPunctTokenizer()
words = wordpunkt_wt.tokenize(sentence)
print words

whitespace_wt = nltk.WhitespaceTokenizer()
words = whitespace_wt.tokenize(sentence)
print words

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']
['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']
['The', 'brown', 'fox', 'wasn', "'", 't', 'that', 'quick', 'and', 'he', 'couldn', "'", 't', 'win', 'the', 'race']
['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


### Lemma and Stemming

In [11]:
# porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

print ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')
print ps.stem('lying')
print ps.stem('strange')

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

print '----'
print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

print ls.stem('lying')

print ls.stem('strange')

jump jump jump
lie
strang
----
jump jump jump
lying
strange


In [12]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# lemmatize nouns
print wnl.lemmatize('cars', 'n')
print wnl.lemmatize('men', 'n')
print '----'
# lemmatize verbs
print wnl.lemmatize('running', 'v')
print wnl.lemmatize('ate', 'v')
print '----'
# lemmatize adjectives
print wnl.lemmatize('saddest', 'a')
print wnl.lemmatize('fancier', 'a')
print '----'
# ineffective lemmatization
print wnl.lemmatize('ate', 'n')
print wnl.lemmatize('fancier', 'v')


car
men
----
run
eat
----
sad
fancy
----
ate
fancier


###  Parts of Speech and Chunking

In [13]:
from nltk import tag
sent = nltk.word_tokenize(sentence)
tagged_sent = tag.pos_tag(sent)
print tagged_sent

[('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('was', 'VBD'), ("n't", 'RB'), ('that', 'IN'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('could', 'MD'), ("n't", 'RB'), ('win', 'VB'), ('the', 'DT'), ('race', 'NN')]


In [14]:
from nltk import chunk
tree = chunk.ne_chunk(tagged_sent)
tree


LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('was', 'VBD'), ("n't", 'RB'), ('that', 'IN'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('could', 'MD'), ("n't", 'RB'), ('win', 'VB'), ('the', 'DT'), ('race', 'NN')])

In [15]:
from nltk.corpus import treebank_chunk
treebank_chunk.tagged_sents()[0]

[(u'Pierre', u'NNP'),
 (u'Vinken', u'NNP'),
 (u',', u','),
 (u'61', u'CD'),
 (u'years', u'NNS'),
 (u'old', u'JJ'),
 (u',', u','),
 (u'will', u'MD'),
 (u'join', u'VB'),
 (u'the', u'DT'),
 (u'board', u'NN'),
 (u'as', u'IN'),
 (u'a', u'DT'),
 (u'nonexecutive', u'JJ'),
 (u'director', u'NN'),
 (u'Nov.', u'NNP'),
 (u'29', u'CD'),
 (u'.', u'.')]

### Chunk Grammar

In [16]:
from nltk.chunk import RegexpParser
from pattern.en import tag

simple_sentence = 'the quick fox jumped over the lazy dog'
tagged_simple_sent = tag(simple_sentence)
print tagged_simple_sent


[(u'the', u'DT'), (u'quick', u'JJ'), (u'fox', u'NN'), (u'jumped', u'VBD'), (u'over', u'IN'), (u'the', u'DT'), (u'lazy', u'JJ'), (u'dog', u'NN')]


In [17]:
chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print c


(S
  (NP the/DT quick/JJ fox/NN)
  jumped/VBD
  over/IN
  (NP the/DT lazy/JJ dog/NN))


In [18]:
chink_grammar = """
NP: {<.*>+} # chunk everything as NP
}<VBD|IN>+{
"""
rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)
print c


(S
  (NP the/DT quick/JJ fox/NN)
  jumped/VBD
  over/IN
  (NP the/DT lazy/JJ dog/NN))


### WordNet synonyms and meaning

In [19]:
from nltk.corpus import wordnet
syns = wordnet.synsets("program")
print(syns[0].name())
print(syns[0].definition())
print(syns[0].examples())

plan.n.01
a series of steps to be carried out or goals to be accomplished
[u'they drew up a six-step plan', u'they discussed plans for a new bond issue']


In [20]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print('---')
print(set(antonyms))

set([u'beneficial', u'right', u'secure', u'just', u'unspoilt', u'respectable', u'good', u'goodness', u'dear', u'salutary', u'ripe', u'expert', u'skillful', u'in_force', u'proficient', u'unspoiled', u'dependable', u'soundly', u'honorable', u'full', u'undecomposed', u'safe', u'adept', u'upright', u'trade_good', u'sound', u'in_effect', u'practiced', u'effective', u'commodity', u'estimable', u'well', u'honest', u'near', u'skilful', u'thoroughly', u'serious'])
---
set([u'bad', u'badness', u'ill', u'evil', u'evilness'])


In [21]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.909090909091
