In [22]:
import nltk
from nltk.corpus import gutenberg
from pprint import pprint

## SENTENCE TOKENIZATION

# loading text corpora
alice = gutenberg.raw(fileids='carroll-alice.txt')
sample_text = 'We will discuss briefly about the basic syntax,\
 structure and design philosophies. \
 There is a defined hierarchical syntax for Python code which you should remember \
 when writing code! Python is a really powerful programming language!'
               
# Total characters in Alice in Wonderland
print (len(alice))

144395


In [23]:
print (alice[0:100])

[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [24]:
## default sentence tokenizer
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

In [25]:
print ('Total sentences in sample_text:', len(sample_sentences))

Total sentences in sample_text: 3


In [26]:
print ('Sample text sentences :-')
pprint(sample_sentences)

Sample text sentences :-
['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember  when writing code!',
 'Python is a really powerful programming language!']


In [27]:
print ('\nTotal sentences in alice:', len(alice_sentences))
print ('First 5 sentences in alice:-')
pprint(alice_sentences[0:5])


Total sentences in alice: 1625
First 5 sentences in alice:-
["[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I.",
 'Down the Rabbit-Hole\n'
 '\n'
 'Alice was beginning to get very tired of sitting by her sister on the\n'
 'bank, and of having nothing to do: once or twice she had peeped into the\n'
 'book her sister was reading, but it had no pictures or conversations in\n'
 "it, 'and what is the use of a book,' thought Alice 'without pictures or\n"
 "conversation?'",
 'So she was considering in her own mind (as well as she could, for the\n'
 'hot day made her feel very sleepy and stupid), whether the pleasure\n'
 'of making a daisy-chain would be worth the trouble of getting up and\n'
 'picking the daisies, when suddenly a White Rabbit with pink eyes ran\n'
 'close by her.',
 'There was nothing so VERY remarkable in that; nor did Alice think it so\n'
 "VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!",
 'Oh dear!']


# Ways to tokenize sentences 

In [28]:
## Other languages sentence tokenization
from nltk.corpus import europarl_raw

german_text = europarl_raw.german.raw(fileids='ep-00-01-17.de')
# Total characters in the corpus
print (len(german_text))
# First 100 characters in the corpus
print (german_text[0:100])

157171
 
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sit


In [29]:
# 1st way (default sentence tokenizer) 
german_sentences_def = default_st(text=german_text, language='german')

# 2nd way (loading german text tokenizer into a PunktSentenceTokenizer instance)
german_tokenizer = nltk.data.load(resource_url='tokenizers/punkt/german.pickle')
german_sentences = german_tokenizer.tokenize(german_text)

# verify the type of german_tokenizer should be PunktSentenceTokenizer (which specializes in dealing with the German)
print (type(german_tokenizer))

<class 'nltk.tokenize.punkt.PunktSentenceTokenizer'>


In [30]:
# check if results of both tokenizers match
# should be True
print (german_sentences_def == german_sentences)
# print first 5 sentences of the corpus
for sent in german_sentences[0:5]:
    print (sent)

True
 
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der gefürchtete " Millenium-Bug " nicht eingetreten .
Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der Stürme , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schweigeminute zu gedenken .


In [31]:
# 3rd way (using PunktSentenceTokenizer for sentence tokenization)

punkt_st = nltk.tokenize.PunktSentenceTokenizer()
sample_sentences = punkt_st.tokenize(sample_text)
pprint(sample_sentences)

['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember  when writing code!',
 'Python is a really powerful programming language!']


In [32]:
# 4th way (RegexpTokenizer)

SENTENCE_TOKENS_PATTERN = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
regex_st = nltk.tokenize.RegexpTokenizer(
            pattern=SENTENCE_TOKENS_PATTERN,
            gaps=True)
sample_sentences = regex_st.tokenize(sample_text)
pprint(sample_sentences)         

['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 ' There is a defined hierarchical syntax for Python code which you should '
 'remember  when writing code!',
 'Python is a really powerful programming language!']


# WORD TOKENIZATION

In [33]:
sentence = "The brown fox wasn't that quick and he couldn't win the race"

In [34]:
# default word tokenizer
default_wt = nltk.word_tokenize
words = default_wt(sentence)
print (words   )

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


In [35]:
# treebank word tokenizer
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


In [36]:
# regex word tokenizer
TOKEN_PATTERN = r'\w+'        
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN,
                                gaps=False)
words = regex_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', 'wasn', 't', 'that', 'quick', 'and', 'he', 'couldn', 't', 'win', 'the', 'race']


In [37]:
GAP_PATTERN = r'\s+'        
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN,
                                gaps=True)
words = regex_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


In [38]:
word_indices = list(regex_wt.span_tokenize(sentence))
print (word_indices)
print ([sentence[start:end] for start, end in word_indices])

[(0, 3), (4, 9), (10, 13), (14, 20), (21, 25), (26, 31), (32, 35), (36, 38), (39, 47), (48, 51), (52, 55), (56, 60)]
['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


In [39]:
# derived regex tokenizers
wordpunkt_wt = nltk.WordPunctTokenizer()
words = wordpunkt_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', 'wasn', "'", 't', 'that', 'quick', 'and', 'he', 'couldn', "'", 't', 'win', 'the', 'race']


In [40]:
whitespace_wt = nltk.WhitespaceTokenizer()
words = whitespace_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


# Normalization

In [41]:
import nltk
import re
import string
from pprint import pprint

corpus = ["The brown fox wasn't that quick and he couldn't win the race",
          "Hey that's a great deal! I just bought a phone for $199",
          "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@"]


def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences] 
    return word_tokens
    
token_list = [tokenize_text(text) 
              for text in corpus]
pprint(token_list)

[[['The',
   'brown',
   'fox',
   'was',
   "n't",
   'that',
   'quick',
   'and',
   'he',
   'could',
   "n't",
   'win',
   'the',
   'race']],
 [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
 [['@',
   '@',
   'You',
   "'ll",
   '(',
   'learn',
   ')',
   'a',
   '**lot**',
   'in',
   'the',
   'book',
   '.'],
  ['Python', 'is', 'an', 'amazing', 'language', '!'],
  ['@', '@']]]


In [42]:
def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens
    
filtered_list_1 =  [filter(None,[remove_characters_after_tokenization(tokens) 
                                for tokens in sentence_tokens]) 
                    for sentence_tokens in token_list]
print (filtered_list_1)

[<filter object at 0x0000018F71113108>, <filter object at 0x0000018F7110EAC8>, <filter object at 0x0000018F7110EA88>]


Essentially, what we do here is use the string.punctuation attribute, which consists
of all possible special characters/symbols, and create a regex pattern from it. We use it
to match tokens that are symbols and characters and remove them. The filter function
helps us remove empty tokens obtained after removing the special character tokens using
the regex sub method. 

In [43]:
def remove_characters_before_tokenization(sentence,
                                          keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|$|&|*|%|@|(|)|~]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    else:
        PATTERN = r'[^a-zA-Z0-9 ]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    return filtered_sentence
    
filtered_list_2 = [remove_characters_before_tokenization(sentence) 
                    for sentence in corpus]    
print (filtered_list_2)

['The brown fox wasnt that quick and he couldnt win the race', 'Hey thats a great deal I just bought a phone for 199', 'Youll learn a lot in the book Python is an amazing language']


The preceding outputs show two different ways of removing special characters before
tokenization—removing all special characters versus retaining apostrophes and sentence
periods—using regular expressions. 

In [44]:
cleaned_corpus = [remove_characters_before_tokenization(sentence, keep_apostrophes=True) 
                  for sentence in corpus]
print (cleaned_corpus)

["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for 199", "You'll learn a lot in the book. Python is an amazing language!"]


In [45]:
from contractions import CONTRACTION_MAP 
def expand_contractions(sentence, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence
    
expanded_corpus = [expand_contractions(sentence, CONTRACTION_MAP) 
                    for sentence in cleaned_corpus]    
print (expanded_corpus)

['The brown fox was not that quick and he could not win the race', 'Hey that is a great deal! I just bought a phone for 199', 'You will learn a lot in the book. Python is an amazing language!']


In [46]:
# case conversion    
print (corpus[0].lower())
print (corpus[0].upper())

the brown fox wasn't that quick and he couldn't win the race
THE BROWN FOX WASN'T THAT QUICK AND HE COULDN'T WIN THE RACE


In [47]:
# removing stopwords
def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens
    
expanded_corpus_tokens = [tokenize_text(text)
                          for text in expanded_corpus]    
filtered_list_3 =  [[remove_stopwords(tokens) 
                        for tokens in sentence_tokens] 
                        for sentence_tokens in expanded_corpus_tokens]
print (filtered_list_3)

[[['The', 'brown', 'fox', 'quick', 'could', 'win', 'race']], [['Hey', 'great', 'deal', '!'], ['I', 'bought', 'phone', '199']], [['You', 'learn', 'lot', 'book', '.'], ['Python', 'amazing', 'language', '!']]]


In [48]:
# removing repeated characters
sample_sentence = 'My schooool is realllllyyy amaaazingggg'
sample_sentence_tokens = tokenize_text(sample_sentence)[0]
sample_sentence_tokens

['My', 'schooool', 'is', 'realllllyyy', 'amaaazingggg']

In [49]:
from nltk.corpus import wordnet

def remove_repeated_characters(tokens):
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
            
    correct_tokens = [replace(word) for word in tokens]
    return correct_tokens

print (remove_repeated_characters(sample_sentence_tokens))

['My', 'school', 'is', 'really', 'amazing']


In each step we will try to eliminate one of the repeated characters using a
substitution for the match by utilizing the regex match groups (groups 1, 2, and 3) using
the pattern r’\1\2\3’ and then keep iterating through this process till no repeated
characters remain. 

This will convert finallllyyyy to finally as we keep check whether the word is semantically correct.

# Spelling corrections

In [21]:
import re, collections

def tokens(text): 
    """
    Get all words from the corpus
    """
    return re.findall('[a-z]+', text.lower()) 

WORDS = tokens(open('big.txt').read())
WORD_COUNTS = collections.Counter(WORDS)

# top 10 words in corpus
print (WORD_COUNTS.most_common(10))

[('the', 80030), ('of', 40025), ('and', 38313), ('to', 28766), ('in', 22050), ('a', 21155), ('that', 12512), ('he', 12401), ('was', 11410), ('it', 10681)]


In [52]:
def known(words):
    """
    Return the subset of words that are actually 
    in our WORD_COUNTS dictionary.
    """
    return {w for w in words if w in WORD_COUNTS}


def edits0(word): 
    """
    Return all strings that are zero edits away 
    from the input word (i.e., the word itself).
    """
    return {word}



def edits1(word):
    """
    Return all strings that are one edit away 
    from the input word.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    def splits(word):
        """
        Return a list of all possible (first, rest) pairs 
        that the input word is made of.
        """
        return [(word[:i], word[i:]) 
                for i in range(len(word)+1)]
                
    pairs      = splits(word)
    deletes    = [a+b[1:]           for (a, b) in pairs if b]
    transposes = [a+b[1]+b[0]+b[2:] for (a, b) in pairs if len(b) > 1]
    replaces   = [a+c+b[1:]         for (a, b) in pairs for c in alphabet if b]
    inserts    = [a+c+b             for (a, b) in pairs for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    """Return all strings that are two edits away 
    from the input word.
    """
    return {e2 for e1 in edits1(word) for e2 in edits1(e1)}
    
    
def correct(word):
    """
    Get the best correct spelling for the input word
    """
    # Priority is for edit distance 0, then 1, then 2
    # else defaults to the input word itself.
    candidates = (known(edits0(word)) or 
                  known(edits1(word)) or 
                  known(edits2(word)) or 
                  [word])
    return max(candidates, key=WORD_COUNTS.get)


def correct_match(match):
    """
    Spell-correct word in match, 
    and preserve proper upper/lower/title case.
    """
    
    word = match.group()
    def case_of(text):
        """
        Return the case-function appropriate 
        for text: upper, lower, title, or just str.:
            """
        return (str.upper if text.isupper() else
                str.lower if text.islower() else
                str.title if text.istitle() else
                str)
    return case_of(word)(correct(word.lower()))

    
def correct_text_generic(text):
    """
    Correct all the words within a text, 
    returning the corrected text.
    """
    return re.sub('[a-zA-Z]+', correct_match, text)



print (correct_text_generic('fianlly'))

finally


# Stemming algorithms

In [53]:
# porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

print (ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped'))

print (ps.stem('lying'))

print (ps.stem('strange'))


jump jump jump
lie
strang


In [54]:
# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

print (ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped'))

print (ls.stem('lying'))

print (ls.stem('strange'))


jump jump jump
lying
strange


In [55]:
# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)

print (rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped'))

print (rs.stem('lying'))

print (rs.stem('strange'))

jump jump jump
ly
strange


In [57]:
# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")

print ('Supported Languages:', SnowballStemmer.languages)

# autobahnen -> cars
# autobahn -> car
print(ss.stem('autobahnen'))

# springen -> jumping
# spring -> jump
print(ss.stem('springen'))


Supported Languages: ('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
autobahn
spring


In [58]:
# lemmatization
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# lemmatize nouns
print (wnl.lemmatize('cars', 'n'))
print (wnl.lemmatize('men', 'n'))

# lemmatize verbs
print (wnl.lemmatize('running', 'v'))
print (wnl.lemmatize('ate', 'v'))

# lemmatize adjectives
print (wnl.lemmatize('saddest', 'a'))
print (wnl.lemmatize('fancier', 'a'))

car
men
run
eat
sad
fancy


In [59]:
# ineffective lemmatization
print (wnl.lemmatize('ate', 'n'))
print (wnl.lemmatize('fancier', 'v'))

ate
fancier


The part of speech is extremely important here because if that is wrong, the
lemmatization will not be effective