In [1]:
import nltk

In [2]:
# nltk.download('all')

In [3]:
# load the Brown Corpus
from nltk.corpus import brown

In [4]:
print('Total Categories:', len(brown.categories()))

Total Categories: 15


In [5]:
print (brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [6]:
# tokenized sentences
brown.sents(categories='mystery')

[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [7]:
# POS tagged sentences
brown.tagged_sents(categories='mystery')

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [8]:
# get sentences in natural form
sentences = brown.sents(categories='mystery')
sentences = [' '.join(sentence_token) for sentence_token in sentences]
print (sentences[0:5]) # printing first 5 sentences

['There were thirty-eight patients on the bus the morning I left for Hanover , most of them disturbed and hallucinating .', 'An interne , a nurse and two attendants were in charge of us .', "I felt lonely and depressed as I stared out the bus window at Chicago's grim , dirty West Side .", 'It seemed incredible , as I listened to the monotonous drone of voices and smelled the fetid odors coming from the patients , that technically I was a ward of the state of Illinois , going to a hospital for the mentally ill .', 'I suddenly thought of Mary Jane Brennan , the way her pretty eyes could flash with anger , her quiet competence , the gentleness and sweetness that lay just beneath the surface of her defenses .']


In [9]:
# get tagged words
tagged_words = brown.tagged_words(categories='mystery')
# get nouns from tagged words
nouns = [(word, tag) for word, tag in tagged_words if any(noun_tag in tag for noun_tag in ['NP', 'NN'])]
print (nouns[0:10]) # prints the first 10 nouns

[('patients', 'NNS'), ('bus', 'NN'), ('morning', 'NN'), ('Hanover', 'NP'), ('interne', 'NN'), ('nurse', 'NN'), ('attendants', 'NNS'), ('charge', 'NN'), ('bus', 'NN'), ('window', 'NN')]


In [10]:
# build frequency distribution for nouns
nouns_freq = nltk.FreqDist([word for word, tag in nouns])
# print top 10 occuring nouns
print (nouns_freq.most_common(10))

[('man', 106), ('time', 82), ('door', 80), ('car', 69), ('room', 65), ('Mr.', 63), ('way', 61), ('office', 50), ('eyes', 48), ('hand', 46)]


In [11]:
# load the Reuters Corpus
from nltk.corpus import reuters

In [12]:
print ('Total Categories:', len(reuters.categories()))
print (reuters.categories())

Total Categories: 90
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [13]:
# get sentences in housing and income categories
sentences = reuters.sents(categories=['housing', 'income'])
sentences = [' '.join(sentence_tokens) for sentence_tokens in sentences]
print (sentences[0:5]) # prints the first 5 sentences

["YUGOSLAV ECONOMY WORSENED IN 1986 , BANK DATA SHOWS National Bank economic data for 1986 shows that Yugoslavia ' s trade deficit grew , the inflation rate rose , wages were sharply higher , the money supply expanded and the value of the dinar fell .", 'The trade deficit for 1986 was 2 . 012 billion dlrs , 25 . 7 pct higher than in 1985 .', 'The trend continued in the first three months of this year as exports dropped by 17 . 8 pct , in hard currency terms , to 2 . 124 billion dlrs .', 'Yugoslavia this year started quoting trade figures in dinars based on current exchange rates , instead of dollars based on a fixed exchange rate of 264 . 53 dinars per dollar .', "Yugoslavia ' s balance of payments surplus with the convertible currency area fell to 245 mln dlrs in 1986 from 344 mln in 1985 ."]


In [14]:
# fileid based access
print (reuters.fileids(categories=['housing', 'income']))

['test/16118', 'test/18534', 'test/18540', 'test/18664', 'test/18665', 'test/18672', 'test/18911', 'test/19875', 'test/20106', 'test/20116', 'training/1035', 'training/1036', 'training/10602', 'training/10604', 'training/11170', 'training/11665', 'training/2618', 'training/29', 'training/3105', 'training/3708', 'training/3720', 'training/3723', 'training/3898', 'training/5883', 'training/5886', 'training/6000', 'training/6067', 'training/6197', 'training/7005', 'training/7006', 'training/7015', 'training/7036', 'training/7098', 'training/7099', 'training/9615']


In [15]:
print (reuters.sents(fileids=[u'test/16118', u'test/18534']))

[['YUGOSLAV', 'ECONOMY', 'WORSENED', 'IN', '1986', ',', 'BANK', 'DATA', 'SHOWS', 'National', 'Bank', 'economic', 'data', 'for', '1986', 'shows', 'that', 'Yugoslavia', "'", 's', 'trade', 'deficit', 'grew', ',', 'the', 'inflation', 'rate', 'rose', ',', 'wages', 'were', 'sharply', 'higher', ',', 'the', 'money', 'supply', 'expanded', 'and', 'the', 'value', 'of', 'the', 'dinar', 'fell', '.'], ['The', 'trade', 'deficit', 'for', '1986', 'was', '2', '.', '012', 'billion', 'dlrs', ',', '25', '.', '7', 'pct', 'higher', 'than', 'in', '1985', '.'], ...]


In [16]:
# load the Wordnet Corpus
from nltk.corpus import wordnet as wn

In [17]:
word = 'hike' # taking hike as our word of interest
# get word synsets
word_synsets = wn.synsets(word)
print (word_synsets)

[Synset('hike.n.01'), Synset('rise.n.09'), Synset('raise.n.01'), Synset('hike.v.01'), Synset('hike.v.02')]


In [18]:
# get details for each synonym in synset
for synset in word_synsets:
    print ('Synset Name:', synset.name())
    print ('POS Tag:', synset.pos())
    print ('Definition:', synset.definition())
    print ('Examples:', synset.examples())
    print()

Synset Name: hike.n.01
POS Tag: n
Definition: a long walk usually for exercise or pleasure
Examples: ['she enjoys a hike in her spare time']

Synset Name: rise.n.09
POS Tag: n
Definition: an increase in cost
Examples: ['they asked for a 10% rise in rates']

Synset Name: raise.n.01
POS Tag: n
Definition: the amount a salary is increased
Examples: ['he got a 3% raise', 'he got a wage hike']

Synset Name: hike.v.01
POS Tag: v
Definition: increase
Examples: ['The landlord hiked up the rents']

Synset Name: hike.v.02
POS Tag: v
Definition: walk a long way, as for pleasure or physical exercise
Examples: ['We were hiking in Colorado', 'hike the Rockies']



In [19]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


# Processing and Understanding Text

## Text Tokenization

### Sentence Tokenization

In [20]:
import nltk
from nltk.corpus import gutenberg
from pprint import pprint

In [21]:
alice = gutenberg.raw(fileids='carroll-alice.txt')
sample_text = 'We will discuss briefly about the basic syntax, structure and design philosophies. There is a defined hierarchical syntax for Python code which you should remember when writing code! Python is a really powerful programming language!'

In [22]:
# Total characters in Alice in Wonderland
len(alice)

144395

In [23]:
# First 100 characters in the corpus
print (alice[0:100])

[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [24]:
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

print ('Total sentences in sample_text:', len(sample_sentences))
print ('Sample text sentences :-')
pprint(sample_sentences)
print ('\nTotal sentences in alice:', len(alice_sentences))
print ('First 5 sentences in alice:-')
pprint(alice_sentences[0:5])

Total sentences in sample_text: 3
Sample text sentences :-
['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember when writing code!',
 'Python is a really powerful programming language!']

Total sentences in alice: 1625
First 5 sentences in alice:-
["[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I.",
 'Down the Rabbit-Hole\n'
 '\n'
 'Alice was beginning to get very tired of sitting by her sister on the\n'
 'bank, and of having nothing to do: once or twice she had peeped into the\n'
 'book her sister was reading, but it had no pictures or conversations in\n'
 "it, 'and what is the use of a book,' thought Alice 'without pictures or\n"
 "conversation?'",
 'So she was considering in her own mind (as well as she could, for the\n'
 'hot day made her feel very sleepy and stupid), whether the pleasure\n'
 'of making a daisy-chain would be worth the t

In [25]:
# tokenize text of other languages
from nltk.corpus import europarl_raw

In [26]:
german_text = europarl_raw.german.raw(fileids='ep-00-01-17.de')
# Total characters in the corpus
print (len(german_text))
# First 100 characters in the corpus
print (german_text[0:100])

157171
 
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sit


In [27]:
german_sentences_def = default_st(text=german_text,language='german')

# loading german text tokenizer into a PunktSentenceTokenizer instance
german_tokenizer = nltk.data.load(resource_url='tokenizers/punkt/german.pickle')
german_sentences = german_tokenizer.tokenize(german_text)

# verify the type of german_tokenizer
# should be PunktSentenceTokenizer
print (type(german_tokenizer))

<class 'nltk.tokenize.punkt.PunktSentenceTokenizer'>


In [28]:
print (german_sentences_def == german_sentences)
# print first 5 sentences of the corpus
for sent in german_sentences[0:5]:
    print (sent)

True
 
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der gefürchtete " Millenium-Bug " nicht eingetreten .
Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der Stürme , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schweigeminute zu gedenken .


In [29]:
punkt_st = nltk.tokenize.PunktSentenceTokenizer()
sample_sentences = punkt_st.tokenize(sample_text)
pprint(sample_sentences)

['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember when writing code!',
 'Python is a really powerful programming language!']


In [30]:
SENTENCE_TOKENS_PATTERN = '(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
regex_st = nltk.tokenize.RegexpTokenizer(pattern=SENTENCE_TOKENS_PATTERN,gaps=True)
sample_sentences = regex_st.tokenize(sample_text)
pprint(sample_sentences)

['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember when writing code!',
 'Python is a really powerful programming language!']


### Word Tokenization

In [31]:
sentence = "The brown fox wasn't that quick and he couldn't win the race"

default_wt = nltk.word_tokenize
words = default_wt(sentence)
print (words)

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


In [32]:
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']


In [33]:
# pattern to identify tokens themselves
TOKEN_PATTERN = '\w+'
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN,gaps=False)
words = regex_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', 'wasn', 't', 'that', 'quick', 'and', 'he', 'couldn', 't', 'win', 'the', 'race']


In [34]:
# pattern to identify gaps in tokens
GAP_PATTERN = '\s+'
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN,gaps=True)
words = regex_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


In [35]:
# get start and end indices of each token and then print them
word_indices = list(regex_wt.span_tokenize(sentence))
print (word_indices)
print ([sentence[start:end] for start, end in word_indices])

[(0, 3), (4, 9), (10, 13), (14, 20), (21, 25), (26, 31), (32, 35), (36, 38), (39, 47), (48, 51), (52, 55), (56, 60)]
['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


In [36]:
wordpunkt_wt = nltk.WordPunctTokenizer()
words = wordpunkt_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', 'wasn', "'", 't', 'that', 'quick', 'and', 'he', 'couldn', "'", 't', 'win', 'the', 'race']


In [37]:
whitespace_wt = nltk.WhitespaceTokenizer()
words = whitespace_wt.tokenize(sentence)
print (words)

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


## Text Normalization

In [38]:
import re
import string

In [39]:
corpus = ["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for $199", "@@You'll (learn) a **lot** in the book. Python is an amazing language !@@"]

### Tokenizing Text

In [40]:
def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
    return word_tokens

In [41]:
token_list = [tokenize_text(text) for text in corpus]
print(token_list)

[[['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'the', 'race']], [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'], ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']], [['@', '@', 'You', "'ll", '(', 'learn', ')', 'a', '**lot**', 'in', 'the', 'book', '.'], ['Python', 'is', 'an', 'amazing', 'language', '!'], ['@', '@']]]


### Removing Special Characters

In [42]:
def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens

In [53]:
def remove_characters(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    # filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    filtered_tokens = [re.sub(pattern, "", token) for token in tokens]
    return filtered_tokens

In [56]:
filtered_list_1 = [(remove_characters(tokens) for tokens in sentence_tokens) for sentence_tokens in token_list] 
filtered_list_1

[<generator object <listcomp>.<genexpr> at 0x7f5aaa8f2d00>,
 <generator object <listcomp>.<genexpr> at 0x7f5aaa8f2ca8>,
 <generator object <listcomp>.<genexpr> at 0x7f5aaa8f2d58>]

In [44]:
def remove_characters_before_tokenization(sentence,keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = '[?|$|&|*|%|@|(|)|~]' # add other characters here to remove them
        filtered_sentence = re.sub(PATTERN, '', sentence)
    else:
        PATTERN = '[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
        filtered_sentence = re.sub(PATTERN, '', sentence)
    return filtered_sentence

In [45]:
filtered_list_2 = [remove_characters_before_tokenization(sentence) for sentence in corpus]
print (filtered_list_2)

['The brown fox wasnt that quick and he couldnt win the race', 'Hey thats a great deal I just bought a phone for 199', 'Youll learn a lot in the book Python is an amazing language ']


In [46]:
cleaned_corpus = [remove_characters_before_tokenization(sentence,keep_apostrophes=True) for sentence in corpus]
print (cleaned_corpus)

["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for 199", "You'll learn a lot in the book. Python is an amazing language !"]


### Expanding Contractions

In [47]:
def expand_contractions(sentence, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                        if contraction_mapping.get(match)\
                        else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
    
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence

In [48]:
from contractions import CONTRACTION_MAP

expanded_corpus = [expand_contractions(sentence, CONTRACTION_MAP) for sentence in cleaned_corpus]
print (expanded_corpus)

['The brown fox was not that quick and he could not win the race', 'Hey that is a great deal! I just bought a phone for 199', 'You will learn a lot in the book. Python is an amazing language !']


### Removing Stopwords

In [49]:
def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens

In [50]:
corpus_tokens = [tokenize_text(text) for text in corpus]
filtered_list_3 = [[remove_stopwords(tokens) for tokens in sentence_tokens] for sentence_tokens in corpus_tokens]
print (filtered_list_3)

[[['The', 'brown', 'fox', "n't", 'quick', 'could', "n't", 'win', 'race']], [['Hey', "'s", 'great', 'deal', '!'], ['I', 'bought', 'phone', '$', '199']], [['@', '@', 'You', "'ll", '(', 'learn', ')', '**lot**', 'book', '.'], ['Python', 'amazing', 'language', '!'], ['@', '@']]]


### Correcting Words

#### Correcting Repeating Characters

In [51]:
old_word = 'finalllyyy'
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1

while True:
    # remove one repeated character
    new_word = repeat_pattern.sub(match_substitution,old_word)
    if new_word != old_word:
        print ('Step: {} Word: {}'.format(step, new_word))
        step += 1 # update step
        # update old word to last substituted state
        old_word = new_word
        continue
    else:
        print ("Final word:", new_word)
        break

Step: 1 Word: finalllyy
Step: 2 Word: finallly
Step: 3 Word: finally
Step: 4 Word: finaly
Final word: finaly


In [52]:
from nltk.corpus import wordnet

old_word = 'finalllyyy'
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1

while True:
    # check for semantically correct word
    if wordnet.synsets(old_word):
        print ("Final correct word:", old_word)
        break

    # remove one repeated character
    new_word = repeat_pattern.sub(match_substitution,old_word)
    if new_word != old_word:
        print ('Step: {} Word: {}'.format(step, new_word))
        step += 1 # update step
        # update old word to last substituted state
        old_word = new_word
        continue
    else:
        print ("Final word:", new_word)
        break

Step: 1 Word: finalllyy
Step: 2 Word: finallly
Step: 3 Word: finally
Final correct word: finally


In [53]:
def remove_repeated_characters(tokens):
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word

    correct_tokens = [replace(word) for word in tokens]
    return correct_tokens

In [54]:
sample_sentence = 'My schooool is realllllyyy amaaazingggg'
sample_sentence_tokens = tokenize_text(sample_sentence)[0]
print (sample_sentence_tokens)
print (remove_repeated_characters(sample_sentence_tokens))

['My', 'schooool', 'is', 'realllllyyy', 'amaaazingggg']
['My', 'school', 'is', 'really', 'amazing']


#### Correcting Spellings

In [55]:
import re, collections

In [56]:
def tokens(text):
    """
    Get all words from the corpus
    """
    return re.findall('[a-z]+', text.lower())

In [58]:
WORDS = tokens(open('big.txt').read())
WORD_COUNTS = collections.Counter(WORDS)

In [59]:
# top 10 words in the corpus
print (WORD_COUNTS.most_common(10))

[('the', 80030), ('of', 40025), ('and', 38313), ('to', 28766), ('in', 22050), ('a', 21155), ('that', 12512), ('he', 12401), ('was', 11410), ('it', 10681)]


In [60]:
def edits0(word): 
    """
    Return all strings that are zero edits away 
    from the input word (i.e., the word itself).
    """
    return {word}

In [61]:
def edits1(word):
    """
    Return all strings that are one edit away 
    from the input word.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    def splits(word):
        """
        Return a list of all possible (first, rest) pairs 
        that the input word is made of.
        """
        return [(word[:i], word[i:]) 
                for i in range(len(word)+1)]
                
    pairs      = splits(word)
    deletes    = [a+b[1:]           for (a, b) in pairs if b]
    transposes = [a+b[1]+b[0]+b[2:] for (a, b) in pairs if len(b) > 1]
    replaces   = [a+c+b[1:]         for (a, b) in pairs for c in alphabet if b]
    inserts    = [a+c+b             for (a, b) in pairs for c in alphabet]
    return set(deletes + transposes + replaces + inserts)

In [62]:
def edits2(word):
    """Return all strings that are two edits away 
    from the input word.
    """
    return {e2 for e1 in edits1(word) for e2 in edits1(e1)}

In [63]:
def known(words):
    """
    Return the subset of words that are actually 
    in our WORD_COUNTS dictionary.
    """
    return {w for w in words if w in WORD_COUNTS}

In [64]:
# input word
word = 'fianlly'
# zero edit distance from input word
edits0(word)

{'fianlly'}

In [65]:
# returns null set since it is not a valid word
known(edits0(word))

set()

In [66]:
# one edit distance from input word
edits1(word)

{'afianlly',
 'aianlly',
 'bfianlly',
 'bianlly',
 'cfianlly',
 'cianlly',
 'dfianlly',
 'dianlly',
 'efianlly',
 'eianlly',
 'faanlly',
 'faianlly',
 'fainlly',
 'fanlly',
 'fbanlly',
 'fbianlly',
 'fcanlly',
 'fcianlly',
 'fdanlly',
 'fdianlly',
 'feanlly',
 'feianlly',
 'ffanlly',
 'ffianlly',
 'fganlly',
 'fgianlly',
 'fhanlly',
 'fhianlly',
 'fiaally',
 'fiaanlly',
 'fiablly',
 'fiabnlly',
 'fiaclly',
 'fiacnlly',
 'fiadlly',
 'fiadnlly',
 'fiaelly',
 'fiaenlly',
 'fiaflly',
 'fiafnlly',
 'fiaglly',
 'fiagnlly',
 'fiahlly',
 'fiahnlly',
 'fiailly',
 'fiainlly',
 'fiajlly',
 'fiajnlly',
 'fiaklly',
 'fiaknlly',
 'fiallly',
 'fially',
 'fialnlly',
 'fialnly',
 'fiamlly',
 'fiamnlly',
 'fianally',
 'fianaly',
 'fianblly',
 'fianbly',
 'fianclly',
 'fiancly',
 'fiandlly',
 'fiandly',
 'fianelly',
 'fianely',
 'fianflly',
 'fianfly',
 'fianglly',
 'fiangly',
 'fianhlly',
 'fianhly',
 'fianilly',
 'fianily',
 'fianjlly',
 'fianjly',
 'fianklly',
 'fiankly',
 'fianlaly',
 'fianlay',
 'fi

In [67]:
# get correct words from above set
known(edits1(word))

{'finally'}

In [68]:
# two edit distances from input word
edits2(word)

{'fapianlly',
 'fiaolily',
 'fzanyly',
 'mianllyr',
 'fianmlvly',
 'wfianxlly',
 'fiasnoly',
 'jfianrly',
 'fianellyu',
 'fqaxnlly',
 'fianlslcy',
 'fzianyly',
 'fiapnllmy',
 'ufitnlly',
 'fipnily',
 'fdiaunlly',
 'lkfianlly',
 'fwianklly',
 'cianlzy',
 'yfiavnlly',
 'efianllyj',
 'fianlilyf',
 'fivanwly',
 'fbizanlly',
 'fianlku',
 'fiinllyc',
 'fiaenlty',
 'fizaplly',
 'yfianxlly',
 'vianllye',
 'fiquanlly',
 'fiabnlfly',
 'figznlly',
 'vianllvy',
 'fianldyf',
 'finlily',
 'fiatllc',
 'fiaxvnlly',
 'fianlee',
 'feianldly',
 'fiaxnglly',
 'rfianllo',
 'fyianllyr',
 'iapnlly',
 'fianqllgy',
 'xfiamnlly',
 'fiaanilly',
 'fiawllyp',
 'fiaglbly',
 'fiaxllty',
 'fiianhly',
 'fiauntlly',
 'fianlalf',
 'fianllmk',
 'ifanlwy',
 'rwanlly',
 'fvianclly',
 'fianllbwy',
 'fimnslly',
 'foianclly',
 'fidajlly',
 'fitnlwy',
 'fiazlyly',
 'ftqianlly',
 'fkanlcy',
 'mfianlloy',
 'fhienlly',
 'kianllyb',
 'jfianlmly',
 'fiboanlly',
 'fiaxnllyp',
 'fianllnyr',
 'fiainlqly',
 'fianiilly',
 'fianillye',
 

In [69]:
# get correct words from above set
known(edits2(word))

{'faintly', 'finally', 'finely', 'frankly'}

In [70]:
candidates = (known(edits0(word)) or known(edits1(word)) or known(edits2(word)) or [word])
candidates

{'finally'}

In [71]:
def correct(word):
    """
    Get the best correct spelling for the input word
    """
    # Priority is for edit distance 0, then 1, then 2
    # else defaults to the input word itself.
    candidates = (known(edits0(word)) or 
                  known(edits1(word)) or 
                  known(edits2(word)) or 
                  [word])
    return max(candidates, key=WORD_COUNTS.get)

In [72]:
correct('fianlly')

'finally'

In [73]:
correct('FIANLLY')

'FIANLLY'

In [74]:
def correct_match(match):
    """
    Spell-correct word in match, 
    and preserve proper upper/lower/title case.
    """
    
    word = match.group()
    def case_of(text):
        """
        Return the case-function appropriate 
        for text: upper, lower, title, or just str.:
            """
        return (str.upper if text.isupper() else
                str.lower if text.islower() else
                str.title if text.istitle() else
                str)
    return case_of(word)(correct(word.lower()))

In [75]:
def correct_text_generic(text):
    """
    Correct all the words within a text, 
    returning the corrected text.
    """
    return re.sub('[a-zA-Z]+', correct_match, text)

In [76]:
correct_text_generic('fianlly')

'finally'

In [77]:
correct_text_generic('FIANLLY')

'FINALLY'

In [83]:
from pattern3.en import suggest

IndentationError: expected an indented block (tree.py, line 37)

In [84]:
print suggest('fianlly')
print suggest('flaot')

SyntaxError: invalid syntax (<ipython-input-84-f89ba718ba8b>, line 1)

### Stemming

In [85]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [86]:
print (ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped'))

jump jump jump


In [87]:
print (ps.stem('lying'))

lie


In [88]:
print (ps.stem('strange'))

strang


In [89]:
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

In [90]:
print (ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped'))

jump jump jump


In [91]:
print (ls.stem('lying'))

lying


In [92]:
print (ls.stem('strange'))

strange


In [93]:
# Regex based stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)

In [94]:
print (rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped'))

jump jump jump


In [95]:
print (rs.stem('lying'))

ly


In [96]:
print (rs.stem('strange'))

strange


In [97]:
# Snowball Stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")

In [98]:
print ('Supported Languages:', SnowballStemmer.languages)

Supported Languages: ('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [99]:
ss.stem('autobahnen')

'autobahn'

In [100]:
ss.stem('springen')

'spring'

### Lemmatization

In [1]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [102]:
# lemmatize nouns
wnl.lemmatize('cars', 'n')

'car'

In [103]:
wnl.lemmatize('men', 'n')

'men'

In [104]:
# lemmatize verbs
print (wnl.lemmatize('running', 'v'))
print (wnl.lemmatize('ate', 'v'))

run
eat


In [105]:
# lemmatize adjectives
print (wnl.lemmatize('saddest', 'a'))
print (wnl.lemmatize('fancier', 'a'))

sad
fancy


In [3]:
wnl.lemmatize('helped', 'v')

'help'

In [4]:
wnl.lemmatize('cars')

'car'

In [5]:
wnl.lemmatize('saddest')

'saddest'

In [6]:
wnl.lemmatize('women')

'woman'

# Text Classification

## Text Normalization

In [2]:
from pattern3.en import tag
from nltk.corpus import wordnet as wn

# Annotate text tokens with POS tags
def pos_tag_text(text):
    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text

IndentationError: expected an indented block (tree.py, line 37)

In [None]:
# lemmatize text based on POS tags    
def lemmatize_text(text):
    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word                     
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text