## In this chapter, Bya will cover the following recipes:
1. Stemming words
2. Lemmatizing words with WordNet
3. Replacing words matching regular expressions
4. Removing repeating characters
5. Spelling correction with Enchant
6. Replacing synonyms
7. Replacing negations with antonyms

# 1. Stemming words

In [1]:
# Libraries
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import RegexpStemmer

In [2]:
# PorterStemmer
stemmer = PorterStemmer()
print "Word: ", stemmer.stem('cooking')
text = 'Stemming is funnier than a bummer says the sushi loving computer scientist'
print "Text: ", " ".join(map(lambda word: stemmer.stem(word), text.split()))

Word:  cook
Text:  Stem is funnier than a bummer say the sushi love comput scientist


In [3]:
# LancasterStemmer
stemmer = LancasterStemmer()
print "Word: ", stemmer.stem('cooking')
text = 'Stemming is funnier than a bummer says the sushi loving computer scientist'
print "Text: ", " ".join(map(lambda word: stemmer.stem(word), text.split()))

Word:  cook
Text:  stem is funny than a bum say the sush lov comput sci


In [4]:
# RegexpStemmer: strips whatever the supplied regular expression matches.
# The pattern is not anchored, so matches are removed wherever they occur.
stemmer = RegexpStemmer('ing')
print stemmer.stem('cooking')      # 'cook'    -- trailing 'ing' removed
print stemmer.stem('cookery')      # 'cookery' -- no match, word unchanged
print stemmer.stem('ingleside')    # 'leside'  -- a leading 'ing' is removed as well
stemmer = RegexpStemmer('@')
print stemmer.stem('@mongolia@')   # 'mongolia' -- both '@' characters removed

cook
cookery
leside
mongolia


# 2. Lemmatizing words with WordNet

In [5]:
# Libraries
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [6]:
# WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('cooking')
lemmatizer.lemmatize('cooking', pos='v')
print lemmatizer.lemmatize('cookbooks')

cooking
cookbook


In [7]:
# Stemming versus lemmatizing the same word.
# Instead of just chopping off the 'es' like the PorterStemmer class,
# the WordNetLemmatizer class finds a valid root word.
# Where a stemmer only looks at the form of the word,
# the lemmatizer looks at the meaning of the word.
# By returning a lemma, you will always get a valid word.
stemmer = PorterStemmer()
print "Stemmer: ", stemmer.stem('believes')   # 'believ' -- not a real word

# `lemmatizer` is the WordNetLemmatizer instance created in the previous cell.
print "Lemma: ", lemmatizer.lemmatize('believes')   # 'belief'

Stemmer:  believ
Lemma:  belief


# 3. Replacing words matching regular expressions

In [8]:
# Defining RegexpReplacer

# save this as 'replacers.py' 
# then use like "from replacers import RegexpReplacer"

import re

# Ordered (regex, replacement) pairs used to expand English contractions.
# Irregular forms come first so the generic suffix rules below never see them;
# the capture group keeps the original capitalisation of the first letter
# ("Can't" -> "Cannot" rather than the generic rule's "Ca not").
# Replacements are raw strings so that '\g<1>' is not treated as a (invalid)
# string escape.
replacement_patterns = [
    (r'([Ww])on\'t', r'\g<1>ill not'),
    (r'([Cc])an\'t', r'\g<1>annot'),
    (r'([Ii])\'m', r'\g<1> am'),
    (r'ain\'t', 'is not'),
    (r'Ain\'t', 'Is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]


class RegexpReplacer(object):
    """Apply a fixed sequence of regular-expression substitutions to text.

    patterns: iterable of (regex, replacement) pairs; each regex is compiled
        once here.  Defaults to the module-level ``replacement_patterns``.
    """

    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return ``text`` after applying every substitution, in list order.

        Each substitution sees the output of the previous one, so earlier
        patterns take precedence over later ones.
        """
        s = text
        for (pattern, repl) in self.patterns:
            # The pattern is already compiled; call its own sub() directly.
            s = pattern.sub(repl, s)
        return s

In [9]:
# Usage of RegexpReplacer

# if 'replacers.py' file was saved:
# from replacers import RegexpReplacer

# No arguments: the module-level replacement_patterns list is used.
replacer = RegexpReplacer()
print replacer.replace("can't is a contraction")   # cannot is a contraction
# Several contractions in one string are all expanded.
print replacer.replace("I should've done that thing I didn't do")

cannot is a contraction
I should have done that thing I did not do


### Replacement before tokenization

In [10]:
# if 'replacers.py' file was saved:
# from replacers import RegexpReplacer

from nltk.tokenize import word_tokenize
replacer = RegexpReplacer()

# Tokenizing the raw text splits the contraction into 'ca' + "n't" ...
print "Before RegexpReplacer: \n", word_tokenize("can't is a contraction")
# ... whereas expanding it first yields the plain tokens 'can' and 'not'.
print "After RegexpReplacer: \n", word_tokenize(replacer.replace("can't is a contraction"))

Before RegexpReplacer: 
['ca', "n't", 'is', 'a', 'contraction']
After RegexpReplacer: 
['can', 'not', 'is', 'a', 'contraction']


# 4. Removing repeating characters

In [11]:
# Defining RepeatReplacer object

# save this as 'replacers.py' 
# then use like "from replacers import RepeatReplacer"

import re
class RepeatReplacer(object):
    """Collapse runs of a repeated character down to a single character.

    No dictionary is consulted, so legitimate double letters are collapsed
    too ('goose' becomes 'gose').
    """

    def __init__(self):
        # Group 2 followed by a back-reference to itself matches one doubled
        # character; groups 1 and 3 capture the rest of the word around it.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Re-emit the word with the doubled character written only once.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` with every repeated character reduced to one.

        Each substitution pass removes one duplicate per word, so the pass is
        repeated until the text stops changing.  A loop is used instead of
        recursion so that very long runs cannot exhaust the call stack.
        """
        while True:
            repl_word = self.repeat_regexp.sub(self.repl, word)
            if repl_word == word:
                return word
            word = repl_word

In [12]:
# Usage of RepeatReplacer

# from replacers import RepeatReplacer

replacer = RepeatReplacer()
print replacer.replace('looooove')   # love
print replacer.replace('oooooh')     # oh
# Valid words with a double letter are damaged as well:
print replacer.replace('goose')      # gose
print replacer.replace('feel')       # fel

love
oh
gose
fel


### The `replace()` method with a `WordNet` lookup

In [13]:
import re
from nltk.corpus import wordnet

class RepeatReplacer(object):
    """Collapse repeated characters, stopping as soon as WordNet knows the word.

    The WordNet check keeps valid words with double letters intact and halts
    the reduction at the first intermediate form that WordNet recognises.
    """

    def __init__(self):
        # Group 2 followed by a back-reference to itself matches one doubled
        # character; groups 1 and 3 capture the rest of the word around it.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Re-emit the word with the doubled character written only once.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return ``word`` reduced until WordNet has synsets for it.

        If no intermediate form is recognised, the fully collapsed word is
        returned.  A loop is used instead of recursion so that very long runs
        of a character cannot exhaust the call stack.
        """
        while not wordnet.synsets(word):
            repl_word = self.repeat_regexp.sub(self.repl, word)
            if repl_word == word:
                # Nothing left to collapse.
                break
            word = repl_word
        return word

In [14]:
# Same inputs as before, now with the WordNet-aware RepeatReplacer.
replacer = RepeatReplacer()
print replacer.replace('looooove')   # love
print replacer.replace('oooooh')     # ooh   -- reduction stops at this form
print replacer.replace('goose')      # goose -- left untouched
print replacer.replace('feel')       # feel  -- left untouched

love
ooh
goose
feel


# 5. Spelling correction with Enchant

In [15]:
import enchant
# check() reports whether a word is in the dictionary;
# suggest() returns a list of candidate corrections.
d = enchant.Dict("en_US")
print d.check("Hello")     # True
print d.check("Helo")      # False
print d.suggest("Helo")    # the recorded suggestions include 'Hello'
print d.check("Bya")       # False -- a name the dictionary does not know
print d.suggest("Bya")

True
False
['He lo', 'Hole', 'Hello', 'Helot', 'Halo', 'Hero', 'Hell', 'Held', 'Helm', 'Help', 'Helios', 'Helyn', 'Helsa', 'Helli']
False
['Bay', 'Ya', 'Ba', 'Bea', 'B ya', 'By', 'Baa', 'Bra', 'Boa', 'Bye', 'By a']


### New class called `SpellingReplacer` in `replacers.py`

In [16]:
# the replace() method will check Enchant 
# to see whether the word is valid. 
# If not, we will look up the suggested alternatives 
# and return the best match using nltk.metrics.edit_distance():

import enchant
from nltk.metrics import edit_distance

class SpellingReplacer(object):
    """Replace a misspelled word with the spell checker's first suggestion.

    dict_name: name of the Enchant dictionary to load.
    max_dist: largest edit distance between the word and the suggestion for
        which the suggestion is still accepted.
    """

    def __init__(self, dict_name='en', max_dist=2):
        self.max_dist = max_dist
        self.spell_dict = enchant.Dict(dict_name)

    def replace(self, word):
        """Return a correction for ``word``, or ``word`` itself.

        The word is returned unchanged when the dictionary accepts it, when
        there are no suggestions, or when the first suggestion is more than
        ``max_dist`` edits away.
        """
        checker = self.spell_dict
        if checker.check(word):
            return word
        candidates = checker.suggest(word)
        if not candidates:
            return word
        best = candidates[0]
        if edit_distance(word, best) > self.max_dist:
            return word
        return best

In [17]:
# Usage of SpellingReplacer

# from replacers import SpellingReplacer
replacer = SpellingReplacer()
print replacer.replace('cookbok')   # cookbook
# An unknown name is "corrected" as well:
print replacer.replace('bya')       # bay

cookbook
bay


### Dictionaries

In [18]:
# check other languages
# The recorded run listed de_DE, en_AU, en_GB, en_US and fr_FR;
# presumably the result depends on which dictionaries are installed -- verify locally.
enchant.list_languages()

['de_DE', 'en_AU', 'en_GB', 'en_US', 'fr_FR']

In [19]:
import enchant

# The same spelling can be valid in one dictionary and invalid in another.
dUS = enchant.Dict('en_US')
print dUS.check('theater')   # True

dGB = enchant.Dict('en_GB')
print dGB.check('theater')   # False

True
False


In [20]:
# from replacers import SpellingReplacer
# The first positional argument is dict_name, so each replacer uses its own dictionary.
us_replacer = SpellingReplacer('en_US')
print us_replacer.replace('theater')   # theater -- accepted as is

gb_replacer = SpellingReplacer('en_GB')
print gb_replacer.replace('theater')   # theatre -- corrected to the British spelling

theater
theatre


### Personal word lists

In [21]:
# Baseline before adding a personal word list: the default replacer
# turns the unknown word 'bya' into 'bay'.
replacer = SpellingReplacer()
print replacer.replace('bya')

bay


In [22]:
# add class to 'replacers.py'

class CustomSpellingReplacer(SpellingReplacer):
    """SpellingReplacer that works on a caller-supplied dictionary object.

    spell_dict: ready-made dictionary object, stored as given.
    max_dist: largest accepted edit distance, as in the parent class.

    The parent constructor is intentionally not invoked, so no dictionary is
    loaded by name; replace() is inherited unchanged.
    """

    def __init__(self, spell_dict, max_dist=2):
        self.spell_dict, self.max_dist = spell_dict, max_dist

In [23]:
# from replacers import CustomSpellingReplacer
# DictWithPWL combines the en_US dictionary with a personal word list file.
# NOTE(review): the path below is an absolute path on the author's machine --
# point it at your own word list before running.
d = enchant.DictWithPWL('en_US', '/Users/Bya/git/wordle-me/WordExtract/mywords.txt')
print  d.check('bya')           # True -- presumably listed in mywords.txt
replacer = CustomSpellingReplacer(d)
print replacer.replace('bya')   # bya -- no longer "corrected" to 'bay'

True
bya


# 6. Replacing synonyms

In [24]:
# create a WordReplacer class in replacers.py 
# that takes a word replacement mapping:

class WordReplacer(object):
    """Look words up in a replacement mapping.

    word_map: mapping from a word to the word that should replace it.
    """

    def __init__(self, word_map):
        self.word_map = word_map

    def replace(self, word):
        """Return the mapped replacement for ``word``, or ``word`` if unmapped."""
        mapping = self.word_map
        if word in mapping:
            return mapping[word]
        return word

In [25]:
# from replacers import WordReplacer
# The mapping is passed directly as a dict of word -> replacement.
replacer = WordReplacer({'bday': 'birthday',
                         'bya': 'byambasuren'
                        })

print replacer.replace('bday')   # birthday

print replacer.replace('bya')    # byambasuren

birthday
byambasuren


### CSV synonym replacement

In [26]:
# The CsvWordReplacer class extends WordReplacer in replacers.py 
# in order to construct the word_map dictionary from a CSV file:
import csv

class CsvWordReplacer(WordReplacer):
    """WordReplacer whose mapping is read from a two-column CSV file.

    fname: path of a CSV file in which every row is ``word,replacement``.

    Raises ValueError if a non-blank row does not have exactly two columns.
    """

    def __init__(self, fname):
        word_map = {}
        # The with-block closes the file even when a malformed row raises.
        with open(fname) as csv_file:
            for row in csv.reader(csv_file):
                if not row:
                    # Blank line (e.g. at the end of the file): nothing to map.
                    continue
                word, syn = row
                # NOTE(review): whitespace after the comma is kept as part of
                # the replacement (a row 'bday, birthday' maps to ' birthday').
                word_map[word] = syn
        super(CsvWordReplacer, self).__init__(word_map)

In [27]:
# from replacers import CsvWordReplacer
# NOTE(review): absolute path on the author's machine -- adjust before running.
replacer = CsvWordReplacer('/Users/Bya/git/wordle-me/WordExtract/synonyms.csv')
# The recorded output shows a leading space (' birthday'); presumably the CSV
# rows have a space after the comma -- confirm against the file.
print replacer.replace('bday')
print replacer.replace('bya')

 birthday
 byambasuren


# 7. Replacing negations with antonyms

### `class: AntonymReplacer`

In [28]:
# create an AntonymReplacer class in replacers.py:

from nltk.corpus import wordnet

class AntonymReplacer(object):
    """Swap 'not <word>' for the word's antonym when WordNet offers exactly one."""

    def replace(self, word, pos=None):
        """Return the single unambiguous antonym of ``word``, else None.

        Antonym names are gathered over every lemma of every synset found for
        ``word`` (restricted to ``pos`` when given).  A result is returned
        only if exactly one distinct name was collected.
        """
        found = {
            opposite.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for opposite in lemma.antonyms()
        }
        return found.pop() if len(found) == 1 else None

    def replace_negations(self, sent):
        """Return a new token list with 'not X' pairs replaced by X's antonym.

        sent: sequence of word tokens.  A 'not' is kept as is when it is the
        last token or when replace() yields nothing for the following word.
        """
        total = len(sent)
        result = []
        skip_next = False
        for idx, token in enumerate(sent):
            if skip_next:
                # This token was consumed together with the preceding 'not'.
                skip_next = False
                continue
            if token == 'not' and idx + 1 < total:
                opposite = self.replace(sent[idx + 1])
                if opposite:
                    result.append(opposite)
                    skip_next = True
                    continue
            result.append(token)
        return result

In [29]:
# Usage of AntonymReplacer

# from replacers import AntonymReplacer
replacer = AntonymReplacer()
# replace() answers only when exactly one antonym is found;
# presumably 'good' has several, hence None.
print replacer.replace('good')     # None
print replacer.replace('uglify')   # beautify

# 'not' + 'uglify' collapses into the single token 'beautify'.
sent = ["let's", 'not', 'uglify', 'our', 'code']
print replacer.replace_negations(sent)

None
beautify
["let's", u'beautify', 'our', 'code']


### `class: AntonymWordReplacer`

In [30]:
# create an AntonymWordReplacer class in replacers.py:

class AntonymWordReplacer(WordReplacer, AntonymReplacer):
    """Negation replacer driven by a custom word -> antonym mapping.

    WordReplacer is listed first among the bases, so its constructor and its
    mapping-based replace() take precedence; replace_negations() is inherited
    from AntonymReplacer and therefore consults that mapping.
    """

In [31]:
# usage of AntonymWordReplacer

# from replacers import AntonymWordReplacer
# The dict supplies the antonyms: 'evil' -> 'good'.
replacer = AntonymWordReplacer({'evil': 'good'})
# 'not' + 'evil' becomes 'good', giving ['good', 'is', 'good'].
print replacer.replace_negations(['good', 'is', 'not', 'evil'])

['good', 'is', 'good']
