# Replacing and Correcting Words

## Stemming words

In [1]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

# Instantiate both stemmers so their results can be compared side by side.
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [2]:
porter.stem("cooking"), lancaster.stem("cooking")

('cook', 'cook')

In [3]:
porter.stem("cookery"), lancaster.stem("cookery")

('cookeri', 'cookery')

In [4]:
from nltk.stem import SnowballStemmer

# Build a Snowball stemmer configured for Portuguese.
snowball = SnowballStemmer("portuguese")

# Display the tuple of language names exposed by the stemmer.
snowball.languages

('danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [8]:
snowball.stem("putaria")

'put'

### Lemmatizing words with WordNet

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Lemmatizer instance; later cells reuse this same object.
lemmatizer = WordNetLemmatizer()

# With no pos argument the word is looked up as a noun, so "cooking" comes
# back unchanged.
lemmatizer.lemmatize("cooking")
# same as lemmatizer.lemmatize("cooking", pos="n")

'cooking'

In [10]:
lemmatizer.lemmatize("cooking", pos='v')

'cook'

In [11]:
lemmatizer.lemmatize("cookbooks")

'cookbook'

In [25]:
from nltk.stem import PorterStemmer

# Porter stemmer used to contrast stemming with lemmatization.
stemmer = PorterStemmer()

# Print the stem and the lemma of the same word, one per line.
print(
    "stem:\t{}\nlemma:\t{}".format(
        stemmer.stem("believes"),
        lemmatizer.lemmatize("believes"),
    )
)

stem:	believ
lemma:	belief


#### Combining stemming with lemmatization

In [26]:
stemmer.stem("buses")

'buse'

In [27]:
lemmatizer.lemmatize("buses")

'bus'

In [28]:
stemmer.stem(lemmatizer.lemmatize("buses"))

'bu'

## Translating text with Babelfish

In [33]:
# The Babelfish API has been discontinued :~
# There are other solutions, e.g. using Google Translate.

## Replacing words matching regular expressions

In [34]:
import re

In [35]:
# Ordered (pattern, replacement) pairs used to expand English contractions.
#
# Order matters: the irregular forms ("won't", "can't", "ain't") must be
# listed before the generic "n't" rule, which would otherwise turn
# "won't" into "wo not".
#
# Replacement templates are raw strings so that the \g<1> group reference is
# passed to the regex engine verbatim instead of being read as an (invalid)
# string escape sequence.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r"can't", "cannot"),
    # Capture the pronoun so both "i'm" and "I'm" expand and keep their case.
    (r"([Ii])'m", r"\g<1> am"),
    (r"ain't", "is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    # NOTE: "'s" is always expanded to "is", so possessives are rewritten too.
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]

class RegexpReplacer(object):
    """Rewrite text by applying an ordered series of regex substitutions.

    ``patterns`` is an iterable of ``(regex, replacement)`` pairs. Each regex
    is compiled once at construction time and stored, together with its
    replacement, in ``self.patterns``.
    """

    def __init__(self, patterns):
        compiled = []
        for expression, substitute in patterns:
            compiled.append((re.compile(expression), substitute))
        self.patterns = compiled

    def replace(self, text):
        """Return ``text`` after running every substitution, in order."""
        result = text
        for matcher, substitute in self.patterns:
            # Each rule sees the output of the previous one.
            result = matcher.sub(substitute, result)
        return result

In [37]:
# Build one replacer from the contraction patterns; later cells reuse it.
replacer = RegexpReplacer(replacement_patterns)

replacer.replace("can't is a contraction")

'cannot is a contraction'

In [38]:
replacer.replace("I should've done that thing I didn't do")

'I should have done that thing I did not do'

#### Replacement before tokenization

In [39]:
from nltk.tokenize import word_tokenize

word_tokenize("can't is a contraction")

['ca', "n't", 'is', 'a', 'contraction']

In [40]:
word_tokenize(replacer.replace("can't is a contraction"))

['can', 'not', 'is', 'a', 'contraction']