### Correcting Words

In [2]:
import nltk
from nltk.corpus import wordnet
import re

In [6]:
def tokenize_text(raw_text, language="english"):
    """Split raw text into sentences, then split each sentence into word tokens.

    Args:
        raw_text: The text to tokenize.
        language: Language name handed to both NLTK tokenizers
            (defaults to "english").

    Returns:
        A list with one entry per detected sentence; each entry is the list
        of word tokens produced for that sentence.
    """
    sentences = nltk.sent_tokenize(raw_text, language=language)
    # Forward `language` to the word tokenizer too, so a non-English request
    # is honoured at both tokenization stages instead of only the first one.
    return [nltk.word_tokenize(sentence, language=language) for sentence in sentences]

In [39]:
def remove_repeated_characters(tokens):
    """Collapse doubled characters in each token until WordNet recognises it.

    A token that already has WordNet synsets is returned unchanged. Otherwise
    one copy of a doubled character is removed per step, and the result is
    checked against WordNet again. The process stops as soon as the word is
    recognised or no doubled character is left.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with the corrected form of every token, in input order.
    """
    # Group 2 followed by the back-reference \2 is a character that appears
    # twice in a row; the substitution keeps only one copy of it.
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'

    def replace(word):
        # A loop rather than recursion: every step removes a single character,
        # so a token with a very long run of repeats would otherwise exceed
        # Python's recursion limit.
        while not wordnet.synsets(word):
            shorter = repeat_pattern.sub(match_substitution, word)
            if shorter == word:
                # Nothing left to collapse; give back the word as it stands.
                break
            word = shorter
        return word

    return [replace(token) for token in tokens]

In [40]:
# Demo input in which every word contains runs of repeated letters.
sample_sentence = "aaamm realllyy aweeesoomme"

# tokenize_text returns one token list per sentence; only the first is used.
tokens = tokenize_text(sample_sentence)[0]
corrected_tokens = remove_repeated_characters(tokens)
corrected_tokens

['am', 'really', 'awesome']

In [None]:
from pattern.en import suggest

# Query Pattern's spelling suggestions for the misspelled word rather than
# only echoing the literal back.
print(suggest("finallly"))

**Stemming**
* Finding the stem of a word.
* There are many stemmer library modules available. They may behave differently for a few words.

In [1]:
from nltk.stem import PorterStemmer

# Porter stemmer built with its default settings.
ps = PorterStemmer()
ps.stem("lying")

'lie'

In [2]:
from nltk.stem import LancasterStemmer

# Same input word as the Porter example, so the two outputs can be compared.
ls = LancasterStemmer()
ls.stem("lying")

'lying'

In [4]:
from nltk.stem import SnowballStemmer  # used for stemming of other languages

# Stemmer configured for German.
ss = SnowballStemmer("german")
print(ss.stem("lying"))   # an English word pushed through the German stemmer
ss.stem("autobahnen")     # a German word

lying


'autobahn'

**Lemmatization**
* Similar to stemming, but finds the root word (lemma) instead of the root stem.
* The root word changes based on the POS given as the parameter.
* WordNetLemmatizer internally uses the morphy() function belonging to the WordNetCorpusReader class.

In [9]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# Lemmatize as a noun.
wnl.lemmatize("cars", pos="n")

'car'

In [10]:
# The POS argument selects how the word is interpreted: "v" verb, "a" adjective.
print(wnl.lemmatize("running", pos="v"))
print(wnl.lemmatize("fancier", pos="a"))

run
fancy


**POS Tagger Library Versions**
* Used for obtaining the POS tags given tokens as input.

In [11]:
import nltk

sentence = "The brown fox is quick and he is jumping over the lazy dog"

# Tokenize first, then tag the tokens using the coarse "universal" tagset.
tokens = nltk.word_tokenize(sentence)
tagged_sent = nltk.pos_tag(tokens, tagset="universal")
tagged_sent

[('The', 'DET'),
 ('brown', 'ADJ'),
 ('fox', 'NOUN'),
 ('is', 'VERB'),
 ('quick', 'ADJ'),
 ('and', 'CONJ'),
 ('he', 'PRON'),
 ('is', 'VERB'),
 ('jumping', 'VERB'),
 ('over', 'ADP'),
 ('the', 'DET'),
 ('lazy', 'ADJ'),
 ('dog', 'NOUN')]

In [None]:
# TODO: fix the installation of pattern before running this cell.
from pattern.en import tag

# Tag the raw sentence string with Pattern's tagger.
tagged_sent = tag(sentence)
tagged_sent

In [16]:
from nltk.corpus import treebank

# Tagged treebank sentences: the first 3500 are used for training,
# everything after that for testing.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]

In [18]:
# Show the first tagged sentence of the training split.
print(train_data[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


**Custom POS Taggers**
* Every tagger is a child class of the TaggerI class, and all of them have tag() and evaluate() functions.
* DefaultTagger, whose base class is SequentialBackoffTagger, assigns the same POS tag to all words.

In [19]:
from nltk.tag import DefaultTagger

# Baseline tagger that labels every token "NN"; score it on the test split.
dt = DefaultTagger("NN")
dt.evaluate(test_data)

0.1454158195372253

In [20]:
# Apply the default tagger to the sample sentence tokens.
dt.tag(tokens)

[('The', 'NN'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('is', 'NN'),
 ('quick', 'NN'),
 ('and', 'NN'),
 ('he', 'NN'),
 ('is', 'NN'),
 ('jumping', 'NN'),
 ('over', 'NN'),
 ('the', 'NN'),
 ('lazy', 'NN'),
 ('dog', 'NN')]

* We can also use the RegexpTagger class by passing it pairs of word patterns and their POS tags.

**Taggers By NLTK**
* Ngram taggers use Bayes' rule and maximum likelihood to find the POS tags for each sentence.
* NLTK provides library implementations of the Ngram taggers used below.

In [21]:
from nltk.tag import BigramTagger, TrigramTagger, UnigramTagger

# Train one tagger per n-gram order on the same training split.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

In [22]:
# Score each n-gram tagger on the held-out test split.
for _label, _tagger in (
    ("Unigram evaluation:", ut),
    ("Bigram evaluation:", bt),
    ("Trigram evaluation:", tt),
):
    print(_label, _tagger.evaluate(test_data))

Unigram evaluation: 0.8607803272340013
Bigram evaluation: 0.13466937748087907
Trigram evaluation: 0.08064672281924679


In [25]:
# Tag the sample sentence with the unigram, bigram and trigram taggers in turn.
for _tagger in (ut, bt, tt):
    print(_tagger.tag(tokens))

[('The', 'DT'), ('brown', None), ('fox', None), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', None), ('dog', None)]
[('The', 'DT'), ('brown', None), ('fox', None), ('is', None), ('quick', None), ('and', None), ('he', None), ('is', None), ('jumping', None), ('over', None), ('the', None), ('lazy', None), ('dog', None)]
[('The', 'DT'), ('brown', None), ('fox', None), ('is', None), ('quick', None), ('and', None), ('he', None), ('is', None), ('jumping', None), ('over', None), ('the', None), ('lazy', None), ('dog', None)]


**Combination of Ngrams**
* As you can observe, there are places where the POS tag is given as None.
* This is because no such combination is present in the training data.
* For Unigram, it is the absence of such words in the training data.
* For Bigram and Trigram, it is due to the absence of the same sequences in the training data.
* This can be handled by smoothing, which is nothing but the combination of all the above models with a few parameters, so that if a combination is not present the tagger can fall back to a different POS tagger. For example: Trigram->Bigram->Unigram->RegexpTagger
* The methods used are Linear Interpolation and Discounting Methods.

**ClassifierBasedPOSTagger**
* Can be used for POS tag classification.
* Provides a feature_detector parameter to supply a custom function for feature detection.
* Provides a classifier_builder parameter, which accepts a classifier training function such as NaiveBayesClassifier.train or MaxentClassifier.train (both provided by the nltk library).

In [30]:
from nltk.classify import NaiveBayesClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Classifier-backed POS tagger trained on the training split, with
# NaiveBayesClassifier.train as the classifier builder.
nbt = ClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=NaiveBayesClassifier.train,
)
nbt.evaluate(test_data)

0.9306806079969019

In [31]:
# Tag the sample sentence tokens with the classifier-based tagger.
nbt.tag(tokens)

[('The', 'DT'),
 ('brown', 'JJ'),
 ('fox', 'NN'),
 ('is', 'VBZ'),
 ('quick', 'JJ'),
 ('and', 'CC'),
 ('he', 'PRP'),
 ('is', 'VBZ'),
 ('jumping', 'VBG'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'VBG')]