# Spell Checker v2

- en_US, en_UK and en_IN dictionary support
- Corrects variants of words (real-word errors) as well
- Moving towards context-based sentence correction using n-gram models

In [None]:
!pip install pyenchant
!apt-get install enchant
!pip install numpy
!pip install nltk
!pip install pandas
!pip install wget
!pip install textblob

In [None]:
import enchant
import numpy as np
import csv
import math, collections
import pandas as pd
import re
import itertools
import nltk
nltk.download('punkt')
from nltk import tokenize
import nltk.data
import wget
import pandas as pd 
from textblob import TextBlob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
class Sentence_Corrector:
    """Noisy-channel sentence corrector built on a bigram language model.

    Candidate replacements for every word come from ``importantKeywords``
    (prefix matches) or, failing that, from an enchant dictionary.  Every
    combination of candidates is scored as

        log P(candidate | typed sentence) + log P_LM(candidate)

    and the highest-scoring combination is returned.

    Attributes:
        laplaceUnigramCounts: token -> number of occurrences in the corpus.
        laplaceBigramCounts: (token, next_token) -> number of occurrences.
        total: total number of tokens seen, sentence markers included.
        sentences: tokenised training sentences (lists of lower-case words).
        importantKeywords: extra vocabulary matched by prefix before the
            dictionary is consulted.
        tag: dictionary tag handed to ``enchant.Dict``.
        d: the enchant dictionary used for spelling suggestions.
    """

    def __init__(self, training_file, tag="en_US"):
        """Load the dictionary for *tag* and train on *training_file*.

        Args:
            training_file: path of the text file used to build the counts.
            tag: dictionary tag passed to ``enchant.Dict``.
        """
        self.laplaceUnigramCounts = collections.defaultdict(int)
        self.laplaceBigramCounts = collections.defaultdict(int)
        self.total = 0
        self.sentences = []
        self.importantKeywords = set()
        self.tag = tag
        self.d = enchant.Dict(tag)
        self.tokenize_file(training_file)
        self.train()

    def tokenize_file(self, file):
        """Read *file*, split it into sentences and store the word lists.

        Each sentence is split on runs of non-letters and lower-cased; the
        resulting lists are appended to ``self.sentences``.
        """
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Context manager so the corpus handle is released even on error.
        with open(file) as f:
            content = f.read()
        for sentence in tokenizer.tokenize(content):
            sentence_clean = [
                i.lower() for i in re.split(r'[^a-zA-Z]+', sentence) if i
            ]
            self.sentences.append(sentence_clean)

    def train(self):
        """Accumulate unigram and bigram counts from ``self.sentences``.

        Every sentence is wrapped in place with ``<s>`` / ``</s>`` markers
        before counting, so this must be called once per set of sentences.
        """
        for sentence in self.sentences:
            sentence.insert(0, '<s>')
            sentence.append('</s>')
            for token in sentence:
                self.laplaceUnigramCounts[token] += 1
            for pair in zip(sentence, sentence[1:]):
                self.laplaceBigramCounts[pair] += 1
            self.total += len(sentence)

    def candidate_word(self, word):
        """Return ``(candidates, len(candidates))`` for *word*.

        Keywords starting with *word* are preferred.  When there are none,
        the first four dictionary suggestions (lower-cased) are used.  The
        word itself is always one of the candidates.
        """
        suggests = [
            candidate for candidate in self.importantKeywords
            if candidate.startswith(word)
        ]
        suggests.append(word)

        if len(suggests) == 1:
            suggests = [suggest.lower() for suggest in self.d.suggest(word)][:4]
            suggests.append(word)
            # De-duplicate while keeping a stable order; a plain set() would
            # make the candidate order depend on string hash randomisation.
            suggests = list(dict.fromkeys(suggests))

        return suggests, len(suggests)

    def candidate_sentence(self, sentence):
        """Expand a list of words into every candidate sentence.

        Returns:
            A pair ``(candidate_sentences, words_count)`` where
            ``candidate_sentences`` is the list of tuples produced by the
            cartesian product of the per-word candidates, and
            ``words_count`` maps each input word to its number of candidates.
        """
        per_word_candidates = []
        words_count = {}
        for word in sentence:
            # One lookup per word: the dictionary query is the costly part.
            suggestions, count = self.candidate_word(word)
            per_word_candidates.append(suggestions)
            words_count[word] = count

        candidate_sentences = list(itertools.product(*per_word_candidates))
        return candidate_sentences, words_count

    def correction_score(self, words_count, old_sentence, new_sentence):
        """Return log P(new_sentence | old_sentence) under the channel model.

        A word left unchanged contributes probability 0.95.  A replaced word
        contributes 0.05 shared equally between the other candidates of the
        original word, i.e. ``0.05 / (words_count[old_word] - 1)``.

        Args:
            words_count: original word -> number of candidates for it.
            old_sentence: the words as typed.
            new_sentence: the candidate words, aligned with *old_sentence*.
        """
        # Sum logs instead of multiplying probabilities, so long sentences
        # cannot underflow to 0 and make math.log raise.
        score = 0.0
        for old_word, new_word in zip(old_sentence, new_sentence):
            if new_word == old_word:
                score += math.log(0.95)
            else:
                # Guard the denominator against a count of 1.
                alternatives = max(words_count[old_word] - 1, 1)
                score += math.log(0.05 / alternatives)
        return score

    def score(self, sentence):
        """Return the log-probability of *sentence* under the bigram model.

        Seen bigrams use ``count(prev, cur) / count(prev)``.  Unseen bigrams
        back off to the add-one smoothed unigram probability of ``cur``
        multiplied by 0.4 (stupid backoff).

        Args:
            sentence: list of tokens, normally wrapped in ``<s>``/``</s>``.
        """
        score = 0.0
        # Read with .get(): indexing a defaultdict inserts missing keys,
        # which would grow the vocabulary size used below on every query.
        smoothed_total = self.total + len(self.laplaceUnigramCounts)
        for prev, cur in zip(sentence, sentence[1:]):
            bigram_count = self.laplaceBigramCounts.get((prev, cur), 0)
            if bigram_count > 0:
                score += math.log(bigram_count)
                score -= math.log(self.laplaceUnigramCounts[prev])
            else:
                unigram_count = self.laplaceUnigramCounts.get(cur, 0)
                score += math.log(unigram_count + 1) + math.log(0.4)
                score -= math.log(smoothed_total)
        return score

    def return_best_sentence(self, old_sentence):
        """Return ``(best_sentence, best_score)`` for the string *old_sentence*.

        The input is lower-cased and split on whitespace.  Every candidate
        sentence is scored as channel log-probability plus language-model
        log-probability; on ties the later candidate wins.
        """
        bestScore = float('-inf')
        bestSentence = []
        old_sentence = [word.lower() for word in old_sentence.split()]
        sentences, word_count = self.candidate_sentence(old_sentence)
        for candidate in sentences:
            new_sentence = list(candidate)
            # Argument order matters: typed words first, candidate second.
            score = self.correction_score(word_count, old_sentence, new_sentence)
            score += self.score(['<s>'] + new_sentence + ['</s>'])
            if score >= bestScore:
                bestScore = score
                bestSentence = new_sentence
        return ' '.join(bestSentence), bestScore

In [None]:
import os
# Download the training corpus only when it is not already on disk.
if not os.path.exists('./big.txt'):
  wget.download('http://norvig.com/big.txt', './big.txt')

# Default corrector used by the later cells; it is built with the en_IN tag.
corrector = Sentence_Corrector('./big.txt', tag='en_IN')

## Outputs
- Output for dictionary variants such as US English, UK English and Indian English
- Let us compare the outputs of our spell corrector based on its initialised dictionary (en_UK / en_US / en_IN)

In [None]:
# One corrector per dictionary tag, all trained on the same corpus file.
# Each constructor call re-reads and re-tokenises the corpus.
# NOTE(review): the usual locale code for British English is en_GB; confirm
# that the installed enchant backend really provides an "en_UK" dictionary.
corrector_in = Sentence_Corrector('./big.txt', tag='en_IN')
corrector_uk = Sentence_Corrector('./big.txt', tag='en_UK')
corrector_us = Sentence_Corrector('./big.txt', tag='en_US')

In [None]:
sent = "this is the wron spallin of the word"
correctors = [corrector_in, corrector_uk, corrector_us]

# Collect one column of dictionary tags and one column of corrections,
# row-aligned by corrector, ready to be shown as a table.
d = collections.defaultdict(list)
d['tag'].extend(c.tag for c in correctors)
d['correction'].extend(c.return_best_sentence(sent)[0] for c in correctors)

print("incorrect sentence: ", sent)
pd.DataFrame(d)

incorrect sentence:  this is the wron spallin of the word


Unnamed: 0,tag,correction
0,en_IN,this is the wrong spelling of the world
1,en_UK,this is the wrong spelling of the world
2,en_US,this is the wrong spinal of the world


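We see that the en_IN and en_UK models correct "wron spallin" to the expected "wrong spelling", whereas en_US picks "spinal". Note that all three models also change the valid word "word" to "world".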

In [None]:
sent2 = "this is ornage color"

# Same comparison as before: one row per corrector, tag next to correction.
d2 = collections.defaultdict(list)
d2['tag'].extend(c.tag for c in correctors)
d2['correction'].extend(c.return_best_sentence(sent2)[0] for c in correctors)

print("incorrect sentence: ", sent2)
pd.DataFrame(d2)

incorrect sentence:  this is ornage color


Unnamed: 0,tag,correction
0,en_IN,this is orange colour
1,en_UK,this is orange colour
2,en_US,this is orange color


The dictionaries treat "color" / "colour" differently: en_IN and en_UK rewrite it to "colour", while en_US keeps "color".

____________________________________________

# Context Based Grammar Checker
- Aim: to use an n-gram language model to rewrite contextually unlikely sentences into more probable ones, thus capturing both real-word and non-word errors.
- Let's reuse the previous `Sentence_Corrector` class and use the BERT masked language model (MLM) "unmasker" to pick the highest-scoring word when correcting the sentence.

In [None]:
!pip install transformers
!pip install nltk 

In [None]:
import nltk
# Fetch the NLTK "words" corpus (the recorded output below shows the download).
nltk.download('words')

from nltk.corpus import words

# NOTE: this rebinds the name `words` from the imported corpus object to the
# value returned by words.words(); the corpus object is no longer reachable
# under this name afterwards.
words = words.words()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
from transformers import pipeline
# Fill-mask pipeline on bert-base-uncased; the cells below call it with a
# sentence containing a [MASK] token and tabulate what it returns.
unmasker = pipeline('fill-mask', model='bert-base-uncased')

## Outputs
- We see that a hybrid of the BERT MLM and the noisy-channel sentence corrector can yield better results, with more accurate word choices.

Example:    "he might tries to run" 

Expected -> "he might try to run"

In [None]:
# Mask the erroneous verb ("tries") and tabulate the unmasker's proposals.
print("Masked Sentence: he might [MASK] to run")
pd.DataFrame(unmasker('he might [MASK] to run'))

Masked Sentence: he might [MASK] to run


Unnamed: 0,score,token,token_str,sequence
0,0.705319,2031,have,he might have to run
1,0.098274,2215,want,he might want to run
2,0.097698,2342,need,he might need to run
3,0.064293,3046,try,he might try to run
4,0.00711,2707,start,he might start to run


In [None]:
# Run the same sentence through the noisy-channel corrector alone for comparison.
print("incorrect sentence: he might tries to run")
print("corrected: ", corrector.return_best_sentence('he might tries to run')[0])

incorrect sentence: he might tries to run
corrected:  he might tries to run


We see that the noisy-channel model alone fails to correct this error, but by adding the candidate words proposed by the masked language model we can combine the two to generate better predictions.

In [None]:
# Show the candidate list (and its length) the corrector generates for "tries".
corrector.candidate_word('tries')

(['tires', 'tries', 'tories', 'triers'], 4)

In [None]:
# Tabulate the unmasker's proposals for a second masked sentence.
pd.DataFrame(unmasker('he will [MASK] a paella'))

Unnamed: 0,score,token,token_str,sequence
0,0.26159,2022,be,he will be a paella
1,0.112422,2031,have,he will have a paella
2,0.096517,2468,become,he will become a paella
3,0.073602,4929,wear,he will wear a paella
4,0.032077,2191,make,he will make a paella


# Downfalls
- Needs better correction of misplaced words and word variations.
- Eg. he be will makes food -> he will ~be~ ~makes~ make food 