In [1]:
from transformers import pipeline
from nltk.metrics.distance import edit_distance, jaccard_distance
from nltk.util import ngrams

In [2]:
# Fill-mask pipeline backed by the 'bert-base-uncased' checkpoint; it is used
# further down to propose candidate words for a '[MASK]' slot.
unmasker = pipeline(
    task='fill-mask',
    model='bert-base-uncased',
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Word list source: https://github.com/dwyl/english-words/blob/master/words_alpha.txt
# One word per line. The words are kept in a set because the only thing the
# spell checker does with them is membership tests (`word in dictionary`);
# a list would force a linear scan of the whole file's worth of words for
# every single lookup.
with open("words_alpha.txt", "r", encoding="utf-8") as f:
    # Iterate the file object directly instead of materialising readlines().
    dictionary = {line.strip() for line in f}

In [4]:
def transformer_spell_check(sentence):
    """Spell-check ``sentence`` using a dictionary lookup and a masked language model.

    Every whitespace-separated word whose lower-cased form is not in the
    module-level ``dictionary`` is treated as misspelled. Misspelled words are
    processed one at a time, left to right: the word is replaced by
    ``[MASK]``, the module-level ``unmasker`` pipeline proposes candidates for
    that slot, and the candidate with the smallest edit distance to the
    original word is substituted.

    Args:
        sentence: Text to check. It is split on whitespace.

    Returns:
        The corrected sentence (words joined by single spaces), or
        ``sentence`` unchanged when every word is found in ``dictionary``.
        The findings are also printed, one "Incorrect word" line per
        replaced word followed by the predicted sentence.
    """
    words = sentence.split()
    misspelled = [
        idx for idx, word in enumerate(words)
        if word.lower() not in dictionary
    ]

    if not misspelled:
        print("The semantics of the sentence is correct")
        return sentence

    for idx in misspelled:
        original = words[idx]
        target = original.lower()

        # Query one masked slot at a time. With several [MASK] tokens in a
        # single input the pipeline does not return one flat candidate list,
        # so candidates could not be matched to a specific word. Earlier
        # corrections are already in `words`, so they serve as context here.
        masked_text = " ".join(words[:idx] + ['[MASK]'] + words[idx + 1:])
        predictions = unmasker(masked_text)

        if predictions:
            # Compare case-insensitively: the word may be upper-case while
            # the candidates are lower-case, which would otherwise inflate
            # every distance. min() keeps the first candidate on ties.
            best = min(
                predictions,
                key=lambda cand: edit_distance(cand['token_str'].lower(), target),
            )
            final_pred = best['token_str']
        else:
            # No candidates: leave the word as it was rather than blanking it.
            final_pred = original

        print("Incorrect word: ", original, " Corrected word: ", final_pred)
        words[idx] = final_pred

    corrected = " ".join(str(word) for word in words)
    print("Predicted Sentence:")
    print(corrected)
    return corrected

In [5]:
# Sample inputs containing deliberate misspellings, in mixed letter case.
test_sentences = [
    "I have been thinkyng of you",
    "WHY HAVE YOU COLLEDD ME",
    "tell me woot happened",
    "He will picc up calls",
    "wot did u gift him",
]

# Echo each input, then let the checker print what it found.
for text in test_sentences:
    print("\nInput Text: ", text)
    transformer_spell_check(text)


Input Text:  I have been thinkyng of you
Incorrect word:  thinkyng  Corrected word:  thinking
Predicted Sentence:
I have been thinking of you

Input Text:  WHY HAVE YOU COLLEDD ME
Incorrect word:  COLLEDD  Corrected word:  called
Predicted Sentence:
WHY HAVE YOU called ME

Input Text:  tell me woot happened
Incorrect word:  woot  Corrected word:  what
Predicted Sentence:
tell me what happened

Input Text:  He will picc up calls
Incorrect word:  picc  Corrected word:  pick
Predicted Sentence:
He will pick up calls

Input Text:  wot did u gift him
The semantics of the sentence is correct
