# Import Required Libraries

In [1]:
import string

from transformers import pipeline, BertTokenizer, BertForMaskedLM, AlbertTokenizer, AlbertForMaskedLM, RobertaTokenizer, RobertaModel
from transformers.pipelines.fill_mask import FillMaskPipeline
import torch

from spacy.tokens.token import Token
from spacy.tokens.doc import Doc
import editdistance
import pandas as pd
import string

  return torch._C._cuda_getDeviceCount() > 0


In [87]:
# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_device = 'cpu'
print(f"Torch Device: {torch_device}")

language = 'en'
model_type = 'bert'

# EN
model_name = "bert-large-uncased" # Bert large
# model_name = "bert-base-uncased" # Bert base
# model_name = "roberta-large"  # Roberta

# FA
# model_name = "HooshvareLab/albert-fa-zwnj-base-v2" # Albert
# model_name = "HooshvareLab/bert-fa-base-uncased" # BERT V2
# model_name = "HooshvareLab/bert-fa-zwnj-base" # BERT V3

if model_type == 'bert':
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name).to(
        torch_device) if language == 'en' else BertForMaskedLM.from_pretrained(model_name).to(torch_device)
    MASK = "[MASK]"
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == 'albert':
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    model = AlbertForMaskedLM.from_pretrained(model_name).to(
        torch_device) if language == 'en' else AlbertForMaskedLM.from_pretrained(model_name).to(torch_device)
    MASK = "[MASK]"
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == 'roberta':
    MASK = "<mask>"
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    unmasker = pipeline('fill-mask', model='roberta-large')

else:
    print(f"{model_type} not found.")

vocab: set = set(tokenizer.get_vocab().keys())

if model_type == 'roberta':
    vocab = set(map(lambda s: s[1:], vocab))

print(f"{language} {model_type} Model Loaded ...")

Torch Device: cpu


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


en bert Model Loaded ...


# Stanza

spaCy's tokenization is non-destructive, so it always represents the original input text and never adds or deletes anything. This is kind of a core principle of the Doc object: you should always be able to reconstruct and reproduce the original input text.

## Setup

In [88]:
import stanza
import spacy
import spacy_stanza

In [4]:
if language == 'fa':
    stanza.install_corenlp()
    stanza.download('fa')
    nlp = spacy_stanza.load_pipeline("fa")

elif language == 'en':
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")

else:
    raise ValueError(f"{language} not supported.")


## Correct Lexico Typo

In [89]:
def lexico_typo_correction(
        text,
        max_edit_distance_to_length_ratio=0.45,
        max_edit_distance=2,
        min_score=1e-7,
        top_k=10,
        verbose=False,
):
    while True:
        some_token_corrected = False
        doc = nlp(text)
        for index, current_token in enumerate(doc):
            current_token: Token
            start_char_index: int = current_token.idx
            end_char_index = start_char_index + len(current_token)

            if current_token.text not in vocab:
                masked_text = doc.text[:start_char_index] + MASK + doc.text[end_char_index:]

                predicts = unmasker(masked_text, top_k=top_k)

                ### Select Token From Predicts
                predicts = pd.DataFrame(predicts)

                predicts['token_str'] = predicts['token_str'].apply(lambda tk: tk.replace(" ", ""))
                predicts['edit_distance'] = predicts['token_str'].apply(lambda tk: editdistance.eval(current_token.text, tk))

                predicts['edit_distance_to_len_ratio'] = predicts['edit_distance'] / len(current_token.text)

                selected_predicts = predicts[(predicts['edit_distance_to_len_ratio'] <= max_edit_distance_to_length_ratio) &
                                             (predicts['edit_distance'] <= max_edit_distance) & (predicts['score'] >= min_score)]

                try:
                    selected_predict = selected_predicts['token_str'].iloc[0]
                except:
                    selected_predict = current_token.text
                    if selected_predict not in vocab:
                        vocab.add(selected_predict)

                if selected_predict != current_token.text:
                    some_token_corrected = True
                    result_text = masked_text.replace(MASK, selected_predict, 1)
                    text = result_text

                if verbose:

                    # print("*" * 50)
                    # print(f"Token: {current_token.text}")
                    #
                    # print("Predicts: \n")
                    # print(predicts[['token_str', 'score']])
                    #
                    # print("Filtered Predicts: \n")
                    # print(selected_predicts[['token_str', 'score']])
                    # print(f"{current_token.text} -> {selected_predict}")

                    typo_correction_details = {
                        "raw": current_token.text,
                        "corrected": selected_predict,
                        "span": f"[{start_char_index}, {end_char_index}]",
                        "type": "lexical"
                    }

                    print(typo_correction_details)

                if some_token_corrected:
                    break

        if not some_token_corrected:
            break

    return text


## Correct Contextual Typo



In [90]:
def contextual_typo_correction(
        text,
        max_edit_distance_to_length_ratio=0.45,
        max_edit_distance=2,
        min_score=1e-7,
        top_k=10,
        verbose=False,
):
    while True:
        some_token_corrected = False
        doc = nlp(text)
        for index in range(len(doc)):
            current_token: Token = doc[index]

            start_char_index = current_token.idx
            end_char_index = start_char_index + len(current_token)

            masked_text = doc.text[:start_char_index] + MASK + doc.text[end_char_index:]

            predicts = unmasker(masked_text, top_k=top_k)

            ### Select Token From Predicts
            predicts = pd.DataFrame(predicts)

            predicts['token_str'] = predicts['token_str'].apply(lambda tk: tk.replace(" ", ""))
            predicts['edit_distance'] = predicts['token_str'].apply(
                lambda tk: editdistance.eval(current_token.text, tk))
            predicts['edit_distance_to_len_ratio'] = predicts['edit_distance'] / len(current_token.text)

            try:
                if current_token.text in string.punctuation:
                    selected_predict = predicts['token_str'].iloc[0]

                elif current_token.text.isdigit():
                    selected_predict = current_token.text

                else:
                    selected_predicts = predicts[
                        (predicts['edit_distance_to_len_ratio'] <= max_edit_distance_to_length_ratio) &
                        (predicts['edit_distance'] <= max_edit_distance) &
                        (predicts['score'] >= min_score)]

                    selected_predict = selected_predicts.sort_values('score')['token_str'].iloc[0]

                    current_token_text_score = selected_predicts['score'][selected_predicts['token_str'] == current_token.text]
                    selected_token_text_score = selected_predicts.sort_values('score')['score'].iloc[0]

                    if current_token.text != selected_predict:

                        if current_token.text in selected_predicts['token_str'].values:
                            print("\n")
                            print(f"current_token_text_score: {current_token_text_score}, selected_token_text_score: {selected_token_text_score}")
                            print("\n")

                            if selected_token_text_score / current_token_text_score > 50:
                                selected_predict = selected_predicts.sort_values('edit_distance_to_len_ratio')['token_str'].iloc[0]

                            else:
                                selected_predict = current_token.text

                        else:
                            selected_predict = selected_predicts.sort_values('edit_distance_to_len_ratio')['token_str'].iloc[0]

            except:
                selected_predict = current_token.text

            if selected_predict != current_token.text:
                some_token_corrected = True
                result_text = masked_text.replace(MASK, selected_predict, 1)
                text = result_text

            if verbose:
                print("*" * 50)
                print(f"Token: {current_token.text}")

                print("Predicts: \n")
                print(predicts[['token_str', 'score']])

                print("Filtered Predicts: \n")
                print(selected_predicts[['token_str', 'score', 'edit_distance_to_len_ratio']])
                print(f"{current_token.text} -> {selected_predict}")

                if current_token.text != selected_predict:
                    typo_correction_details = {
                        "raw": current_token.text,
                        "corrected": selected_predict,
                        "span": f"[{start_char_index}, {end_char_index}]",
                        "type": "contextual"
                    }

                    print(typo_correction_details)

            if some_token_corrected:
                break

        if not some_token_corrected:
            break

    return text

# Correction Pipeline Class

In [91]:
class SpellCorrector:

    def __init__(
            self,
            max_edit_distance_to_length_ratio=0.45,
            max_edit_distance=2,
            min_score=1e-7,
            verbose=False,
            top_k=50
    ):
        self.max_edit_distance_to_length_ratio = max_edit_distance_to_length_ratio
        self.max_edit_distance = max_edit_distance
        self.min_score = min_score
        self.verbose = verbose
        self.top_k = top_k

    def _lexico_typo_correction(self, text):
        return lexico_typo_correction(text, self.max_edit_distance_to_length_ratio, self.max_edit_distance,
                                      self.min_score, self.top_k, self.verbose, )

    def _contextual_typo_correction(self, text):
        return contextual_typo_correction(text, self.max_edit_distance_to_length_ratio, self.max_edit_distance,
                                          self.min_score, self.top_k, self.verbose, )

    def correction_pipeline(self, text):

        print(f"raw       : {text}")
        # print("Lexico Correction ...") if self.verbose else print()
        corrected_text = self._lexico_typo_correction(text)

        # print("Contextual Correction ...") if self.verbose else print()
        corrected_text = self._contextual_typo_correction(corrected_text)

        print(f"corrected : {corrected_text}")
        return corrected_text

    def __call__(self, text, *args, **kwargs):
        return self.correction_pipeline(text)


# Test On Sample Texts

In [94]:
MAX_EDIT_DISTANCE_TO_LEN_RATIO = 0.4
MAX_EDIT_DISTANCE = 2
MIN_SCORE = 0.0001
TOP_K = 250
VERBOSE = True

if language == 'en':
    # input_text = "The capitan of Iran is tehran."
    # input_text = "i am speeking english very wall."
    # input_text = "He was stadying english for the finall exam."
    # text = "I'm studying [MASK] learning in my computer class."
    # text = "I'm a very [MASK] player in football."
    # input_text = "He drove a cat."
    # text = "do you want to watch tv."
    # text = "I love playing [MASK]."

    input_text = """
        The quantity thoery of money also assume that the quantity of money in an economy has a large influense on its level of economic activity. So, a change in the money supply results in either a change in the price levels or a change in the sopply of gods and services, or both. In addition, the theory assumes that changes in the money supply are the primary reason for changes in spending.
    """

    input_text = """
        Does it privent Iran from getting nuclear weapens. Many exports say that if all parties adhered to their pledges, the deal almost certainly could have achieved that goal for longer than a dekade!
    """

    input_text = """
        The Federal Reserve monitor risks to the financal system and works to help ensure the system supports a haelthy economy for U.S. households, communities, and busineses.
    """

    input_text = """
        Bitcoin is a decentrallized digital curency that can be transfered on the peer-to-peer bitcoin network. Bitcoin transactions are veryfied by network nodes throgh cryptography and recorded in a public distributed ledger called a blockchain. The criptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The curency began use in 2009 when its implemntation was released as open-source software.
    """

    input_text = """
        The 2022 FILA World Cup is scheduled to be the 22nd running of the FILA World Cup competition, the quadrennial international men's football championship contested by the national teams of the member associations of FIFA. It is scheduled to take place in Qatar from 21 Novamber to 18 Decamber 2022.
    """

    input_text = """
        President Daneld Trump annonced on Tuesday he will withdraw the United States from the Iran nuclear deal and restore far-reaching sanktions aimed at severing Iran from the global finansial system.
    """

    input_text = """
        Cars have very sweet features. It has two beautifull eye, adorably tiny paws, sharp claws, and two perky ear which are very sensitive to sounds. It has a tiny body covered with smoot fur and it has a furry tail as well. Cats have an adorable face with a tiny nose, a big mouth and a few whiskers under its nose.
    """


    if model_type != 'roberta':
        input_text = input_text.lower()

if language == 'fa':
    input_text = "امروز در استادیوم آزادی تیم ملی ایران و روسیه مسایقه می‌دهند."
    input_text = "پس از سال‌ها تلاش رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است."
    input_text = "هفته آینده احتمالا توافق بسته‌ای امضا می‌شود."
    input_text = "اهل کدام کشور هستی."
    input_text = "سن شما چقدر است."
    input_text = "وقتی قیمت گوست قرمز یا صفید در کشورهای دیگر بیشتر شده است، ممکن است در جیران هم گرا شود."
    input_text = "در هفته گذشته قیمت تلا تغییر چندانی نداشت، و در همان محدوده 1850 دلاری کار خود را به پایان رساند. "
    input_text = "هدف از زندگانی چیست!"
    input_text = "همه رأس ساعت 3 در جلسه حاضر باشند."

    input_text = "بر اساس مسوبه سران قوا، معاملات فردایی طلا همانند معاملات فردایی ارض، ممنوع و غیرقانونی شناخته شد و فعالان این بازار به جرم اخلال اقتصادی، تحت پیگرد قرار خواهند گرفت. در نتیجه تانک مرکزی در بازار فردایی مداخله نخواهد کرد"

    input_text = """
        با نزدیک شدن قیمت دار غیر رسمی به سفف خود در روز قبل، تحلیلگران در بازار برای هفته بعد هشدار میدادند که باید احطیاط کرد و اقدامات امنیتی در بازار افزایش خواهد یافت.
    """

    input_text = """
    با تولانی شدن جنگ روسیه و اوکراین و سهم قابل توجهی که این دو کشور در تأمین کندم جهان داشتند، بازار کندم با نوسانات زیادی مواجه شد و قیمت محصولاتی که مواد اولیه‌شان کندم بود، در همه جای جهان افزایش یافت.
    """

    input_text = """
        علت واقعی تعویق در مزاکرات وین چیست.
    """

input_text = input_text.strip()

spell_corrector = SpellCorrector(MAX_EDIT_DISTANCE_TO_LEN_RATIO, MAX_EDIT_DISTANCE, MIN_SCORE, VERBOSE, TOP_K)

doc = nlp(input_text)

from spacy import displacy

# displacy.render(doc, style="dep")

print("Spell Correction for text sentences:")
result = spell_corrector(input_text)
result

Spell Correction for text sentences:
raw       : cars have very sweet features. it has two beautifull eye, adorably tiny paws, sharp claws, and two perky ear which are very sensitive to sounds. it has a tiny body covered with smoot fur and it has a furry tail as well. cats have an adorable face with a tiny nose, a big mouth and a few whiskers under its nose.
{'raw': 'smoot', 'corrected': 'soft', 'span': '[177, 182]', 'type': 'lexical'}
**************************************************
Token: cars
Predicts: 

    token_str     score
0        cats  0.733804
1        they  0.194963
2        dogs  0.010025
3      humans  0.005630
4          it  0.005564
..        ...       ...
245     bulls  0.000009
246      must  0.000009
247  biscuits  0.000009
248    models  0.000009
249    fruits  0.000009

[250 rows x 2 columns]
Filtered Predicts: 

   token_str     score  edit_distance_to_len_ratio
0       cats  0.733804                        0.25
60      cars  0.000120                        0.00

'cats have very sweet features. it has two beautiful eye, adorably tiny pawswith sharp claws, and two very ear which are very sensitive to sounds. it has a tiny body covered with soft fur and it has a furry tail as well. cats have an adorable face with a tiny nose, a big mouth and a few whiskers under its nose.'

In [71]:
"severing" == "severing"

True