# Import Required Libraries

In [2]:
import string

from transformers import pipeline, BertTokenizer, BertForMaskedLM, AlbertTokenizer, AlbertForMaskedLM, RobertaTokenizer, RobertaModel
from transformers.pipelines.fill_mask import FillMaskPipeline
import torch

from spacy.tokens.token import Token
from spacy.tokens.doc import Doc
import editdistance
import pandas as pd
import string

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_device = 'cpu'
print(f"Torch Device: {torch_device}")

language = 'en'
model_type = 'roberta'

# EN

if model_type == 'bert':
    model_name = "bert-large-uncased"  # Bert large
    # model_name = "bert-base-uncased" # Bert base

if model_type == 'roberta':
    model_name = "roberta-large"  # Roberta

# FA
# model_name = "HooshvareLab/albert-fa-zwnj-base-v2" # Albert
# model_name = "HooshvareLab/bert-fa-base-uncased" # BERT V2
# model_name = "HooshvareLab/bert-fa-zwnj-base" # BERT V3

if model_type == 'bert':
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name).to(
        torch_device) if language == 'en' else BertForMaskedLM.from_pretrained(model_name).to(torch_device)
    MASK = "[MASK]"
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == 'albert':
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    model = AlbertForMaskedLM.from_pretrained(model_name).to(
        torch_device) if language == 'en' else AlbertForMaskedLM.from_pretrained(model_name).to(torch_device)
    MASK = "[MASK]"
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == 'roberta':
    MASK = "<mask>"
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    unmasker = pipeline('fill-mask', model='roberta-large')

else:
    print(f"{model_type} not found.")

vocab: set = set(tokenizer.get_vocab().keys())

if model_type == 'roberta':
    vocab = set(map(lambda s: s[1:], vocab))

print(f"{language} {model_type} Model Loaded ...")

Torch Device: cpu
en roberta Model Loaded ...


In [5]:
len(vocab)

39620

# Stanza

spaCy's tokenization is non-destructive, so it always represents the original input text and never adds or deletes anything. This is kind of a core principle of the Doc object: you should always be able to reconstruct and reproduce the original input text.

## Setup

In [6]:
import stanza
import spacy
import spacy_stanza

In [7]:
if language == 'fa':
    stanza.install_corenlp()
    stanza.download('fa')
    nlp = spacy_stanza.load_pipeline("fa")

elif language == 'en':
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")

else:
    raise ValueError(f"{language} not supported.")


In [8]:
alpha = 10

## Correct Lexico Typo

In [51]:
def lexico_typo_correction(
        text,
        max_edit_distance_to_length_ratio=0.45,
        max_edit_distance=2,
        min_score=1e-7,
        top_k=10,
        verbose=False,
):
    while True:
        some_token_corrected = False
        doc = nlp(text)
        for index, current_token in enumerate(doc):
            current_token: Token
            start_char_index: int = current_token.idx
            end_char_index = start_char_index + len(current_token)

            if current_token.text not in vocab:
                masked_text = doc.text[:start_char_index] + MASK + doc.text[end_char_index:]

                predicts = unmasker(masked_text, top_k=top_k)

                # Select token from predicts
                predicts = pd.DataFrame(predicts)

                predicts.loc[:, 'token_str'] = predicts['token_str'].apply(lambda tk: tk.replace(" ", ""))
                predicts.loc[:, 'edit_distance'] = predicts['token_str'].apply(lambda tk: editdistance.eval(current_token.text, tk))

                # Filter tokens with at most 3 edit distance
                filtered_predicts = predicts.loc[predicts['edit_distance'] <= 3, :].copy()

                # Apply total score function
                # e: edit distance + 1
                # l: token length
                filtered_predicts.loc[:, 'e_to_l'] = (filtered_predicts.loc[:, 'edit_distance'] + 1) / len(current_token.text)

                filtered_predicts.loc[:, 'total_score'] = filtered_predicts.loc[:, 'score'] / filtered_predicts.loc[:, 'e_to_l'] * alpha

                filtered_predicts = filtered_predicts.sort_values('total_score', ascending=False)

                try:
                    selected_predict_row = filtered_predicts.iloc[0, :]
                    selected_predict = selected_predict_row['token_str']
                except:
                    print(f"\n ** filtered tokens size is 0. ** \n")

                    from spellchecker import SpellChecker
                    spell = SpellChecker()
                    selected_predict = spell.correction(current_token.text)

                if selected_predict != current_token.text:
                    some_token_corrected = True
                    result_text = masked_text.replace(MASK, selected_predict, 1)
                    text = result_text

                if verbose:
                    print("*" * 50)
                    print(f"Token: {current_token.text}")

                    print("Filtered Predicts: \n")
                    print(filtered_predicts[['token_str', 'score', 'total_score']])

                    print(f"{current_token.text} -> {selected_predict} : lexical")


                    typo_correction_details = {
                        "raw": current_token.text,
                        "corrected": selected_predict,
                        "span": f"[{start_char_index}, {end_char_index}]",
                        "type": "lexical"
                    }

                    # print(typo_correction_details)

                if some_token_corrected:
                    break

        if not some_token_corrected:
            break

    return text


## Correct Contextual Typo



In [56]:
def contextual_typo_correction(
        text,
        max_edit_distance_to_length_ratio=0.45,
        max_edit_distance=2,
        min_score=1e-7,
        top_k=10,
        verbose=False,
):
    while True:
        some_token_corrected = False
        doc = nlp(text)
        for index in range(len(doc)):
            current_token: Token = doc[index]

            start_char_index = current_token.idx
            end_char_index = start_char_index + len(current_token)

            masked_text = doc.text[:start_char_index] + MASK + doc.text[end_char_index:]

            predicts = unmasker(masked_text, top_k=top_k)

            try:
                if current_token.text in string.punctuation:
                    selected_predict = predicts['token_str'].iloc[0]

                elif current_token.text.isdigit():
                    selected_predict = current_token.text

                else:
                    ### Select Token From Predicts
                    predicts = pd.DataFrame(predicts)

                    predicts.loc[:, 'token_str'] = predicts['token_str'].apply(lambda tk: tk.replace(" ", ""))
                    predicts.loc[:, 'edit_distance'] = predicts['token_str'].apply(lambda tk: editdistance.eval(current_token.text, tk))

                    # Filter tokens with at most 3 edit distance
                    filtered_predicts = predicts.loc[predicts['edit_distance'] <= 3, :].copy()

                    # Apply total score function
                    # e: edit distance + 1
                    # l: token length
                    filtered_predicts.loc[:, 'e_to_l'] = (filtered_predicts.loc[:, 'edit_distance'] + 1) / len(current_token.text)

                    filtered_predicts.loc[:, 'total_score'] = filtered_predicts.loc[:, 'score'] / filtered_predicts.loc[:, 'e_to_l'] * alpha

                    filtered_predicts = filtered_predicts.sort_values('total_score', ascending=False)
                    selected_predict_row = filtered_predicts.iloc[0, :]

                    selected_predict = selected_predict_row['token_str']

                    current_token_text_tot_score = filtered_predicts.loc[filtered_predicts['token_str'] == current_token.text, 'total_score']
                    selected_token_text_tot_score = selected_predict['total_score']

                    print(f"current_token_text_total_score: {current_token_text_tot_score}, selected_token_total_score: {selected_token_text_tot_score}")

            except:
                selected_predict = current_token.text

            if selected_predict != current_token.text:
                some_token_corrected = True
                result_text = masked_text.replace(MASK, selected_predict, 1)
                text = result_text

            if verbose:
                print("*" * 50)
                print(f"Token: {current_token.text}")

                print("Filtered Predicts: \n")
                print(filtered_predicts[['token_str', 'score', 'total_score']])

                print(f"{current_token.text} -> {selected_predict} : contextual")

                if current_token.text != selected_predict:
                    typo_correction_details = {
                        "raw": current_token.text,
                        "corrected": selected_predict,
                        "span": f"[{start_char_index}, {end_char_index}]",
                        "type": "contextual"
                    }

                    # print(typo_correction_details)

            if some_token_corrected:
                break

        if not some_token_corrected:
            break

    return text

# Correction Pipeline Class

In [55]:
class SpellCorrector:

    def __init__(
            self,
            max_edit_distance_to_length_ratio=0.45,
            max_edit_distance=2,
            min_score=1e-7,
            verbose=False,
            top_k=50
    ):
        self.max_edit_distance_to_length_ratio = max_edit_distance_to_length_ratio
        self.max_edit_distance = max_edit_distance
        self.min_score = min_score
        self.verbose = verbose
        self.top_k = top_k

    def _lexico_typo_correction(self, text):
        return lexico_typo_correction(text, self.max_edit_distance_to_length_ratio, self.max_edit_distance,
                                      self.min_score, self.top_k, self.verbose, )

    def _contextual_typo_correction(self, text):
        return contextual_typo_correction(text, self.max_edit_distance_to_length_ratio, self.max_edit_distance,
                                          self.min_score, self.top_k, self.verbose, )

    def correction_pipeline(self, text):
        print(f"raw       : {text}")
        # print("Lexico Correction ...") if self.verbose else print()
        corrected_text = self._lexico_typo_correction(text)

        # print("Contextual Correction ...") if self.verbose else print()
        corrected_text = self._contextual_typo_correction(corrected_text)

        print(f"corrected : {corrected_text}")
        return corrected_text

    def __call__(self, text, *args, **kwargs):
        return self.correction_pipeline(text)


# Test On Sample Texts

In [57]:
MAX_EDIT_DISTANCE_TO_LEN_RATIO = 0.4
MAX_EDIT_DISTANCE = 3
MIN_SCORE = 0.0
TOP_K = 250
VERBOSE = True

if language == 'en':
    # input_text = "The capitan of Iran is tehran."
    # input_text = "i am speeking english very wall."
    # input_text = "He was stadying english for the finall exam."
    # text = "I'm studying [MASK] learning in my computer class."
    # text = "I'm a very [MASK] player in football."
    # input_text = "He drove a cat."
    # text = "do you want to watch tv."
    # text = "I love playing [MASK]."

    input_text = """
        The quantity thoery of money also assume that the quantity of money in an economy has a large influense on its level of economic activity. So, a change in the money supply results in either a change in the price levels or a change in the sopply of gods and services, or both. In addition, the theory assumes that changes in the money supply are the primary reason for changes in spending.
    """

    input_text = """
        Does it privent Iran from getting nuclear weapens. Many exports say that if all parties adhered to their pledges, the deal almost certainly could have achieved that goal for longer than a dekade!
    """

    input_text = """
        The Federal Reserve monitor risks to the financal system and works to help ensure the system supports a haelthy economy for U.S. households, communities, and busineses.
    """

    input_text = """
        Bitcoin is a decentrallized digital curency that can be transfered on the peer-to-peer bitcoin network. Bitcoin transactions are veryfied by network nodes throgh cryptography and recorded in a public distributed ledger called a blockchain. The criptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The curency began use in 2009 when its implemntation was released as open-source software.
    """

    input_text = """
        The 2022 FILA World Cup is scheduled to be the 22nd running of the FILA World Cup competition, the quadrennial international men's football championship contested by the national teams of the member associations of FIFA. It is scheduled to take place in Qatar from 21 Novamber to 18 Decamber 2022.
    """

    input_text = """
        President Daneld Trump annonced on Tuesday he will withdraw the United States from the Iran nuclear deal and restore far-reaching sanktions aimed at withdrawal Iran from the global finansial system.
    """

    input_text = """
        Cars have very sweet features. It has two beautifull eye, adorable tiny paws, sharp claws, and two fury ear which are very sensitive to sounds. It has a tiny body covered with smoot fur and it has a furry tail as well. Cats have an adorable face with a tiny nose, a big mouth and a few whiskers under its nose.
    """

    # input_text = """
    #     I am going to stadiom.
    # """

    if model_type != 'roberta':
        input_text = input_text.lower()

if language == 'fa':
    input_text = "امروز در استادیوم آزادی تیم ملی ایران و روسیه مسایقه می‌دهند."
    input_text = "پس از سال‌ها تلاش رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است."
    input_text = "هفته آینده احتمالا توافق بسته‌ای امضا می‌شود."
    input_text = "اهل کدام کشور هستی."
    input_text = "سن شما چقدر است."
    input_text = "وقتی قیمت گوست قرمز یا صفید در کشورهای دیگر بیشتر شده است، ممکن است در جیران هم گرا شود."
    input_text = "در هفته گذشته قیمت تلا تغییر چندانی نداشت، و در همان محدوده 1850 دلاری کار خود را به پایان رساند. "
    input_text = "هدف از زندگانی چیست!"
    input_text = "همه رأس ساعت 3 در جلسه حاضر باشند."

    input_text = "بر اساس مسوبه سران قوا، معاملات فردایی طلا همانند معاملات فردایی ارض، ممنوع و غیرقانونی شناخته شد و فعالان این بازار به جرم اخلال اقتصادی، تحت پیگرد قرار خواهند گرفت. در نتیجه تانک مرکزی در بازار فردایی مداخله نخواهد کرد"

    input_text = """
        با نزدیک شدن قیمت دار غیر رسمی به سفف خود در روز قبل، تحلیلگران در بازار برای هفته بعد هشدار میدادند که باید احطیاط کرد و اقدامات امنیتی در بازار افزایش خواهد یافت.
    """

    input_text = """
    با تولانی شدن جنگ روسیه و اوکراین و سهم قابل توجهی که این دو کشور در تأمین کندم جهان داشتند، بازار کندم با نوسانات زیادی مواجه شد و قیمت محصولاتی که مواد اولیه‌شان کندم بود، در همه جای جهان افزایش یافت.
    """

    input_text = """
        علت واقعی تعویق در مزاکرات وین چیست.
    """

input_text = input_text.strip()

spell_corrector = SpellCorrector(MAX_EDIT_DISTANCE_TO_LEN_RATIO, MAX_EDIT_DISTANCE, MIN_SCORE, VERBOSE, TOP_K)
from spacy import displacy
# displacy.render(doc, style="dep")

print("Spell Correction for text sentences:")
result = spell_corrector(input_text)
result

Spell Correction for text sentences:
raw       : Cars have very sweet features. It has two beautifull eye, adorable tiny paws, sharp claws, and two perky ear which are very sensitive to sounds. It has a tiny body covered with smoot fur and it has a furry tail as well. Cats have an adorable face with a tiny nose, a big mouth and a few whiskers under its nose.
**************************************************
Token: beautifull
Filtered Predicts: 

    token_str     score  total_score
14  beautiful  0.014907     0.745338
beautifull -> beautiful : lexical
**************************************************
Token: perky
Filtered Predicts: 

    token_str     score  total_score
11      furry  0.010215     0.127687
21       pink  0.004458     0.055723
72       very  0.000765     0.012751
63     pretty  0.000836     0.010456
76      horny  0.000708     0.008844
94        pet  0.000562     0.007021
187      weak  0.000161     0.002008
204     curly  0.000140     0.001753
211      dark  0.000129

'Cars have very sweet features. It has two beautiful eye, adorable tiny paws, sharp claws, and two furry ear which are very sensitive to sounds. It has a tiny body covered with smoot fur and it has a furry tail as well. Cats have an adorable face with a tiny nose, a big mouth and a few whites under its nose.'

In [61]:
"whisker" in vocab

False