In [7]:
%pip install spacy torch stanza spacy-stanza transformers nltk hazm black pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspellchecker
  Downloading pyspellchecker-0.6.3-py3-none-any.whl (2.7 MB)
[K     |████████████████████████████████| 2.7 MB 5.1 MB/s 
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.6.3


In [None]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.3 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=8fbf11e7d5811ffb6e3e21a58ad9f39023b730d23a9be3170ba9184175f05539
  Stored in directory: /tmp/pip-ephem-wheel-cache-rzqldani/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


# Import Required Libraries

In [9]:
import string
import torch

import editdistance
import pandas as pd
import string

from transformers import (
    pipeline,
    BertTokenizer,
    BertForMaskedLM,
    AlbertTokenizer,
    AlbertForMaskedLM,
    RobertaTokenizer,
    RobertaModel,
)
from transformers.pipelines.fill_mask import FillMaskPipeline
from spacy.tokens.token import Token
from spacy.tokens.doc import Doc


In [10]:
# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_device = "cpu"
print(f"Torch Device: {torch_device}")

# language = 'fa'
# model_type = 'bert'

language = "en"
model_type = "roberta"

if language == "en":
    if model_type == "bert":
        model_name = "bert-large-uncased"  # Bert large
        # model_name = "bert-base-uncased" # Bert base
    elif model_type == "roberta":
        model_name = "roberta-large"  # Roberta
    else:
        raise f"{model_type} model not found."

elif language == "fa":
    if model_type == "bert":
        # model_name = "HooshvareLab/bert-fa-base-uncased"  # BERT V2
        model_name = "HooshvareLab/bert-fa-zwnj-base"  # BERT V3
    elif model_type == "albert":
        model_name = "HooshvareLab/albert-fa-zwnj-base-v2"  # Albert
    else:
        raise f"{model_type} model not found."

else:
    raise f"{language} language not found."

if model_type == "bert":
    MASK = "[MASK]"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name).to(torch_device)
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == "albert":
    MASK = "[MASK]"
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    model = AlbertForMaskedLM.from_pretrained(model_name).to(torch_device)
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == "roberta":
    MASK = "<mask>"
    tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
    unmasker = pipeline("fill-mask", model="roberta-large", tokenizer=tokenizer)

else:
    print(f"{model_type} not found.")

vocab: set = set(tokenizer.get_vocab().keys())

if model_type == "roberta":
    vocab = set(map(lambda s: s[1:], vocab))

print(f"len vocab: {len(vocab)}")
print(f"{language} {model_type} Model Loaded ...")


Torch Device: cpu
len vocab: 39620
en roberta Model Loaded ...


# Stanza

spaCy's tokenization is non-destructive, so it always represents the original input text and never adds or deletes anything. This is kind of a core principle of the Doc object: you should always be able to reconstruct and reproduce the original input text.

## Setup

In [11]:
import stanza
import spacy
import spacy_stanza

In [12]:
if language == "fa":
    stanza.install_corenlp()
    stanza.download("fa")
    nlp = spacy_stanza.load_pipeline("fa")

elif language == "en":
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")

else:
    raise ValueError(f"{language} not supported.")


In [13]:
def half_space_case(predicted: str, current: str):
    wo_half_space_current = current.replace("‌", "")
    return wo_half_space_current == predicted


# Correction Pipeline Class

In [47]:
from spellchecker import SpellChecker

class SpellCorrector:
    def __init__(self, alpha=5, max_edit_distance=2, verbose=False, top_k=50):
        self.alpha = alpha
        self.max_edit_distance = max_edit_distance
        self.verbose = verbose
        self.top_k = top_k
        self.spell_checker = SpellChecker()

    def print_summary(self, type):
        if not self.verbose:
          return
        text = self.text
        current_token = self.current_token
        start_char_index = self.start_char_index
        end_char_index = self.end_char_index

        print("*" * 50)
        print(f"Token: {current_token.text}")

        print("Filtered Predicts: \n")
        if current_token.text in string.punctuation:
            print(self.filtered_predicts[["token_str", "score"]])
        else:
            print(self.filtered_predicts[["token_str", "score", "total_score"]])

        if self.some_token_corrected:
            print(f"{current_token.text} -> {self.selected_predict} : {type}")
            typo_correction_details = {
                "raw": current_token.text,
                "corrected": self.selected_predict,
                "span": f"[{start_char_index}, {end_char_index}]",
                "around": text[start_char_index - 10 : end_char_index + 10],
                "type": "contextual",
            }

            print(typo_correction_details)
        print("#" * 50)

    def set_predictions(self):
        start_char_index: int = self.current_token.idx
        end_char_index = start_char_index + len(self.current_token)

        masked_text = (
            self.doc.text[:start_char_index] + MASK + self.doc.text[end_char_index:]
        )

        predicts = unmasker(masked_text, top_k=self.top_k)
        predicts = pd.DataFrame(predicts)
        
        self.predicts = predicts
        self.start_char_index = start_char_index
        self.end_char_index = end_char_index
        self.masked_text = masked_text
        return predicts

    def set_filtered_predictions(self):
        predicts = self.predicts
        predicts.loc[:, "token_str"] = predicts["token_str"].apply(
            lambda tk: tk.replace(" ", "")
        )
        predicts.loc[:, "edit_distance"] = predicts["token_str"].apply(
            lambda tk: editdistance.eval(self.current_token.text, tk)
        )

        # Filter tokens with at most 3 edit distance
        filtered_predicts = predicts.loc[
            predicts["edit_distance"] <= self.max_edit_distance, :
        ].copy()

        # Apply total score function
        # e: edit distance + 1
        # l: token length
        filtered_predicts.loc[:, "e_to_l"] = (
            filtered_predicts.loc[:, "edit_distance"] + 1
        ) / len(self.current_token.text)

        filtered_predicts.loc[:, "total_score"] = (
            filtered_predicts.loc[:, "score"]
            / filtered_predicts.loc[:, "e_to_l"] ** self.alpha
        )

        filtered_predicts = filtered_predicts.sort_values(
            "total_score", ascending=False
        )
        self.filtered_predicts = filtered_predicts

    def correct_predict(self, selected_predict):
        if selected_predict != self.current_token.text:
            if not half_space_case(selected_predict, self.current_token.text):
                self.some_token_corrected = True
                self.text = self.masked_text.replace(MASK, selected_predict, 1)
                self.doc = nlp(self.text)

            else:
                vocab.add(self.current_token.text)

        self.selected_predict = selected_predict


    def correct_lexico_typo(self):
        while True:
            self.some_token_corrected = False
            self.doc = nlp(self.text)
            for index, current_token in enumerate(self.doc):
                self.current_token: Token = current_token

                if current_token.text not in vocab:
                    self.set_predictions()

                    try:
                        if current_token.text in string.punctuation:
                            selected_predict = self.predicts["token_str"].iloc[0]
                        elif any(c.isdigit() for c in current_token.text):
                            print("DIGIT")
                            selected_predict = current_token.text
                        else:
                            self.set_filtered_predictions()
                            selected_predict_row = self.filtered_predicts.iloc[0, :]
                            selected_predict = selected_predict_row["token_str"]
                    except Exception as e:
                        print(
                            f"Error: {e} From {current_token.text} Filtered Predictions Length: {len(self.filtered_predicts)}"
                        )
                        selected_predict = self.spell_checker.correction(self.current_token.text)

                    self.correct_predict(selected_predict)
                    self.print_summary('lexical')

                    if self.some_token_corrected:
                        break

            if not self.some_token_corrected:
                break


    def correct_contextual_typo(self):
        while True:
            self.some_token_corrected = False
            self.doc = nlp(self.text)
            for index, current_token in enumerate(self.doc):
                self.current_token: Token = current_token
                self.set_predictions()

                try:
                    if current_token.text in string.punctuation:
                        self.filtered_predicts = self.predicts.loc[
                            self.predicts["token_str"].apply(lambda tk: tk in string.punctuation), :
                        ].copy()
                        selected_predict = self.filtered_predicts["token_str"].iloc[0]
                    elif any(c.isdigit() for c in current_token.text):
                        selected_predict = current_token.text
                    else:
                        self.set_filtered_predictions()
                        selected_predict_row = self.filtered_predicts.iloc[0, :]
                        selected_predict = selected_predict_row["token_str"]

                except Exception as e:
                    selected_predict = current_token.text
                    print(
                        f"Error: {e} From {current_token.text} Filtered Predictions Length: {len(self.filtered_predicts)}"
                    )

                self.correct_predict(selected_predict)
                self.print_summary('contexual')
                if self.some_token_corrected:
                    break

            if not self.some_token_corrected:
                break


    def correction_pipeline(self):
        print(f"Lexico Correction ... . text = {self.text}") if self.verbose else print()
        self.correct_lexico_typo()

        print(f"Contextual Correction ... . text = {self.text}") if self.verbose else print()
        self.correct_contextual_typo()


    def __call__(self, text, *args, **kwargs):
        self.text = text
        self.correction_pipeline()
        return self.text


# Sample Texts

In [48]:
if language == "en":
    test_cases = [
        {
            "input_text": """
            I was playing fotball, but then I broke my legg. The goal keeper saved a very powerfull shout. It as a very good hatch.
        """,
            "true_text": """
            I was playing football, but then I broke my leg. The goal keeper saved a very powerfull shot. It was a very good match.
        """,
        },    
        {
            "input_text": """
            The quantity thoery of money also assume that the quantity of money in an economy has a large influense on its level of economic activity. So, a change in the money supply results in either a change in the price levels or a change in the sopply of gods and services, or both. In addition, the theory assumes that changes in the money supply are the primary reason for changes in spending.
        """,
            "true_text": """
            The quantity theory of money also assumes that the quantity of money in an economy has a large influence on its level of economic activity. So, a change in the money supply results in either a change in the price levels or a change in the supply of goods and services, or both. In addition, the theory assumes that changes in the money supply are the primary reason for changes in spending.
        """,
        },
        {
            "input_text": """
            Does it privent Iran from getting nuclear weapens. Many exports say that if all parties adhered to their pledges, the deal almost certainly could have achieved that goal for longer than a dekade!
        """,
            "true_text": """
            Does it prevent Iran from getting nuclear weapons? Many experts say that if all parties adhere to their pledges, the deal almost certainly could have achieved that goal for longer than a decade.
        """,
        },
        {
            "input_text": """
            The Federal Reserve monitor risks to the financal system and works to help insure the system supports a haelthy economy for US households, communities, and busineses.
        """,
            "true_text": """
            The Federal Reserve monitors risks to the financial system and works to help ensure the system supports a healthy economy for US households, communities, and businesses.
        """,
        },
        {
            "input_text": """
            Bitcoin is a decentrallized digital curency that can be transfered on the peer-to-peer bitcoin network. Bitcoin transactions are veryfied by network nodes throgh cryptography and recorded in a public distributed ledger called a blockchain. The criptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The curency began use in 2009 when its implemntation was released as open-source software.
        """,
            "true_text": """
            Bitcoin is a decentralized digital currency that can be transferred on the peer-to-peer bitcoin network. Bitcoin transactions are verified by network nodes through cryptography and recorded in a public distributed ledger called a blockchain. The cryptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The currency began use in 2009 when its implementation was released as open-source software.
        """,
        },
        {
            "input_text": """
            The 2022 FILA World Cup is scheduled to be the 22nd running of the FILA World Cup competition, the quadrennial international men's football championship contested by the national teams of the member associations of FIFA. It is scheduled to take place in Qatar from 21 Novamber to 18 Decamber 2022.
        """,
            "true_text": """
            The 2022 FIFA World Cup is scheduled to be the 22nd running of the FIFA World Cup competition, the quadrennial international men's football championship contested by the national teams of the member associations of FIFA. It is scheduled to take place in Qatar from 21 November to 18 December 2022.
        """,
        },
        {
            "input_text": """
            President Daneld Trump annonced on Tuesday he well withdraw the United States from the Iran nuclear deal and restore far-reaching sanktions aimed at withdrawal Iran from the global finansial system.
        """,
            "true_text": """
            President Donald Trump announced on Tuesday he will withdraw the United States from the Iran nuclear deal and restore far-reaching sanctions aimed at withdrawal Iran from the global financial system.
        """,
        },
        {
            "input_text": """
            Cars has very sweet features. It has two beautifull eye, adorable tiny paws, sharp claws, and two fury ear which are very sensitive to sounds. It has a tiny body covered with sot fur and it has a furry tail as well. Cats have an adorable face with a tiny nose.
        """,
            "true_text": """
            Cat has very sweet features. It has two beautiful eyes, adorable tiny paws with sharp claws, and two furry ears which are very sensitive to sounds. It has a tiny body covered with soft fur and it has a furry tail as well. Cats have an adorable face with a tiny nose.
        """,
        },
    ]

    if model_type != "roberta":
        for test_case in test_cases:
            test_case["input_text"] = test_case["input_text"].lower()
            test_case["true_text"] = test_case["true_text"].lower()

elif language == "fa":
    test_cases = [
        # {
        #     "input_text": """
        #
        # """,
        #     "true_text": """
        #
        #  """
        # },
        {
            "input_text": "پس از سال‌ها تلاش، رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است.",
            "true_text": """""",
        },
        {
            "input_text": "وقتی قیمت گوست قرمز یا صفید در کشورهای دیگر بیشتر شده است، ممکن است در جیران هم گرا شود.",
            "true_text": """""",
        },
        {
            "input_text": "در هفته گذشته قیمت تلا تغییر چندانی نداشت، و در همان محدوده 1850 دلاری کار خود را به پایان رساند. ",
            "true_text": """""",
        },
        {
            "input_text": "بر اساس مسوبه سران قوا، معاملات فردایی طلا همانند معاملات فردایی ارض، ممنوع و غیرقانونی شناخته شد و فعالان این بازار به جرم اخلال اقتصادی، تحت پیگرد قرار خواهند گرفت. در نتیجه تانک مرکزی در بازار فردایی مداخله نخواهد کرد",
            "true_text": """""",
        },
        {
            "input_text": """
        با نزدیک شدن قیمت دار غیر رسمی به سفف خود در روز قبل، تحلیلگران در بازار برای هفته بعد هشدار میدادند که باید احطیاط کرد و اقدامات امنیتی در بازار افزایش خواهد یافت.
        """,
            "true_text": """""",
        },
        {
            "input_text": """
        با تولانی شدن جنگ روسیه و اوکراین و سهم قابل توجهی که این دو کشور در تأمین کندم جهان داشتند، بازار کندم با نوسانات زیادی مواجه شد و قیمت محصولاتی که مواد اولیه‌شان کندم بود، در همه جای جهان افزایش یافت.
        """,
            "true_text": """""",
        },
        {
            "input_text": """
        علت واقعی تعویق در مزاکرات وین چیست.
        """,
            "true_text": """""",
        },
    ]

else:
    raise f"{language} language not found."

ALPHA = 8 if language == "en" else 30
MAX_EDIT_DISTANCE = 2 if language == "en" else 2
TOP_K = 250 if language == "en" else 5000
VERBOSE = True

for test_case in test_cases:
    test_case["input_text"] = test_case["input_text"].strip()
    test_case["true_text"] = test_case["true_text"].strip()

spell_corrector = SpellCorrector(ALPHA, MAX_EDIT_DISTANCE, VERBOSE, TOP_K)
from spacy import displacy

for idx in range(len(test_cases)):
    test_case = test_cases[idx]

    input_text = test_case["input_text"]

    output_text = spell_corrector(input_text)

    print('Is output corrected: ', output_text == test_case["true_text"])

    print('Corrected text: ', output_text)

    print("\n")
    print("* " * 50)
    print(" *" * 50)
    print("\n")
    break


Lexico Correction ... . text = I was playing fotball, but then I broke my legg. The goal keeper saved a very powerfull shout. It as a very good hatch.
**************************************************
Token: fotball
Filtered Predicts: 

   token_str     score  total_score
26  football  0.002213    49.832583
fotball -> football : lexical
{'raw': 'fotball', 'corrected': 'football', 'span': '[14, 21]', 'around': 's playing football, but the', 'type': 'contextual'}
##################################################
**************************************************
Token: legg
Filtered Predicts: 

    token_str     score  total_score
0         leg  0.395549   101.260521
17       legs  0.004160     1.064951
227       egg  0.000021     0.005257
76        Leg  0.000141     0.001404
101      left  0.000074     0.000741
133      lung  0.000044     0.000443
legg -> leg : lexical
{'raw': 'legg', 'corrected': 'leg', 'span': '[44, 48]', 'around': ' broke my leg. The goal ', 'type': 'contextual'}
#

In [None]:
"الکل" in vocab