# Import Required Libraries

In [96]:
import string

from transformers import pipeline, BertTokenizer, BertForMaskedLM, AlbertTokenizer, AlbertForMaskedLM, RobertaTokenizer, RobertaModel
from transformers.pipelines.fill_mask import FillMaskPipeline
import torch

from spacy.tokens.token import Token
from spacy.tokens.doc import Doc
import editdistance
import pandas as pd
import string

In [97]:
# ---------------------------------------------------------------------------
# Configuration: device, language and masked-language-model family.
# Every later cell reads `language`, `model_type`, `MASK`, `tokenizer`,
# `unmasker` and `vocab` from this cell.
# ---------------------------------------------------------------------------
# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_device = 'cpu'
print(f"Torch Device: {torch_device}")

language = 'fa'
model_type = 'bert'

# language = 'en'
# model_type = 'roberta'

# Resolve the checkpoint name for the (language, model_type) pair.
# Unsupported combinations raise ValueError: raising a bare string is itself a
# TypeError in Python 3 and would hide the intended message.
if language == 'en':
    if model_type == 'bert':
        model_name = "bert-large-uncased"  # Bert large
        # model_name = "bert-base-uncased" # Bert base

    elif model_type == 'roberta':
        model_name = "roberta-large"  # Roberta

    else:
        raise ValueError(f"{model_type} model not found.")

elif language == 'fa':
    if model_type == 'bert':
        # model_name = "HooshvareLab/bert-fa-base-uncased"  # BERT V2
        model_name = "HooshvareLab/bert-fa-zwnj-base"  # BERT V3

    elif model_type == 'albert':
        model_name = "HooshvareLab/albert-fa-zwnj-base-v2"  # Albert

    else:
        raise ValueError(f"{model_type} model not found.")

else:
    raise ValueError(f"{language} language not found.")

# Load the tokenizer and build the fill-mask pipeline; MASK is the literal
# mask string that is spliced into the text for the chosen model family.
if model_type == 'bert':
    MASK = "[MASK]"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name).to(torch_device)
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == 'albert':
    MASK = "[MASK]"
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    model = AlbertForMaskedLM.from_pretrained(model_name).to(torch_device)
    unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

elif model_type == 'roberta':
    MASK = "<mask>"
    # Use the resolved checkpoint name instead of repeating the literal.
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    unmasker = pipeline('fill-mask', model=model_name, tokenizer=tokenizer)

else:
    # Fail here rather than with a NameError on `tokenizer` just below.
    raise ValueError(f"{model_type} not found.")

vocab: set = set(tokenizer.get_vocab().keys())

if model_type == 'roberta':
    # Drop the leading 'Ġ' marker where present; tokens without the marker are
    # kept intact instead of losing their first real character.
    vocab = {tok[1:] if tok.startswith('Ġ') else tok for tok in vocab}

print(f"{language} {model_type} Model Loaded ...")

Torch Device: cpu
fa bert Model Loaded ...


In [98]:
# Number of distinct entries in the vocabulary set built from the tokenizer.
len(vocab)

42000

# Stanza

spaCy's tokenization is non-destructive: it always represents the original input text and never adds or deletes anything. This is a core principle of the `Doc` object — you should always be able to reconstruct the original input text from it.

## Setup

In [99]:
import stanza
import spacy
import spacy_stanza

In [100]:
# Build the `nlp` tokenization pipeline for the configured language.
# Only Persian ('fa', via stanza) and English ('en', via spaCy) are handled.
if language not in ('fa', 'en'):
    raise ValueError(f"{language} not supported.")

if language == 'en':
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")
else:
    # Persian: fetch the stanza resources, then wrap stanza in a spaCy pipeline.
    stanza.install_corenlp()
    stanza.download('fa')
    nlp = spacy_stanza.load_pipeline("fa")




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-06-06 10:31:39 INFO: Downloading default packages for language: fa (Persian)...
2022-06-06 10:31:40 INFO: File exists: /home/ahur4/stanza_resources/fa/default.zip
2022-06-06 10:31:42 INFO: Finished downloading models and saved to /home/ahur4/stanza_resources.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-06-06 10:31:43 INFO: Loading these models for language: fa (Persian):
| Processor | Package |
-----------------------
| tokenize  | perdt   |
| mwt       | perdt   |
| pos       | perdt   |
| lemma     | perdt   |
| depparse  | perdt   |
| ner       | arman   |

2022-06-06 10:31:43 INFO: Use device: cpu
2022-06-06 10:31:43 INFO: Loading: tokenize
2022-06-06 10:31:43 INFO: Loading: mwt
2022-06-06 10:31:43 INFO: Loading: pos
2022-06-06 10:31:43 INFO: Loading: lemma
2022-06-06 10:31:43 INFO: Loading: depparse
2022-06-06 10:31:44 INFO: Loading: ner
2022-06-06 10:31:44 INFO: Done loading processors!


In [101]:
def half_space_case(predicted: str, current: str) -> bool:
    """Return True when `predicted` is just `current` without its half-spaces.

    A "half-space" is the zero-width non-joiner (U+200C). When the only
    difference between the model's prediction and the current token is that
    the prediction lacks these characters, the token is not a typo.

    Args:
        predicted: Token string proposed by the model.
        current: Token string as it appears in the text.

    Returns:
        True if removing every U+200C from `current` yields `predicted`.
    """
    # Written as an escape sequence: the raw character is invisible in editors
    # and easily stripped or altered by copy/paste and formatters.
    return current.replace("\u200c", "") == predicted

## Correct Lexico Typo

In [102]:
def lexico_typo_correction(
        text,
        alpha=10,
        max_edit_distance=2,
        top_k=10,
        verbose=False,
):
    """Replace out-of-vocabulary tokens with the best masked-LM candidate.

    The text is tokenised with `nlp`; the first token whose text is not in
    `vocab` is masked and `unmasker` proposes `top_k` fillers. Candidates
    within `max_edit_distance` of the token are ranked by

        total_score = score / ((edit_distance + 1) / len(token)) ** alpha

    and the best one replaces the token. After each replacement the text is
    re-tokenised and scanned again, until a full pass makes no change.

    Special cases:
      * punctuation tokens take the top prediction as-is;
      * tokens containing a digit are never changed;
      * a prediction that only differs by half-spaces (see `half_space_case`)
        keeps the token and adds it to `vocab`;
      * when no candidate survives (or selection fails) the `spellchecker`
        package is tried, and the token is kept if that yields nothing.

    Args:
        text: Input text to correct.
        alpha: Exponent applied to the edit-distance penalty.
        max_edit_distance: Largest edit distance a candidate may have.
        top_k: Number of predictions requested from the fill-mask pipeline.
        verbose: Print the candidates and the decision for each token.

    Returns:
        The corrected text.
    """
    while True:
        some_token_corrected = False
        doc = nlp(text)

        for current_token in doc:
            token_text = current_token.text
            if token_text in vocab:
                continue

            start_char_index: int = current_token.idx
            end_char_index = start_char_index + len(current_token)
            masked_text = doc.text[:start_char_index] + MASK + doc.text[end_char_index:]

            predicts = pd.DataFrame(unmasker(masked_text, top_k=top_k))

            # Reset per token so neither the error handler nor the verbose
            # report can see an unbound name or a previous token's candidates.
            filtered_predicts = None
            selected_predict = None

            try:
                if token_text in string.punctuation:
                    selected_predict = predicts['token_str'].iloc[0]

                elif any(c.isdigit() for c in token_text):
                    # Tokens containing digits are left untouched.
                    selected_predict = token_text

                else:
                    predicts = predicts.assign(
                        token_str=predicts['token_str'].apply(lambda tk: tk.replace(" ", "")),
                    )
                    predicts = predicts.assign(
                        edit_distance=predicts['token_str'].apply(
                            lambda tk: editdistance.eval(token_text, tk)
                        ),
                    )

                    # Keep candidates within `max_edit_distance` of the token.
                    filtered_predicts = predicts.loc[
                        predicts['edit_distance'] <= max_edit_distance, :
                    ].copy()

                    # e_to_l = (edit distance + 1) / token length
                    filtered_predicts['e_to_l'] = (
                        (filtered_predicts['edit_distance'] + 1) / len(token_text)
                    )
                    filtered_predicts['total_score'] = (
                        filtered_predicts['score'] / (filtered_predicts['e_to_l'] ** alpha)
                    )
                    filtered_predicts = filtered_predicts.sort_values('total_score', ascending=False)

                    if not filtered_predicts.empty:
                        selected_predict = filtered_predicts['token_str'].iloc[0]

            except Exception as e:
                candidate_count = 0 if filtered_predicts is None else len(filtered_predicts)
                print(f"Error: {e} From {token_text} Filtered Predictions Length: {candidate_count}")
                selected_predict = None

            if selected_predict is None:
                # No usable model candidate: try the dictionary spell checker.
                try:
                    from spellchecker import SpellChecker
                    selected_predict = SpellChecker().correction(token_text)
                except ImportError:
                    selected_predict = None

                # The checker may return None when it has no suggestion.
                if not selected_predict:
                    selected_predict = token_text

            if selected_predict != token_text:
                if half_space_case(selected_predict, token_text):
                    # Only the half-spaces differ: accept the token as known.
                    vocab.add(token_text)
                    selected_predict = token_text
                else:
                    some_token_corrected = True
                    # Splice by offsets instead of replacing the MASK string,
                    # which would hit the wrong place if the text itself
                    # already contained the mask literal.
                    text = doc.text[:start_char_index] + selected_predict + doc.text[end_char_index:]

            if verbose:
                print("*" * 50)
                print(f"Token: {token_text}")

                print("Filtered Predicts: \n")
                if filtered_predicts is not None:
                    shown_columns = [
                        column for column in ('token_str', 'score', 'total_score')
                        if column in filtered_predicts.columns
                    ]
                    print(filtered_predicts[shown_columns])

                print(f"{token_text} -> {selected_predict} : lexical")

                if some_token_corrected:
                    typo_correction_details = {
                        "raw": token_text,
                        "corrected": selected_predict,
                        "span": f"[{start_char_index}, {end_char_index}]",
                        "type": "lexical"
                    }

                    print(typo_correction_details)

            if some_token_corrected:
                # Character offsets are stale now; re-tokenise and rescan.
                break

        if not some_token_corrected:
            break

    return text


## Correct Contextual Typo



In [103]:
def contextual_typo_correction(
        text,
        alpha=10,
        max_edit_distance=2,
        top_k=10,
        verbose=False,
):
    """Re-check every token against the masked-LM, including in-vocabulary ones.

    Each token is masked in turn and `unmasker` proposes `top_k` fillers.
    Candidates within `max_edit_distance` of the token are ranked by

        total_score = score / ((edit_distance + 1) / len(token)) ** alpha

    and the best one replaces the token when it differs from it.

    Special cases:
      * punctuation tokens may only be replaced by a punctuation prediction;
      * tokens containing a digit are never changed;
      * a prediction that only differs by half-spaces (see `half_space_case`)
        keeps the token and adds it to `vocab`;
      * when no candidate survives, or selection fails, the token is kept.

    Args:
        text: Input text to correct.
        alpha: Exponent applied to the edit-distance penalty.
        max_edit_distance: Largest edit distance a candidate may have.
        top_k: Number of predictions requested from the fill-mask pipeline.
        verbose: Print per-token progress and details of each replacement.

    Returns:
        The corrected text.
    """
    doc = nlp(text)
    index = 0

    # `doc` is rebuilt after every replacement, so its length is re-read on
    # each iteration; a range fixed up front could run past the end or skip
    # tokens when re-tokenisation changes the token count.
    while index < len(doc):
        current_token = doc[index]
        index += 1
        token_text = current_token.text

        if verbose:
            print("*" * 50)
            print(f"Token: {token_text}")

        start_char_index = current_token.idx
        end_char_index = start_char_index + len(current_token)

        masked_text = doc.text[:start_char_index] + MASK + doc.text[end_char_index:]
        predicts = pd.DataFrame(unmasker(masked_text, top_k=top_k))

        # Reset per token so the error handler and the verbose report never
        # see an unbound name or a previous token's candidates.
        filtered_predicts = None
        selected_predict = token_text  # default: keep the token

        try:
            if token_text in string.punctuation:
                filtered_predicts = predicts.loc[
                    predicts['token_str'].apply(lambda tk: tk in string.punctuation), :
                ].copy()
                if not filtered_predicts.empty:
                    selected_predict = filtered_predicts['token_str'].iloc[0]

            elif not any(c.isdigit() for c in token_text):
                predicts = predicts.assign(
                    token_str=predicts['token_str'].apply(lambda tk: tk.replace(" ", "")),
                )
                predicts = predicts.assign(
                    edit_distance=predicts['token_str'].apply(
                        lambda tk: editdistance.eval(token_text, tk)
                    ),
                )

                # Keep candidates within `max_edit_distance` of the token.
                filtered_predicts = predicts.loc[
                    predicts['edit_distance'] <= max_edit_distance, :
                ].copy()

                # e_to_l = (edit distance + 1) / token length
                filtered_predicts['e_to_l'] = (
                    (filtered_predicts['edit_distance'] + 1) / len(token_text)
                )
                filtered_predicts['total_score'] = (
                    filtered_predicts['score'] / (filtered_predicts['e_to_l'] ** alpha)
                )
                filtered_predicts = filtered_predicts.sort_values('total_score', ascending=False)

                if not filtered_predicts.empty:
                    selected_predict = filtered_predicts['token_str'].iloc[0]

        except Exception as e:
            candidate_count = 0 if filtered_predicts is None else len(filtered_predicts)
            print(f"Error: {e} From {token_text} Filtered Predictions Length: {candidate_count}")
            selected_predict = token_text

        if selected_predict != token_text:
            if half_space_case(selected_predict, token_text):
                # Only the half-spaces differ: accept the token as known.
                vocab.add(token_text)
                selected_predict = token_text
            else:
                # Splice by offsets instead of replacing the MASK string, which
                # would hit the wrong place if the text already contained it.
                text = doc.text[:start_char_index] + selected_predict + doc.text[end_char_index:]
                doc = nlp(text)

        if verbose and token_text != selected_predict:
            print("Filtered Predicts: \n")
            if filtered_predicts is not None:
                # The punctuation branch has no 'total_score' column.
                shown_columns = [
                    column for column in ('token_str', 'score', 'total_score')
                    if column in filtered_predicts.columns
                ]
                print(filtered_predicts[shown_columns])

            print(f"{token_text} -> {selected_predict} : contextual")

            typo_correction_details = {
                "raw": token_text,
                "corrected": selected_predict,
                "span": f"[{start_char_index}, {end_char_index}]",
                # Clamp at 0: a negative start would slice from the end.
                "around": text[max(0, start_char_index - 10): end_char_index + 10],
                "type": "contextual"
            }

            print(typo_correction_details)

    return text

# Correction Pipeline Class

In [104]:
class SpellCorrector:
    """Two-stage spell corrector: lexical pass first, contextual pass second.

    The instance only stores the tuning parameters and forwards them to the
    module-level `lexico_typo_correction` and `contextual_typo_correction`.
    """

    def __init__(
            self,
            alpha=5,
            max_edit_distance=2,
            verbose=False,
            top_k=50
    ):
        """Store the parameters shared by both correction stages."""
        self.alpha, self.max_edit_distance = alpha, max_edit_distance
        self.verbose, self.top_k = verbose, top_k

    def _lexico_typo_correction(self, text):
        """Run the lexical stage on `text` with this instance's settings."""
        return lexico_typo_correction(
            text,
            alpha=self.alpha,
            max_edit_distance=self.max_edit_distance,
            top_k=self.top_k,
            verbose=self.verbose,
        )

    def _contextual_typo_correction(self, text):
        """Run the contextual stage on `text` with this instance's settings."""
        return contextual_typo_correction(
            text,
            alpha=self.alpha,
            max_edit_distance=self.max_edit_distance,
            top_k=self.top_k,
            verbose=self.verbose,
        )

    def correction_pipeline(self, text):
        """Return `text` after the lexical and then the contextual stage."""
        after_lexical = self._lexico_typo_correction(text)
        return self._contextual_typo_correction(after_lexical)

    def __call__(self, text, *args, **kwargs):
        """Alias for `correction_pipeline`; extra arguments are ignored."""
        return self.correction_pipeline(text)


# Sample Texts

In [None]:
# Sample inputs for the configured language. Each case holds the misspelled
# "input_text" and the reference "true_text" (left empty for the Persian cases).
if language == 'en':
    test_cases = [
        {"input_text": """
            The quantity thoery of money also assume that the quantity of money in an economy has a large influense on its level of economic activity. So, a change in the money supply results in either a change in the price levels or a change in the sopply of gods and services, or both. In addition, the theory assumes that changes in the money supply are the primary reason for changes in spending.
        """,
         "true_text": """
            The quantity theory of money also assumes that the quantity of money in an economy has a large influence on its level of economic activity. So, a change in the money supply results in either a change in the price levels or a change in the supply of goods and services, or both. In addition, the theory assumes that changes in the money supply are the primary reason for changes in spending.
        """},

        {"input_text": """
            Does it privent Iran from getting nuclear weapens. Many exports say that if all parties adhered to their pledges, the deal almost certainly could have achieved that goal for longer than a dekade!
        """,
         "true_text": """
            Does it prevent Iran from getting nuclear weapons? Many experts say that if all parties adhere to their pledges, the deal almost certainly could have achieved that goal for longer than a decade.
        """},

        {"input_text": """
            The Federal Reserve monitor risks to the financal system and works to help insure the system supports a haelthy economy for US households, communities, and busineses.
        """,

         "true_text": """
            The Federal Reserve monitors risks to the financial system and works to help ensure the system supports a healthy economy for US households, communities, and businesses.
        """},

        {"input_text": """
            Bitcoin is a decentrallized digital curency that can be transfered on the peer-to-peer bitcoin network. Bitcoin transactions are veryfied by network nodes throgh cryptography and recorded in a public distributed ledger called a blockchain. The criptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The curency began use in 2009 when its implemntation was released as open-source software.
        """,
         "true_text": """
            Bitcoin is a decentralized digital currency that can be transferred on the peer-to-peer bitcoin network. Bitcoin transactions are verified by network nodes through cryptography and recorded in a public distributed ledger called a blockchain. The cryptocurrency was invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The currency began use in 2009 when its implementation was released as open-source software.
        """},

        {"input_text": """
            The 2022 FILA World Cup is scheduled to be the 22nd running of the FILA World Cup competition, the quadrennial international men's football championship contested by the national teams of the member associations of FIFA. It is scheduled to take place in Qatar from 21 Novamber to 18 Decamber 2022.
        """,
         "true_text": """
            The 2022 FIFA World Cup is scheduled to be the 22nd running of the FIFA World Cup competition, the quadrennial international men's football championship contested by the national teams of the member associations of FIFA. It is scheduled to take place in Qatar from 21 November to 18 December 2022.
        """},

        {"input_text": """
            President Daneld Trump annonced on Tuesday he well withdraw the United States from the Iran nuclear deal and restore far-reaching sanktions aimed at withdrawal Iran from the global finansial system.
        """,
         "true_text": """
            President Donald Trump announced on Tuesday he will withdraw the United States from the Iran nuclear deal and restore far-reaching sanctions aimed at withdrawal Iran from the global financial system.
        """},

        {"input_text": """
            Cars has very sweet features. It has two beautifull eye, adorable tiny paws, sharp claws, and two fury ear which are very sensitive to sounds. It has a tiny body covered with sot fur and it has a furry tail as well. Cats have an adorable face with a tiny nose.
        """,
         "true_text": """
            Cat has very sweet features. It has two beautiful eyes, adorable tiny paws with sharp claws, and two furry ears which are very sensitive to sounds. It has a tiny body covered with soft fur and it has a furry tail as well. Cats have an adorable face with a tiny nose.
        """}
    ]

    # Every model except roberta gets lower-cased text.
    if model_type != 'roberta':
        for test_case in test_cases:
            test_case['input_text'] = test_case['input_text'].lower()
            test_case['true_text'] = test_case['true_text'].lower()

elif language == 'fa':
    test_cases = [
        # {
        #     "input_text": """
        #
        # """,
        #     "true_text": """
        #
        #  """
        # },
        {"input_text": "پس از سال‌ها تلاش، رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است.",
         "true_text": """"""
         },
        {"input_text": "وقتی قیمت گوست قرمز یا صفید در کشورهای دیگر بیشتر شده است، ممکن است در جیران هم گرا شود.",
         "true_text": """"""
         },
        {"input_text": "در هفته گذشته قیمت تلا تغییر چندانی نداشت، و در همان محدوده 1850 دلاری کار خود را به پایان رساند. ",
         "true_text": """"""
         },
        {
            "input_text": "بر اساس مسوبه سران قوا، معاملات فردایی طلا همانند معاملات فردایی ارض، ممنوع و غیرقانونی شناخته شد و فعالان این بازار به جرم اخلال اقتصادی، تحت پیگرد قرار خواهند گرفت. در نتیجه تانک مرکزی در بازار فردایی مداخله نخواهد کرد",
            "true_text": """"""
        },

        {"input_text": """
        با نزدیک شدن قیمت دار غیر رسمی به سفف خود در روز قبل، تحلیلگران در بازار برای هفته بعد هشدار میدادند که باید احطیاط کرد و اقدامات امنیتی در بازار افزایش خواهد یافت.
        """,
         "true_text": """"""
         },

        {"input_text": """
        با تولانی شدن جنگ روسیه و اوکراین و سهم قابل توجهی که این دو کشور در تأمین کندم جهان داشتند، بازار کندم با نوسانات زیادی مواجه شد و قیمت محصولاتی که مواد اولیه‌شان کندم بود، در همه جای جهان افزایش یافت.
        """,
         "true_text": """"""
         },
        {"input_text": """
        علت واقعی تعویق در مزاکرات وین چیست.
        """,
         "true_text": """"""
         },
    ]

else:
    # Raising a bare string is a TypeError in Python 3; raise a real exception.
    raise ValueError(f"{language} language not found.")

# Per-language tuning of the corrector.
ALPHA = 8 if language == 'en' else 30
MAX_EDIT_DISTANCE = 2  # same value for every language at the moment
TOP_K = 250 if language == 'en' else 5000
VERBOSE = True

# Drop the leading/trailing whitespace introduced by the triple-quoted literals.
for test_case in test_cases:
    test_case['input_text'] = test_case['input_text'].strip()
    test_case['true_text'] = test_case['true_text'].strip()

spell_corrector = SpellCorrector(ALPHA, MAX_EDIT_DISTANCE, VERBOSE, TOP_K)
from spacy import displacy  # not used in this cell; kept in case later cells rely on it

for idx, test_case in enumerate(test_cases):
    input_text = test_case['input_text']

    output_text = spell_corrector(input_text)

    # Only report an exact-match verdict when a reference exists; an empty
    # reference would print False no matter how good the correction is.
    if test_case['true_text']:
        print(output_text == test_case['true_text'])
    else:
        print("No reference text; exact-match check skipped.")

    print(output_text)

    print("\n")
    print("* " * 50)
    print(" *" * 50)
    print("\n")



**************************************************
Token: کسف
Filtered Predicts: 

     token_str     score   total_score
2          کشف  0.065814  12619.911998
160        کسب  0.000415     79.552601
1898       کسر  0.000008      1.567860
3022       کسی  0.000004      0.753784
3215        کف  0.000004      0.685134
...        ...       ...           ...
4920        دس  0.000002      0.000002
4939        نف  0.000002      0.000002
4940       کلک  0.000002      0.000002
4952        سگ  0.000002      0.000002
4992        جس  0.000002      0.000002

[120 rows x 3 columns]
کسف -> کشف : lexical
{'raw': 'کسف', 'corrected': 'کشف', 'span': '[32, 35]', 'type': 'lexical'}
**************************************************
Token: تیرانی
Filtered Predicts: 

     token_str     score   total_score
24      ایرانی  0.006808  1.401631e+12
1216    تهرانی  0.000036  7.443975e+09
69     تایوانی  0.001803  1.936309e+06
344     نورانی  0.000211  2.263658e+05
406     شیرازی  0.000172  1.843384e+05
633     کی

In [None]:
# Check whether this exact string is an entry of the vocabulary set.
"الکل" in vocab