In [1]:
import re
from collections import Counter
import locale
locale.setlocale(locale.LC_ALL, 'tr_TR.utf8')
from itertools import chain
import Levenshtein

In [2]:
import pandas as pd
import numpy as np
import random as rand
import math
import string

In [5]:
PATH = r"https://github.com/tdd-ai/spell-checking-and-correction/blob/main/evaluation/data/true_words.txt"

In [6]:
def read(path):
    sentences = []
    with open(path, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            tr_word = line
            sentences.append(tr_word)
    return sentences

In [24]:
text = read(PATH)

In [26]:
text = pd.DataFrame(text, columns=["initial"])

In [27]:
text.head(10)

Unnamed: 0,initial
0,a
1,aa
2,aaa
3,aba
4,abaca
...,...
671882,şırıngaya
671883,şırıngayla
671884,şırıngayı
671885,şıvgın


In [28]:
vocab_to_int = {}
count = 0
for sentence in text.initial:
    for character in sentence:
        if character not in vocab_to_int:
            vocab_to_int[character] = count
            count += 1

# add_tokens = ['<','>',' ','_']
add_tokens = []
for token in add_tokens:
    vocab_to_int[token] = count
    count += 1
    
vocab_size = len(vocab_to_int)
print("The vocabulary contains {} characters.".format(vocab_size))
print(sorted(vocab_to_int))

The vocabulary contains 29 characters.
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'y', 'z', 'ç', 'ö', 'ü', 'ğ', 'ı', 'ş']


In [12]:
keyboard_joints = {
    'q': ["w","a"]
    , 'w': ["q","a","s","e"]
    , 'e': ["w","s","d","r"]
    , 'r': ["e","d","f","t"]
    , 't': ["r","f","g","y"]
    , 'y': ["t","g","h","u"]
    , 'u': ["y","h","j","ı"]
    , 'ı': ["u","j","k","o"]
    , 'o': ["ı","k","l","p"]
    , 'p': ["o","l","ş","ğ"]
    , 'ğ': ["p","ş","i","ü","y"]
    , 'ü': ["ğ","i"]
    , 'a': ["q","w","s","z"]
    , 's': ["w","e","a","d","z","x"]
    , 'd': ["e","r","s","f","x","c"]
    , 'f': ["r","t","d","g","c","v"]
    , 'g': ["t","y","f","h","v","b"]
    , 'h': ["y","u","g","j","b","n"]
    , 'j': ["u","ı","h","k","n","m"]
    , 'k': ["ı","o","j","l","m","ö"]
    , 'l': ["o","p","k","ş","ö","ç"]
    , 'ş': ["p","ğ","l","i","ç"]
    , 'i': ["ğ","ü","ş"]
    , 'z': ["a","s","x"]
    , 'x': ["s","d","z","c"]
    , 'c': ["d","f","x","v"]
    , 'v': ["f","g","c","b"]
    , 'b': ["g","h","v","n"]
    , 'n': ["h","j","b","m"]
    , 'm': ["j","k","n","ö"]
    , 'ö': ["k","l","m","ç"]
    , 'ç': ["l","ş","ö"]
}

In [13]:
def create_typos(sentence, typo_prob=0.3, i=0):
    if i == 2:
        return sentence
    words = sentence.split()
    typo_words = []
    # if len(sentence.split()) == 1:
    #     return sentence
    for i, word in enumerate(words):
        """
        if i == 0:
            typo_words.append(word)
            continue
        if len(word) < 2:
            typo_words.append(word)
            continue
        if i == 2:
            typo_words.append(word)
            continue
        """
        if len(word) < 4:
            typo_words.append(generate_typo(word))
            
        elif np.random.uniform(0, 1) < typo_prob:
            typo_words.append(generate_typo(word))
            
        # two edit distance typo
        elif typo_prob < np.random.uniform(0, 1) < typo_prob+0.2:
            idx = int(np.ceil(len(word)/2))
            word_1 = word[:idx]
            word_1 = generate_typo(word_1)
            word_2 = word[idx:]
            word_2 = generate_typo(word_2)
            if word == word_1+word_2:
                typo_words.append(generate_typo(word))
            else:
                word = word_1+word_2
                typo_words.append(word)
        else:
            typo_words.append(word)

    return " ".join(typo_words).strip()

In [14]:
def generate_typo(word):
    store_begin_char = ""
    store_end_char = ""
    _flag = False
    if any(char.isdigit() for char in word):
        return word
    for char in word:
        if char in string.punctuation:
            return word
    if len(word) < 3:
        return word
    
    if np.random.uniform(0, 1) < 0.8:
        _flag = True
        
    if _flag == True:
        store_begin_char = word[0]
        store_end_char = word[-1]
        word = word[1:-1]
    
    
    if len(word) < 2:
        return store_end_char + word + store_begin_char
    elif np.random.uniform(0, 1) < 0.4:
        typo_type = np.random.choice(["insertion", "deletion", "replace"])
    else:
        typo_type = "transpose"
    
    
    if typo_type == "insertion":
        idx = np.random.choice(len(word) + 1)
        if 1 < idx < len(word):
            word = word[:idx] + np.random.choice(list(keyboard_joints[word[idx-1]])) + word[idx:]
        else:
            typo_type = "transpose"
    if typo_type == "deletion":
        idx = np.random.choice(len(word))
        word = word[:idx] + word[idx + 1:]
    if typo_type == "replace":
        idx = np.random.choice(len(word))
        if word[idx] in keyboard_joints:
            close_keys = list(keyboard_joints[word[idx]])
            word = word[:idx] + np.random.choice(list(close_keys)) + word[idx + 1:]
        else:
            typo_type == "transpose"
    if typo_type == "transpose":
        idx = np.random.choice(len(word))
        if idx == 0:
            word = word[1] + word[idx] + word[idx + 2:]
        else:
            word = word[:idx - 1] + word[idx] + word[idx - 1] + word[idx + 1:]
    
    if _flag == True:
        word = store_begin_char + word + store_end_char
    
    return word

In [29]:
text["len"] = text["initial"].apply(len)
text = text[text["len"] > 3]

In [30]:
text

Unnamed: 0,initial,len
4,abaca,5
5,abacı,5
6,abacıdan,8
7,abacılar,8
8,abacının,8
...,...,...
671882,şırıngaya,9
671883,şırıngayla,10
671884,şırıngayı,9
671885,şıvgın,6


In [31]:
df = text.copy()

In [32]:
text["replaced"] = text["initial"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text["replaced"] = text["initial"]


In [34]:
text = text.sample(100000)

In [35]:
text

Unnamed: 0,initial,len,replaced
233005,hamileliklerinde,16,hamileliklerinde
544233,umulmadık,9,umulmadık
637400,çıbandır,8,çıbandır
439572,reprodüksiyon,13,reprodüksiyon
344388,kurduracak,10,kurduracak
...,...,...,...
76056,bildiğim,8,bildiğim
46858,ağırlaşabilir,13,ağırlaşabilir
277509,işitebileceği,13,işitebileceği
286584,kalkmamalıdır,13,kalkmamalıdır


In [36]:
df["replaced"] = df["initial"].apply(create_typos, typo_prob=0.8)

In [37]:
df2 = df.copy()

In [38]:
df = df[~df.duplicated("replaced")]
df

Unnamed: 0,initial,len,replaced
4,abaca,5,abcaa
5,abacı,5,abcaı
6,abacıdan,8,abacıdan
7,abacılar,8,abaclıar
8,abacının,8,aabcıınn
...,...,...,...
671882,şırıngaya,9,şrııngaya
671883,şırıngayla,10,ışrıngayla
671884,şırıngayı,9,şurıngyaı
671885,şıvgın,6,şıvıgn


In [39]:
df2

Unnamed: 0,initial,len,replaced
4,abaca,5,abcaa
5,abacı,5,abcaı
6,abacıdan,8,abacıdan
7,abacılar,8,abaclıar
8,abacının,8,aabcıınn
...,...,...,...
671882,şırıngaya,9,şrııngaya
671883,şırıngayla,10,ışrıngayla
671884,şırıngayı,9,şurıngyaı
671885,şıvgın,6,şıvıgn


In [41]:
df2["replaced"] = df2["initial"].apply(create_typos, typo_prob=0.8)

In [43]:
df2 = df2[~df2.duplicated("replaced")]
df2

Unnamed: 0,initial,len,replaced
4,abaca,5,abcaa
5,abacı,5,abac
6,abacıdan,8,abcıdan
7,abacılar,8,abacıloar
8,abacının,8,abacıbın
...,...,...,...
671882,şırıngaya,9,şıırngya
671883,şırıngayla,10,ışrıngayla
671884,şırıngayı,9,şrııngayı
671885,şıvgın,6,şuvgın


In [46]:
df_final = pd.concat([df, df2, text])

In [47]:
df_final

Unnamed: 0,initial,len,replaced
4,abaca,5,abcaa
5,abacı,5,abcaı
6,abacıdan,8,abacıdan
7,abacılar,8,abaclıar
8,abacının,8,aabcıınn
...,...,...,...
76056,bildiğim,8,bildiğim
46858,ağırlaşabilir,13,ağırlaşabilir
277509,işitebileceği,13,işitebileceği
286584,kalkmamalıdır,13,kalkmamalıdır


In [48]:
df_final = df_final[~df_final.duplicated("replaced")]

In [49]:
df_final

Unnamed: 0,initial,len,replaced
4,abaca,5,abcaa
5,abacı,5,abcaı
6,abacıdan,8,abacıdan
7,abacılar,8,abaclıar
8,abacının,8,aabcıınn
...,...,...,...
47567,aşağılamalardan,15,aşağılamalardan
22083,ansal,5,ansal
1844,adacıkların,11,adacıkların
515949,taşlasın,8,taşlasın


In [50]:
df_final[["replaced","initial"]].to_csv(r"\data_v47_true-words.csv", index=False, sep=";")