In [1]:
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


def swap_letters(word):
    """Simulate a transposition typo by exchanging two characters of ``word``.

    Two distinct positions are drawn at random and the characters found there
    trade places. When both positions happen to hold the same character the
    result is equal to the input.

    Args:
        word: The string to perturb.

    Returns:
        A new string with two positions swapped, or ``word`` itself when it
        has fewer than two characters (there is nothing to swap).
    """
    if len(word) >= 2:
        letters = list(word)
        # random.sample draws without replacement, so the two indices differ.
        first, second = random.sample(range(len(letters)), 2)
        letters[first], letters[second] = letters[second], letters[first]
        return "".join(letters)

    return word


def add_letter(word):
    """Simulate an insertion typo by adding one random letter to ``word``.

    Args:
        word: The string to perturb.

    Returns:
        ``word`` with a single lowercase ASCII letter inserted at a random
        position. The position may be before the first character or after
        the last one, so an empty ``word`` yields a one-letter string.
    """
    # randint is inclusive on both ends, hence len(word) is a valid slot.
    position = random.randint(0, len(word))
    extra = random.choice("abcdefghijklmnopqrstuvwxyz")
    head, tail = word[:position], word[position:]
    return head + extra + tail


def keyboard_mapping(word):
    """Replace every mapped character of ``word`` with a random substitute.

    Each character that appears as a key in the substitution table is
    replaced by one character picked at random from that key's candidate
    string. Characters that are not keys (digits, uppercase letters,
    accented vowels, punctuation, ...) are copied through unchanged, so the
    result always has the same length as the input.

    Args:
        word: The string to perturb.

    Returns:
        A new string of the same length as ``word``.
    """
    # Candidate substitutes for each supported character.
    mappings = {
        'a': 'qwsz', 'b': 'vghn', 'c': 'xdfv', 'd': 'ersxc',
        'e': 'rdsw', 'f': 'rtgv', 'g': 'tyhb', 'h': 'yujnb',
        'i': 'uojkn', 'j': 'ikmn', 'k': 'iolm', 'l': 'opk',
        'm': 'njk', 'n': 'bhjm', 'o': 'iklp', 'p': 'ol',
        'q': 'wa', 'r': 'edft', 's': 'wqaxz', 't': 'rfyg',
        'u': 'yhji', 'v': 'cfgb', 'w': 'qase', 'x': 'zsdc',
        'y': 'tghu', 'z': 'asx', 'đ': 'le', 'ư': 'ws', 'ơ': 'ow'
    }

    # The generator is consumed left to right, so random.choice is invoked
    # once per mapped character in reading order.
    return "".join(
        random.choice(mappings[char]) if char in mappings else char
        for char in word
    )


def get_ACBT_features(word):
    """Turn ``word`` into a list of position-tagged character tokens.

    The word is lowercased, every character outside the whitelist of ASCII
    letters and Vietnamese letters is dropped, and each remaining character
    is suffixed with its zero-based position in the filtered string.

    Args:
        word: The string to encode.

    Returns:
        A list of tokens such as ``['k0', 'h1', 'a2']``; an empty list when
        no character survives the filter.
    """
    letters_only = re.sub(r'[^a-zđơưáàảãạâấầẩẫậăắằẳẵặéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵ]', '', word.lower())
    return [letter + str(position) for position, letter in enumerate(letters_only)]

In [2]:
import numpy as np
import re


class Vocab:
    """Insertion-ordered collection of unique, non-empty words.

    Attributes:
        words: List of the stored words, in the order they were first added.
            A word's position in this list is its index.
    """

    def __init__(self):
        self.words = []
        # Shadow of `self.words` used only for constant-time duplicate checks;
        # scanning the list on every insert would make building the
        # vocabulary quadratic in its size.
        self._seen = set()

    def addWord(self, word):
        """Store ``word`` unless it is empty or already present."""
        if len(word) == 0 or word in self._seen:
            return
        self._seen.add(word)
        self.words.append(word)

    def getWords(self, limit=10):
        """Return the first ``limit`` stored words as a NumPy array.

        Args:
            limit: Maximum number of words to return, counted from the
                start of the vocabulary. Pass ``None`` to get every word.

        Returns:
            ``numpy.ndarray`` holding the selected words in insertion order.
        """
        return np.array(self.words[:limit])

    def getWordFromIndex(self, index):
        """Return the word stored at position ``index``.

        Args:
            index: Integer position in insertion order (NumPy integers are
                accepted). Negative values count from the end, as with any
                Python list.

        Raises:
            IndexError: If ``index`` is outside the vocabulary.
        """
        # `self.words` is a list, so a word's index is simply its position.
        return self.words[index]


def cleanSentence(sentence):
    """Normalise a raw sentence into lowercase, space-separated tokens.

    The sentence is lowercased, every character that is not an ASCII letter,
    a digit, whitespace or a Vietnamese letter is replaced by a space, and
    runs of whitespace are then collapsed to a single space with leading and
    trailing whitespace removed.

    Args:
        sentence: Raw text, e.g. one line of the corpus.

    Returns:
        The cleaned sentence; an empty string when nothing survives.
    """
    sentence = sentence.lower()
    # The whitelist must include 'ý'; without it words such as "quý" are
    # broken apart ("qu") because the accented letter is turned into a space.
    sentence = re.sub(r'[^a-z0-9\sàáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỹỷỵ]', ' ', sentence)
    # split() with no argument drops empty pieces, which collapses the
    # spaces introduced by the substitution above.
    sentence = ' '.join(sentence.split())
    return sentence

In [3]:
# Read the corpus inside a context manager so the file handle is released
# as soon as the lines have been loaded, even if reading fails.
with open('./data.txt', 'r', encoding='utf-8') as data_file:
    lines = data_file.readlines()

# Build the vocabulary from the cleaned corpus. A line that cleans down to
# nothing splits into [''], which Vocab.addWord ignores.
vocab = Vocab()
for line in lines:
    for word in cleanSentence(line).split(' '):
        vocab.addWord(word)

vocab.getWords()

array(['phát', 'hiện', 'xe', 'đò', 'buộc', 'hành', 'khách', 'trên', 'mui',
       'phủ'], dtype='<U5')

In [4]:
# Seed the RNG so the synthetic misspellings, and therefore the fitted model,
# are the same on every Restart & Run All. The value itself is arbitrary.
random.seed(42)

# Fetch the target words once; each call to getWords() builds a new array.
known_words = vocab.getWords()

# Every noise function contributes one synthetic misspelling per word.
noise_functions = (swap_letters, add_letter, keyboard_mapping)

# Record each misspelling together with the index of the word it came from,
# so the labels stay correct if noise functions are added or removed.
misspelled_words = []
misspelled_labels = []
for label, word in enumerate(known_words):
    for make_typo in noise_functions:
        misspelled_words.append(make_typo(word))
        misspelled_labels.append(label)

# Training set: the correctly spelled words first (label = own index),
# followed by their misspellings (label = index of the source word).
# Each sample is the word's position-tagged character tokens joined by spaces.
corpus = [" ".join(get_ACBT_features(word)) for word in known_words]
y = list(range(len(known_words)))

corpus += [" ".join(get_ACBT_features(word)) for word in misspelled_words]
y += misspelled_labels

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

model = LogisticRegression()
model.fit(X, y)

In [7]:

# Encode the query the same way as the training samples: position-tagged
# character tokens joined by spaces, wrapped in a one-element list because
# the vectorizer expects an iterable of documents.
test_word = [" ".join(get_ACBT_features("khach"))]
X_test = vectorizer.transform(test_word)
y_pred = model.predict(X_test)

# The predicted label is an index into the words the model was trained on.
print(vocab.getWords()[y_pred[0]])

khách
