In [11]:
import re
from collections import Counter
import nltk
# Fetch the NLTK 'punkt' tokenizer data (the recorded output shows it was
# already present in this environment).
# NOTE(review): no cell visible here calls into nltk -- tokenisation is done
# with a plain regex in process_data. Confirm whether this download is still
# needed before keeping it.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
def process_data(file_name):
    """Read a UTF-8 text file and return its lowercased word tokens.

    Parameters
    ----------
    file_name : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        Every maximal run of regex word characters found in the lowercased
        file contents, in order of appearance (duplicates are kept).
    """
    token_pattern = re.compile(r'\b\w+\b')
    with open(file_name, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()
    return token_pattern.findall(lowered)

In [13]:
def get_vocabulary(corpus_file):
    """Return the set of distinct tokens found in a corpus file.

    Parameters
    ----------
    corpus_file : str or path-like
        Path handed straight to ``process_data``.

    Returns
    -------
    set of str
        The unique tokens produced by ``process_data`` for that file.
    """
    return set(process_data(corpus_file))

def get_count(word_list):
    """Tally how many times each item occurs in ``word_list``.

    Parameters
    ----------
    word_list : iterable
        The tokens to count.

    Returns
    -------
    collections.Counter
        Mapping from each distinct item to its number of occurrences.
    """
    tally = Counter()
    tally.update(word_list)
    return tally

In [14]:
def get_probs(word_count_dict):
    """Convert raw word counts into relative frequencies.

    Parameters
    ----------
    word_count_dict : mapping
        Maps each word to its occurrence count.

    Returns
    -------
    dict
        Maps each word to ``count / sum_of_all_counts``. An empty input
        yields an empty dict (no division is performed).
    """
    grand_total = sum(word_count_dict.values())
    return {
        token: occurrences / grand_total
        for token, occurrences in word_count_dict.items()
    }

In [15]:
def build_language_model(corpus_file):
    """Build a unigram language model from a corpus file.

    The file is tokenised with ``process_data``, the tokens are counted with
    ``get_count`` and the counts are normalised with ``get_probs``.

    Parameters
    ----------
    corpus_file : str or path-like
        Path of the corpus to read.

    Returns
    -------
    dict
        Maps each distinct corpus token to its relative frequency.
    """
    return get_probs(get_count(process_data(corpus_file)))

# Location of the training corpus on the Kaggle dataset mount.
corpus_file = "/kaggle/input/dataset-test/big.txt"

# Unigram model: maps every distinct corpus token to its relative frequency.
language_model = build_language_model(corpus_file)

# The model holds exactly one key per distinct corpus token, so its key set
# *is* the vocabulary. Deriving it here avoids reading and tokenising the
# whole corpus file a second time.
vocabulary = set(language_model)

In [16]:
def delete_letter(word):
    """Return every string obtained by removing one character from ``word``.

    Parameters
    ----------
    word : str
        The source string.

    Returns
    -------
    list of str
        One entry per character position, in position order; duplicates are
        kept. Empty when ``word`` is empty.
    """
    shortened = []
    for pos in range(len(word)):
        shortened.append(word[:pos] + word[pos + 1:])
    return shortened

def switch_letter(word):
    """Return every string obtained by swapping two adjacent characters.

    Parameters
    ----------
    word : str
        The source string.

    Returns
    -------
    list of str
        One entry per adjacent pair, ordered by the position of the pair's
        first character. Empty when ``word`` has fewer than two characters.
    """
    swapped = []
    for pos in range(len(word) - 1):
        first, second = word[pos], word[pos + 1]
        swapped.append(word[:pos] + second + first + word[pos + 2:])
    return swapped

def replace_letter(word):
    """Return every string obtained by substituting one character of ``word``.

    Each position is replaced in turn by each lowercase ASCII letter other
    than the character already there.

    Parameters
    ----------
    word : str
        The source string.

    Returns
    -------
    list of str
        Variants grouped by position, and alphabetically by the substituted
        letter within each position. Empty when ``word`` is empty.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = []
    for pos, current in enumerate(word):
        head, tail = word[:pos], word[pos + 1:]
        for letter in alphabet:
            if letter == current:
                # Substituting a letter by itself is not an edit.
                continue
            variants.append(head + letter + tail)
    return variants

def insert_letter(word):
    """Return every string obtained by inserting one lowercase letter.

    A letter may be inserted before any character of ``word`` or after the
    last one.

    Parameters
    ----------
    word : str
        The source string.

    Returns
    -------
    list of str
        ``26 * (len(word) + 1)`` strings, grouped by insertion position and
        alphabetically by inserted letter within each position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    grown = []
    for pos in range(len(word) + 1):
        head, tail = word[:pos], word[pos:]
        grown.extend(head + letter + tail for letter in alphabet)
    return grown

def edit_one_letter(word, allow_switches=True):
    """Return all distinct strings one edit away from ``word``.

    Parameters
    ----------
    word : str
        The source string.
    allow_switches : bool, optional
        When true (the default), adjacent-character swaps are included in
        addition to deletions, replacements and insertions.

    Returns
    -------
    set of str
        The union of the results of the enabled edit operations.
    """
    operations = [delete_letter, replace_letter, insert_letter]
    if allow_switches:
        operations.append(switch_letter)

    edits = set()
    for make_edits in operations:
        edits.update(make_edits(word))
    return edits

def edit_two_letters(word, allow_switches=True):
    """Return all distinct strings reachable by applying two edits to ``word``.

    Parameters
    ----------
    word : str
        The source string.
    allow_switches : bool, optional
        Forwarded to ``edit_one_letter`` for both rounds of editing.

    Returns
    -------
    set of str
        Every result of applying ``edit_one_letter`` to each result of
        ``edit_one_letter(word)``.
    """
    return {
        second_edit
        for first_edit in edit_one_letter(word, allow_switches)
        for second_edit in edit_one_letter(first_edit, allow_switches)
    }

In [17]:
def known_word(words, vocabulary):
    """Keep only the entries of ``words`` that appear in ``vocabulary``.

    Parameters
    ----------
    words : iterable of str
        Candidate strings to filter.
    vocabulary : container of str
        Collection supporting the ``in`` operator.

    Returns
    -------
    set of str
        The distinct members of ``words`` found in ``vocabulary``.
    """
    return {entry for entry in words if entry in vocabulary}

def candidates(word, language_model, vocabulary):
    """Return spelling candidates for ``word``, most probable first.

    The search stops at the first stage that yields something:

    1. ``word`` itself, if it is in ``vocabulary``;
    2. vocabulary words one edit away;
    3. vocabulary words two edits away (only generated when stage 2 is empty);
    4. ``word`` unchanged, as a last resort.

    Parameters
    ----------
    word : str
        The (possibly misspelled) input word.
    language_model : mapping
        Maps words to probabilities; missing words count as probability 0.
    vocabulary : container of str
        Known words.

    Returns
    -------
    list of str
        Candidates sorted by descending probability. Candidates with equal
        probability are ordered alphabetically so that the result does not
        depend on set iteration order, which varies between interpreter
        sessions because string hashes are randomised.
    """
    if word in vocabulary:
        return [word]

    def rank(found):
        # Negated probability gives descending order; the word itself is the
        # deterministic tie-breaker.
        return sorted(found, key=lambda w: (-language_model.get(w, 0), w))

    # Try the cheap one-edit neighbourhood first; the much larger two-edit
    # neighbourhood is only generated when that finds nothing.
    for generate_edits in (edit_one_letter, edit_two_letters):
        found = known_word(generate_edits(word), vocabulary)
        if found:
            return rank(found)

    return [word]

In [18]:
def correction(word, k, language_model, vocabulary):
    """Return the ``k`` most probable spelling candidates for ``word``.

    Parameters
    ----------
    word : str
        The (possibly misspelled) input word.
    k : int
        Upper bound used to slice the ranked candidate list.
    language_model : mapping
        Maps words to probabilities; missing words count as probability 0.
    vocabulary : container of str
        Known words.

    Returns
    -------
    list of str
        The first ``k`` entries of the candidates from ``candidates``,
        ordered by descending probability.
    """
    def probability(candidate):
        return language_model.get(candidate, 0)

    ranked = list(candidates(word, language_model, vocabulary))
    # list.sort is stable, so equally probable candidates keep the relative
    # order in which candidates() returned them.
    ranked.sort(key=probability, reverse=True)
    return ranked[:k]

In [22]:
# --- Demo 1: every candidate for a single misspelled word -----------------
test_word = "hetre"
suggested_candidates = candidates(test_word, language_model, vocabulary)
print(f"Suggested candidates for '{test_word}': {suggested_candidates}")

# --- Demo 2: the top-k corrections for a single word ----------------------
word = "somer"
k = 3
suggested_corrections = correction(word, k, language_model, vocabulary)
print(f"Corrections for '{word}': {suggested_corrections}")

# --- Demo 3: correct a whole sentence, one whitespace-separated word at a time
string = "sdhe askk mee abot my age adn i told heer that i amm twenty four"
correct_string = []

for word in string.split():
    if word not in vocabulary:
        # Unknown word: take the highest-ranked suggestion, falling back to
        # the word itself if no suggestion comes back.
        suggestions = correction(word, 2, language_model, vocabulary)
        best_correction = suggestions[0] if suggestions else word
        correct_string.append(best_correction)
        continue
    # Known words pass through untouched.
    correct_string.append(word)

corrected_text = " ".join(correct_string)
print(corrected_text)


Suggested candidates for 'hetre': ['here', 'etre', 'metre']
Corrections for 'somer': ['some', 'sober', 'somber']
she ask me about my age and i told her that i am twenty four
