# Install and import nltk

In [6]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


# Load the Gutenberg corpus as a larger dataset for better frequency counts

In [8]:
# Cell 1: Define the tokenize function
import re
from collections import Counter

def tokenize(text):
    words = re.findall(r'\w+', text)
    return Counter(words)

def load_large_corpus():
    words = nltk.corpus.gutenberg.words()  # This is a large text corpus
    return ' '.join(words).lower()

# Build the word frequency dictionary
corpus = load_large_corpus()
word_freqs = tokenize(corpus)

# Improved correction with stricter conditions for edits

In [11]:
def edits1(word):
    # All edits that are one edit away from 'word'
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def known(words):
    # Return the subset of words that are actually in the dictionary (word_freqs)
    return set(w for w in words if w in word_freqs)

def get_candidates(word):
    # Return a set of known words that are either one or two edits away from the word
    return (known([word]) or
            known(edits1(word)) or
            [word])  # If no candidates found, return the original word

def correct_spelling(word):
    # Get candidates
    candidates = get_candidates(word)
    
    # Return most probable correct word (from corpus or original)
    if candidates:
        return max(candidates, key=lambda w: word_freqs[w])
    else:
        return word  # Return the original word if no candidates found

def correct_sentence(sentence):
    corrected_sentence = []
    for word in sentence.split():
        corrected_sentence.append(correct_spelling(word))
    return ' '.join(corrected_sentence)


In [17]:
test_sentence = "Ths is an exampel of a sentnce with sevral mispelled wrds."
corrected_sentence = correct_sentence(test_sentence)
print(f"Original: {test_sentence}")
print(f"Corrected: {corrected_sentence}")

Original: Ths is an exampel of a sentnce with sevral mispelled wrds.
Corrected: Ths is an example of a sentence with several dispelled wrds.
