In [None]:
import nltk
# Requests the 'all' NLTK collection. The following cell downloads only
# punkt, wordnet and omw-1.4 individually, so this cell is presumably an
# optional catch-all -- NOTE(review): confirm it is still wanted, since
# fetching everything is likely far larger than the three packages used.
nltk.download('all')

In [1]:
import nltk
# Fetch the individual NLTK resources by name.
# NOTE(review): no NLTK tokenizer call is visible in this notebook (words are
# split with a regex), so 'punkt' may be unnecessary -- confirm before removing.
nltk.download('punkt')       # Tokenizer
nltk.download('wordnet')     # Lemmatizer (WordNetLemmatizer is imported further down)
nltk.download('omw-1.4')     # WordNet Data (presumably the Open Multilingual Wordnet package)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tyesw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tyesw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tyesw\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
import re

# Read the whole corpus file and lowercase it, so every token is stored in lowercase.
with open("final.txt", mode="r", encoding="utf8") as f:
    text = f.read().lower()

# Tokenize: each maximal run of \w characters becomes one word (order and
# duplicates are kept, because the frequencies are computed from this list).
words = re.compile(r'\w+').findall(text)

# Distinct tokens seen in the corpus; this is the dictionary the
# spell-checker looks candidates up in.
main_vocabulary = {*words}


In [3]:
from collections import Counter

# Tally how many times each word occurs
def counting_words(words):
    """Count occurrences of every item in ``words``.

    Args:
        words: The tokens to count (in this notebook, the list of corpus words).

    Returns:
        dict: A plain dictionary mapping each distinct token to the number of
        times it appears.
    """
    tally = Counter()
    tally.update(words)
    # Hand back a plain dict rather than the Counter subclass.
    return dict(tally)


In [5]:
# Turn raw word counts into relative frequencies
def calculate_probabilities(word_counts):
    """Convert a count table into a probability table.

    Args:
        word_counts: Mapping of word -> occurrence count.

    Returns:
        dict: Mapping of word -> count divided by the sum of all counts.
        An empty mapping yields an empty dict (no division is performed).
    """
    grand_total = sum(word_counts.values())
    probabilities = {}
    for token, freq in word_counts.items():
        probabilities[token] = freq / grand_total
    return probabilities


In [4]:
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance created once at module level; lemmatize_word
# below reads this global instead of constructing a new one on every call.
lemmatizer = WordNetLemmatizer()

# Lemmatize every whitespace-separated token of a string
def lemmatize_word(word):
    """Lemmatize ``word`` token by token.

    The input is split on whitespace, each piece is passed through the
    module-level ``lemmatizer``, and the results are re-joined with single
    spaces (so runs of whitespace collapse and leading/trailing whitespace
    is dropped).

    Args:
        word: A string containing one or more whitespace-separated tokens.

    Returns:
        str: The lemmatized tokens joined by a single space.
    """
    lemmas = []
    for token in word.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


In [6]:
# All strings obtained by dropping exactly one character
def delete_letter(word):
    """List every result of removing a single character from ``word``.

    Args:
        word: The string to shorten.

    Returns:
        list: ``len(word)`` strings, ordered by the position of the removed
        character. Duplicates are kept (e.g. repeated letters), and an empty
        input gives an empty list.
    """
    shortened = []
    for idx, _ in enumerate(word):
        shortened.append(word[:idx] + word[idx + 1:])
    return shortened


In [7]:
# All strings obtained by swapping one pair of neighbouring characters
def switch_letters(word):
    """List every result of transposing two adjacent characters of ``word``.

    Args:
        word: The string to permute.

    Returns:
        list: ``len(word) - 1`` strings, ordered by the position of the left
        character of the swapped pair. Inputs shorter than two characters
        give an empty list.
    """
    swapped = []
    for left in range(len(word) - 1):
        right = left + 1
        head, tail = word[:left], word[right + 1:]
        swapped.append(head + word[right] + word[left] + tail)
    return swapped


In [8]:
# Substitute each position with each of the 26 lowercase letters
def replace_letters(word):
    """List every result of overwriting one character of ``word`` with a-z.

    Args:
        word: The string to modify.

    Returns:
        list: ``26 * len(word)`` strings, grouped by position and, within a
        position, in alphabetical order of the substituted letter. Because a
        letter may be "replaced" by itself, the unchanged ``word`` appears in
        the output whenever it contains a lowercase a-z character.
    """
    alphs = 'abcdefghijklmnopqrstuvwxyz'
    variants = []
    for pos in range(len(word)):
        prefix, suffix = word[:pos], word[pos + 1:]
        variants.extend(prefix + letter + suffix for letter in alphs)
    return variants


In [9]:
# Add one lowercase letter at each possible gap, including both ends
def insert_letters(word):
    """List every result of inserting a single a-z letter into ``word``.

    Args:
        word: The string to lengthen.

    Returns:
        list: ``26 * (len(word) + 1)`` strings, grouped by insertion point
        (from before the first character to after the last) and, within a
        point, in alphabetical order of the inserted letter.
    """
    alphs = 'abcdefghijklmnopqrstuvwxyz'
    grown = []
    for cut in range(len(word) + 1):
        left, right = word[:cut], word[cut:]
        for letter in alphs:
            grown.append(left + letter + right)
    return grown


In [10]:
# Everything reachable from `word` with a single edit operation
def edit_one_letter(word: str, allow_switches: bool = True) -> set:
    """Collect the distinct strings produced by one edit of ``word``.

    The edits are the outputs of ``delete_letter``, ``replace_letters`` and
    ``insert_letters``, plus ``switch_letters`` when ``allow_switches`` is true.

    Args:
        word: The string to edit.
        allow_switches: Whether adjacent-character transpositions count as
            an edit.

    Returns:
        set: The union of all generated strings, with duplicates removed.
    """
    # Same call order as before: delete, (switch), replace, insert.
    generators = [delete_letter, replace_letters, insert_letters]
    if allow_switches:
        generators.insert(1, switch_letters)
    return {candidate for make in generators for candidate in make(word)}

# Everything reachable from `word` by applying a single edit twice
def edit_two_letters(word: str, allow_switches: bool = True) -> set:
    """Collect the distinct strings produced by two successive edits of ``word``.

    ``edit_one_letter`` is applied to ``word`` and then again to each of its
    results; ``allow_switches`` is forwarded to both rounds.

    Args:
        word: The string to edit.
        allow_switches: Whether adjacent-character transpositions count as
            an edit.

    Returns:
        set: The union of the second-round results.
    """
    first_round = edit_one_letter(word, allow_switches=allow_switches)
    second_round = (
        edit_one_letter(intermediate, allow_switches=allow_switches)
        for intermediate in first_round
    )
    return set().union(*second_round)


In [11]:
from typing import Dict, Set, List, Tuple

# Get top-N correction suggestions
def get_corrections(
    word: str,
    probs: Dict[str, float],
    vocab: Set[str],
    n: int = 2
) -> List[Tuple[str, float]]:
    """Return up to ``n`` spelling suggestions for ``word``, most probable first.

    Candidate selection, in order of preference:
      1. ``word`` itself, if it is already in ``vocab``;
      2. otherwise the vocabulary words one edit away (``edit_one_letter``);
      3. otherwise, only if step 2 found nothing, the vocabulary words two
         edits away (``edit_two_letters``).

    Args:
        word: The (possibly misspelled) word to correct.
        probs: Mapping of vocabulary word -> probability. Candidates missing
            from this mapping are scored ``0.0``.
        vocab: The set of known words.
        n: Maximum number of suggestions to return. Values below zero are
            treated as zero.

    Returns:
        A list of ``(suggestion, probability)`` tuples sorted by descending
        probability, with ties broken alphabetically. The list may be shorter
        than ``n``, or empty when no candidate is found.
    """
    if word in vocab:
        candidates = {word}
    else:
        candidates = edit_one_letter(word).intersection(vocab)
        if not candidates:
            candidates = edit_two_letters(word).intersection(vocab)

    # `candidates` is a set, and set iteration order for strings can change
    # from one interpreter run to the next. Sorting on probability alone would
    # leave equal-probability words in that arbitrary order, so the top-n cut
    # could differ between runs. The word is a secondary key to make the
    # ranking reproducible.
    best_suggestions = sorted(
        ((w, probs.get(w, 0.0)) for w in candidates),
        key=lambda scored: (-scored[1], scored[0]),
    )

    # Clamp so a negative n yields no suggestions instead of slicing from the end.
    return best_suggestions[:max(n, 0)]


In [20]:
# Input from user. Surrounding whitespace is stripped before lowercasing so
# that stray spaces typed around the word are not treated as part of it.
my_word = input("Enter any word: ").strip().lower()

# Count words and calculate probabilities
word_count = counting_words(words)
probs = calculate_probabilities(word_count)

# Get suggestions. Blank input is skipped: an empty string is one insertion
# away from every single-letter vocabulary entry, so looking it up would
# "suggest" unrelated one-letter words instead of reporting nothing.
if my_word:
    suggestions = get_corrections(my_word, probs, main_vocabulary, n=2)
else:
    suggestions = []


# Output
if suggestions:
    print("\nDid you mean:")
    for suggestion, probability in suggestions:
        print(f"{suggestion} (prob: {probability:.6f})")
else:
    print("No suggestions found.")


Enter any word:  serach



Did you mean:
search (prob: 0.000053)
serch (prob: 0.000003)
