In [2]:
import nltk

#### Levenshtein Distance

It measures the difference between two sequences by calculating the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one word into the other. It is a measure of string similarity, with a lower distance indicating greater similarity.

The Levenshtein distance is the minimum number of edits needed to transform one string into another.
The allowed edits are:
Insertion: Adding a character.
Deletion: Removing a character.
Substitution: Replacing a character with another. 
Example
The Levenshtein distance between "kitten" and "sitting" is 3, as shown by these edits: 
Substitution: kitten → sitten (replace 'k' with 's')
Substitution: sitten → sittin (replace 'e' with 'i')
Insertion: sittin → sitting (insert 'g') 

Very useful for Fuzzy matching; we use backtracking to find the nearest correct replacement

Works in a Matrix system - input word, output word on two axes

Replace|Insert

Remove|min(replace,remove,insert)+1

In [3]:
input_str = "Intention"
output_str = "Execution"

rows = len(input_str) + 1
cols = len(output_str) + 1
matrix = [[0 for _ in range(cols)] for _ in range(rows)]

for i in range(rows):
    matrix[i][0] = i  
for j in range(cols):
    matrix[0][j] = j  

# Fill in the matrix
for i in range(1, rows):
    for j in range(1, cols):
        if input_str[i - 1] == output_str[j - 1]:
            cost = 0
        else:
            cost = 1

        matrix[i][j] = min(
            matrix[i - 1][j] + 1,      # deletion
            matrix[i][j - 1] + 1,      # insertion
            matrix[i - 1][j - 1] + cost  # substitution
        )

for row in matrix:
    print(row)

print("\nLevenshtein Distance:", matrix[-1][-1])


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[1, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[2, 2, 2, 3, 4, 5, 6, 7, 8, 8]
[3, 3, 3, 3, 4, 5, 5, 6, 7, 8]
[4, 4, 4, 3, 4, 5, 6, 6, 7, 8]
[5, 5, 5, 4, 4, 5, 6, 7, 7, 7]
[6, 6, 6, 5, 5, 5, 5, 6, 7, 8]
[7, 7, 7, 6, 6, 6, 6, 5, 6, 7]
[8, 8, 8, 7, 7, 7, 7, 6, 5, 6]
[9, 9, 9, 8, 8, 8, 8, 7, 6, 5]

Levenshtein Distance: 5


In [4]:
#!pip install autocorrect
from autocorrect import Speller
from nltk import word_tokenize

sentence = word_tokenize("Nturael Languge processin is funn")

spell = Speller()

In [5]:
def correct_spelling(tokens):
    correct = spell.autocorrect(tokens)
    return correct

In [6]:
correct_spelling(sentence)

AttributeError: 'Speller' object has no attribute 'autocorrect'

In [7]:
#!pip install spellchecker

In [8]:
from spellchecker import SpellChecker
spell = SpellChecker()
misspelled = spell.unknown(['Nturael', 'Languag', 'procesing'])

for word in misspelled:
    print(spell.correction(word))
    print(spell.candidates(word))

ModuleNotFoundError: No module named 'indexer'

In [9]:
#!pip install textblob

In [10]:
#!pip install editdistance

In [52]:
import editdistance
editdistance.eval('banana', 'bahama')

2

In [11]:
#!pip install Levenshtein