In [2]:
import re

from nltk.corpus import brown
from nltk.tokenize import word_tokenize
# Load the Brown corpus as one flat sequence of word tokens
corpus = brown.words()

# Case-fold every token, then keep the distinct tokens as the vocabulary
lower_case_corpus = [token.lower() for token in corpus]
vocab = set(lower_case_corpus)

# Preview the first tokens and a handful of vocabulary entries
print(f"CORPUS EXAMPLE: {lower_case_corpus[:30]}\n\n")
print(f"VOCAB EXAMPLE: {list(vocab)[:10]}")

CORPUS EXAMPLE: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.', 'the', 'jury', 'further', 'said', 'in']


VOCAB EXAMPLE: ['connor', 'miraculously', 'disinterred', 'camden', 'insinuations', 'perspired', 'attended', 'pragmatism', 'painter', 'mac']


In [3]:
# Read a sentence, split it into tokens and case-fold them so they can be
# compared against the lower-cased corpus.
user_input = input("Enter a sentence : ")
words = [token.lower() for token in word_tokenize(user_input)]
print(words)

Enter a sentence :  I am the kng


['i', 'am', 'the', 'kng']


In [4]:
# Input tokens that never occur in the lower-cased corpus are reported as
# missing or misspelled.
corpus = set(lower_case_corpus)
input_tokens = set(words)
incorrect_words = input_tokens.difference(corpus)
print(f"Missing/Incorrect words: {incorrect_words}")

Missing/Incorrect words: {'kng'}


In [5]:
def find_previous_word(word_list, target_word):
    """Return the word right before the first occurrence of ``target_word``.

    ``None`` is returned when ``target_word`` is the first element of
    ``word_list`` or does not occur in it at all.
    """
    preceding = None

    for current in word_list:
        if current == target_word:
            # `preceding` is still None when the match is the first element
            return preceding
        preceding = current

    # Walked the whole list without meeting the target
    return None

# Example list of words
word_list = words

# Example target words.
# Work on a copy: popping from `incorrect_words` itself would drain that set,
# so running this cell a second time would fail on an empty set.
target_words = set(incorrect_words)

if not target_words:
    # set.pop() on an empty set raises a bare KeyError; say what actually
    # happened instead.
    raise ValueError("No missing/incorrect words were found in the input sentence.")

# Take one (arbitrary) unknown word as the word to correct
element = target_words.pop()
target_word = str(element)
print("Target word:", target_word)

# Call the function to find the previous word of the target word
previous_word = find_previous_word(word_list, target_word)

if previous_word is not None:
    print("Previous word of '{}' is '{}'.".format(target_word, previous_word))
else:
    print("No previous word found for '{}'.".format(target_word))


Target word: kng
Previous word of 'kng' is 'the'.


In [6]:
bigram_counts = {}
trigram_counts = {}

# Count every pair of adjacent tokens. zip() stops at the shorter input, so
# this visits all len - 1 bigrams, including the one formed by the last two
# tokens of the corpus.
for bigram in zip(lower_case_corpus, lower_case_corpus[1:]):
    bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

# Count every run of three adjacent tokens (len - 2 trigrams).
for trigram in zip(lower_case_corpus, lower_case_corpus[1:], lower_case_corpus[2:]):
    trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

# print("Example, count for bigram ('the', 'king') is: " + str(bigram_counts[('the', 'king')]))

In [7]:
def calculate_next_word_probabilities(previous_word, bigram_counts, trigram_counts=None):
    """Estimate P(next word | previous_word) from bigram counts.

    Args:
        previous_word: The context word whose followers are ranked.
        bigram_counts: Mapping of ``(first, second)`` tuples to how often that
            pair occurs.
        trigram_counts: Accepted so existing three-argument calls keep
            working; it is not used. With only one word of context there is
            no two-word history to condition a trigram estimate on.

    Returns:
        A dict mapping every word seen directly after ``previous_word`` to its
        relative frequency among those followers, ordered from most to least
        probable. Empty when no bigram starts with ``previous_word``.
    """
    follower_counts = {}
    total_bigrams = 0

    # Single pass: collect the followers and the normalising total together
    for bigram, count in bigram_counts.items():
        if bigram[0] == previous_word:
            follower_counts[bigram[1]] = count
            total_bigrams += count

    probabilities = {
        next_word: count / total_bigrams
        for next_word, count in follower_counts.items()
    }

    # Sort probabilities in descending order; sorted() is stable, so equally
    # probable words keep the order in which they appear in bigram_counts
    sorted_probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))

    return sorted_probabilities

# Example usage: rank every word observed right after `previous_word`

probable_words = calculate_next_word_probabilities(previous_word, bigram_counts, trigram_counts)
print(len(probable_words))
print(f"Probabilities of words following '{previous_word}' in the corpus (descending order):")
for word, prob in probable_words.items():
    print(f"{word}: {prob}")



13945
Probabilities of words following 'the' in the corpus (descending order):
first: 0.009461062440153777
same: 0.008975146846550713
most: 0.005959611839190522
other: 0.005945320204084549
``: 0.005788112217918852
new: 0.005673779137071073
united: 0.005616612596647182
world: 0.005159280273256063
state: 0.0038730331137185403
two: 0.0038301582084006233
only: 0.0036872418573408983
time: 0.003587200411599091
way: 0.0034157007903274214
old: 0.003344242614797559
last: 0.003187034628631862
house: 0.0030869931828900543
next: 0.0030012433722542194
end: 0.0029440768318303298
fact: 0.00277257721055866
whole: 0.0027011190350287977
man: 0.0026296608594989354
american: 0.002586785954181018
door: 0.0024581612382272655
second: 0.002443869603121293
best: 0.0023581197924854584
great: 0.0022723699818496235
city: 0.0021580369010018438
past: 0.0021008703605779537
right: 0.0021008703605779537
president: 0.0021008703605779537
church: 0.0020579954552600362
present: 0.0019436623744122565
public: 0.001915079104

In [8]:
def edit_distance(str1, str2):
    """
    Return the Levenshtein distance between ``str1`` and ``str2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    """
    # previous_row[j] holds the distance between the prefix of str1 handled
    # so far and the first j characters of str2. Before any character of
    # str1 is consumed that is simply j insertions.
    previous_row = list(range(len(str2) + 1))

    for i, char1 in enumerate(str1, start=1):
        # Turning the first i characters of str1 into "" takes i deletions
        current_row = [i]
        for j, char2 in enumerate(str2, start=1):
            if char1 == char2:
                # Matching characters cost nothing extra
                cost = previous_row[j - 1]
            else:
                cost = 1 + min(previous_row[j],       # Deletion
                               current_row[j - 1],    # Insertion
                               previous_row[j - 1])   # Substitution
            current_row.append(cost)
        previous_row = current_row

    return previous_row[-1]

def get_closest_match(word, dictionary):
    """
    Return the entry of ``dictionary`` with the smallest edit distance to ``word``.

    When several entries share the smallest distance, the one met first while
    iterating ``dictionary`` wins. ``None`` is returned for an empty
    ``dictionary``.
    """
    # min() keeps the first of several equally small items, which matches a
    # strict "<" comparison in an explicit loop.
    return min(
        dictionary,
        key=lambda candidate: edit_distance(word, candidate),
        default=None,
    )



# Example usage: the candidate corrections are the words seen right after
# `previous_word`. They are ordered most probable first, so among candidates
# at the same edit distance the likelier follower is the one returned.
misspelled_word = target_word
correct_words = probable_words

suggested_correction = get_closest_match(misspelled_word, correct_words)

print("Incorrect word:", misspelled_word)
print("Suggested correction:", suggested_correction)


Incorrect word: kng
Suggested correction: king


In [9]:
# Put the suggested correction back into the sentence the user typed.
#  * `target_word` was lower-cased during tokenisation, so it is matched
#    case-insensitively against the original text.
#  * Only whole-word occurrences are replaced, so a short misspelling is not
#    rewritten inside a longer, correct word.
#  * Without a suggestion (no candidate words were available) the sentence is
#    left as typed; str.replace would raise TypeError on None.
if suggested_correction is None:
    corrected_sentence = user_input
else:
    misspelling_pattern = re.compile(
        r"(?<!\w)" + re.escape(target_word) + r"(?!\w)",
        flags=re.IGNORECASE,
    )
    # A function replacement inserts the correction verbatim, without any
    # backslash or group-reference processing.
    corrected_sentence = misspelling_pattern.sub(lambda _match: suggested_correction, user_input)
print(corrected_sentence)

I am the king
