In [90]:
import re
from collections import Counter
import numpy as np
import pandas as pd
from string_manipulations import TextManipulations

## **Data Processing:**

In [91]:
def process_data(file_name):
    """
    Read a text corpus from disk and split it into lower-cased word tokens.

    Input: 
        file_name: path of the text file to read (opened in text mode with the
                   platform's default encoding).
    Output: 
        words: a list containing all the words in the corpus, in lower case and in
               the order they appear. A "word" is any maximal run of `\\w` characters,
               so punctuation and whitespace act as separators.
    """
    # The context manager guarantees the handle is closed, even if read() raises.
    with open(file_name, "r") as corpus_file:
        text_lowercase = corpus_file.read().lower()
    return re.findall(r'\w+', text_lowercase)

In [92]:
# Tokenize the corpus; the relative path means 'shakespeare.txt' must be resolvable
# from the kernel's working directory.
word_l = process_data('shakespeare.txt')
# Collapsing the token list into a set leaves one entry per distinct word.
vocab = set(word_l)  # this will be your new vocabulary
# Sanity check: peek at the first tokens and report the vocabulary size.
print(f"The first ten words in the text are: \n{word_l[0:10]}")
print(f"There are {len(vocab)} unique words in the vocabulary.")

The first ten words in the text are: 
['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the']
There are 6116 unique words in the vocabulary.


In [93]:
def get_count(word_l):
    '''
    Tally how many times each word occurs in the corpus.

    Input:
        word_l: an iterable of word tokens; repeated tokens are what gets counted.
    Output:
        word_count_dict: a collections.Counter (dict subclass) mapping each distinct
                         word to its number of occurrences.
    '''
    return Counter(word_l)

In [94]:
# Count occurrences over the full token list (not the vocab set, where every count would be 1).
word_count_dict = get_count(word_l)
# One key per distinct word, so this should agree with the vocabulary size.
print(f"There are {len(word_count_dict)} key values pairs")
# .get with a default of 0 avoids a KeyError if the word is absent from the corpus.
print(f"The count for the word 'thee' is {word_count_dict.get('thee',0)}")

There are 6116 key values pairs
The count for the word 'thee' is 240


In [95]:
def get_probs(word_count_dict):
    '''
    Convert raw word counts into relative frequencies.

    Input:
        word_count_dict: mapping of word -> number of occurrences in the corpus.
    Output:
        probs: a dict with the same keys (in the same order) whose values are
               count / total count across all words. An empty input yields an
               empty dict.
    '''
    # Denominator: total number of tokens represented by the counts.
    corpus_size = sum(word_count_dict.values())
    return {token: freq / corpus_size for token, freq in word_count_dict.items()}

In [96]:
# Turn the counts into relative frequencies over the whole corpus.
probs = get_probs(word_count_dict)
print(f"Length of probs is {len(probs)}")
# Direct indexing: this raises KeyError if 'thee' is not in the corpus.
print(f"P('thee') is {probs['thee']:.4f}")

Length of probs is 6116
P('thee') is 0.0045


## **Text Manipulations**

### **Delete Letter:**

In [97]:
# Demo of the delete edit with verbose tracing enabled.
# NOTE(review): called on the class itself rather than an instance — presumably
# delete_letter is a static/class method; confirm in string_manipulations.
delete_word_l = TextManipulations.delete_letter(word="cans", 
                                                verbose=True)

input word cans, 
split_l = [('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's')], 
delete_l = ['ans', 'cns', 'cas', 'can']


In [98]:
# Quick size check of the delete-edit result for a two-letter word.
print(f"Number of outputs of delete_letter('at') is {len(TextManipulations.delete_letter('at'))}")

Number of outputs of delete_letter('at') is 2


### **Switch Letter:**

In [99]:
# Demo of the switch edit with verbose tracing enabled.
# NOTE(review): called on the class itself rather than an instance — presumably
# switch_letter is a static/class method; confirm in string_manipulations.
switch_word_l = TextManipulations.switch_letter(word="eta",
                                                verbose=True)

Input word = eta 
split_l = [('', 'eta'), ('e', 'ta'), ('et', 'a')] 
switch_l = ['tea', 'eat']


In [100]:
# Quick size check of the switch-edit result for a two-letter word.
print(f"Number of outputs of switch_letter('at') is {len(TextManipulations.switch_letter('at'))}")

Number of outputs of switch_letter('at') is 1


### **Replace Letter:**

In [101]:
# Shared TextManipulations instance: the cells below call its edit methods, and
# get_corrections reads this name as a global rather than taking it as a parameter.
Text_Manipulation = TextManipulations()

In [102]:
# Demo of the replace edit with verbose tracing enabled.
replace_l = Text_Manipulation.replace_letter(word='can',
                              verbose=True)

Input word = can 
split_l = [('', 'can'), ('c', 'an'), ('ca', 'n')] 
replace_l ['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan']


In [103]:
# Quick size check of the replace-edit result for a two-letter word.
print(f"Number of outputs of replace_letter('at') is {len(Text_Manipulation.replace_letter('at'))}")

Number of outputs of replace_letter('at') is 50


### **Insert Letter:**

In [104]:
# Demo of the insert edit with verbose tracing enabled.
# Stored under its own name so the replace_letter result held in `replace_l`
# is not overwritten with insert-edit candidates.
insert_l = Text_Manipulation.insert_letter(word='can',
                                           verbose=True)

Input word can 
split_l = [('', 'can'), ('c', 'an'), ('ca', 'n'), ('can', '')] 
insert_l = ['acan', 'caan', 'caan', 'cana', 'bcan', 'cban', 'cabn', 'canb', 'ccan', 'ccan', 'cacn', 'canc', 'dcan', 'cdan', 'cadn', 'cand', 'ecan', 'cean', 'caen', 'cane', 'fcan', 'cfan', 'cafn', 'canf', 'gcan', 'cgan', 'cagn', 'cang', 'hcan', 'chan', 'cahn', 'canh', 'ican', 'cian', 'cain', 'cani', 'jcan', 'cjan', 'cajn', 'canj', 'kcan', 'ckan', 'cakn', 'cank', 'lcan', 'clan', 'caln', 'canl', 'mcan', 'cman', 'camn', 'canm', 'ncan', 'cnan', 'cann', 'cann', 'ocan', 'coan', 'caon', 'cano', 'pcan', 'cpan', 'capn', 'canp', 'qcan', 'cqan', 'caqn', 'canq', 'rcan', 'cran', 'carn', 'canr', 'scan', 'csan', 'casn', 'cans', 'tcan', 'ctan', 'catn', 'cant', 'ucan', 'cuan', 'caun', 'canu', 'vcan', 'cvan', 'cavn', 'canv', 'wcan', 'cwan', 'cawn', 'canw', 'xcan', 'cxan', 'caxn', 'canx', 'ycan', 'cyan', 'cayn', 'cany', 'zcan', 'czan', 'cazn', 'canz']


In [105]:
# Quick size check of the insert-edit result for a two-letter word.
# The label names insert_letter so that it matches the method actually being called.
print(f"Number of outputs of insert_letter('at') is {len(Text_Manipulation.insert_letter('at'))}")

Number of outputs of replace_letter('at') is 78


### **Edit One Letter:**

In [106]:
# Collect every candidate produced by edit_one_letter for a short sample word.
tmp_word = "at"
tmp_edit_one_set = Text_Manipulation.edit_one_letter(tmp_word)
# Sorting gives a stable, readable listing regardless of the collection's own order.
tmp_edit_one_l = sorted(list(tmp_edit_one_set))

print(f"input word {tmp_word} \nedit_one_l \n{tmp_edit_one_l}\n")
# Note: this recomputes the candidates for 'at' instead of reusing tmp_edit_one_set.
print(f"Number of outputs from edit_one_letter('at') is {len(Text_Manipulation.edit_one_letter('at'))}")

input word at 
edit_one_l 
['a', 'aa', 'aat', 'ab', 'abt', 'ac', 'act', 'ad', 'adt', 'ae', 'aet', 'af', 'aft', 'ag', 'agt', 'ah', 'aht', 'ai', 'ait', 'aj', 'ajt', 'ak', 'akt', 'al', 'alt', 'am', 'amt', 'an', 'ant', 'ao', 'aot', 'ap', 'apt', 'aq', 'aqt', 'ar', 'art', 'as', 'ast', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz', 'au', 'aut', 'av', 'avt', 'aw', 'awt', 'ax', 'axt', 'ay', 'ayt', 'az', 'azt', 'bat', 'bt', 'cat', 'ct', 'dat', 'dt', 'eat', 'et', 'fat', 'ft', 'gat', 'gt', 'hat', 'ht', 'iat', 'it', 'jat', 'jt', 'kat', 'kt', 'lat', 'lt', 'mat', 'mt', 'nat', 'nt', 'oat', 'ot', 'pat', 'pt', 'qat', 'qt', 'rat', 'rt', 'sat', 'st', 't', 'ta', 'tat', 'tt', 'uat', 'ut', 'vat', 'vt', 'wat', 'wt', 'xat', 'xt', 'yat', 'yt', 'zat', 'zt']

Number of outputs from edit_one_letter('at') is 129


### **Edit Two Letters:**

In [107]:
# Candidates produced by edit_two_letters for the single-letter word "a".
tmp_edit_two_set = Text_Manipulation.edit_two_letters("a")
# Sorted so the first/last slices printed below are deterministic.
tmp_edit_two_l = sorted(list(tmp_edit_two_set))
print(f"Number of strings with edit distance of two: {len(tmp_edit_two_l)}")
print(f"First 10 strings {tmp_edit_two_l[:10]}")
print(f"Last 10 strings {tmp_edit_two_l[-10:]}")
# Separate check on a different input ('at'), computed fresh rather than from the set above.
print(f"Number of strings that are 2 edit distances from 'at' is {len(Text_Manipulation.edit_two_letters('at'))}")

Number of strings with edit distance of two: 2654
First 10 strings ['', 'a', 'aa', 'aaa', 'aab', 'aac', 'aad', 'aae', 'aaf', 'aag']
Last 10 strings ['zv', 'zva', 'zw', 'zwa', 'zx', 'zxa', 'zy', 'zya', 'zz', 'zza']
Number of strings that are 2 edit distances from 'at' is 7154


## **Get Auto-Correction Results:**

In [88]:
def get_corrections(word, probs, vocab, n=2, verbose = False):
    '''
    Suggest the most likely intended words for a user-entered string.

    Candidates are gathered in three priority tiers: the word itself (when known),
    then known words from edit_one_letter, then known words from edit_two_letters.
    Within the two edit tiers candidates are ordered by descending probability,
    with ties broken alphabetically so the result is reproducible between runs.
    A word that shows up in more than one tier is kept only at its first
    (highest-priority) position, so the result never lists a word twice.

    Relies on the notebook-level `Text_Manipulation` instance for the edit methods.

    Input: 
        word: a user entered string to check for suggestions
        probs: a dictionary that maps each word to its probability in the corpus;
               membership in this dictionary is what makes a candidate "known"
        vocab: a set containing all the vocabulary (currently unused; kept so
               existing callers do not break)
        n: number of possible word corrections you want returned
        verbose: when True, print the entered word and the set of suggested words
    Output: 
        n_best: a list of at most n (word, probability) tuples, best candidate first.
    '''
    def ranked(candidates):
        # Keep only known words; sort by probability (high to low), then by word.
        return sorted(
            ((w, probs[w]) for w in candidates if w in probs),
            key=lambda pair: (-pair[1], pair[0]),
        )

    tiers = [
        [(word, probs[word])] if word in probs else [],
        ranked(Text_Manipulation.edit_one_letter(word)),
        ranked(Text_Manipulation.edit_two_letters(word)),
    ]

    # Merge the tiers in priority order, skipping words already taken from an
    # earlier tier (the edit-two candidates can repeat earlier ones).
    seen = set()
    res = []
    for tier in tiers:
        for candidate, prob in tier:
            if candidate not in seen:
                seen.add(candidate)
                res.append((candidate, prob))

    n_best = res[:n]
    suggestions = set(candidate for candidate, _ in n_best)

    if verbose: print("entered word = ", word, "\nsuggestions = ", suggestions)
    return n_best

In [108]:
# Try the corrector on a sample input and request the top two suggestions.
my_word = 'dys' 
tmp_corrections = get_corrections(my_word, probs, vocab, 2, verbose=True) # keep verbose=True
# Each entry is a (word, probability) tuple, best candidate first.
for i, word_prob in enumerate(tmp_corrections):
    print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")

entered word =  dys 
suggestions =  {'days', 'dye'}
word 0: days, probability 0.000410
word 1: dye, probability 0.000019
