# Imports

In [14]:
!pip install pytrec_eval
from main import preprocess_corpus, reduce_spelling_errors_corpus_length, calculate_med, get_top_k_words, s_at_K_for_every_incorrect_token, compute_avg_at_K
import time
import multiprocessing as mp
import json



# Function to pre-process the spelling error corpus
- Corpus Name: Birkbeck Corpus
- Retrieved from: https://www.dcs.bbk.ac.uk/~ROGER/corpora.html

In [2]:
# Load the spelling-error corpus from disk and report how many entries
# the pre-processing step produced.
CORPUS_PATH = './missp.dat.txt'
df = preprocess_corpus(CORPUS_PATH)

print(f'Total number of misspelled words : {len(df)}')

Total number of misspelled words : 36133


# Change misspelled corpus length

In [3]:
# Shrink the corpus so the later top-k search finishes in reasonable time.
# NOTE(review): the exact meaning of the numeric argument is defined in
# main.reduce_spelling_errors_corpus_length; the recorded run returned 557
# entries for an argument of 600 -- confirm against that function.
# Caution: this cell overwrites `df` with a function of itself, so running
# it a second time applies the reduction to the already-reduced corpus.
CORPUS_LENGTH_ARG = 600
df = reduce_spelling_errors_corpus_length(df, CORPUS_LENGTH_ARG)

print(len(df))

557


# Comparison of our Minimum Edit Distance (MED) function with the built-in NLTK function

In [4]:
from nltk.metrics import distance

def med_nltk(str1:str, str2:str)->int:
  """Return the edit distance between two strings as computed by NLTK.

  Serves as the reference implementation that ``calculate_med`` is compared
  against in this notebook.

  Args:
    str1: First string.
    str2: Second string.

  Returns:
    The value of ``nltk.metrics.distance.edit_distance`` with a substitution
    cost of 2 and transpositions disabled.
  """
  # Options are fixed here so every comparison uses the same cost model.
  nltk_options = {"substitution_cost": 2, "transpositions": False}
  return distance.edit_distance(str1, str2, **nltk_options)

# Word pairs scored by both implementations; for each pair our result is
# printed first and the NLTK result directly below it.
comparison_pairs = [
  ("hello", "hell"),
  ("random", "randomizer"),
  ("saiyan", "senorita"),
  ("des", ""),
  ("", "asa"),
  ("", ""),
]

for first_word, second_word in comparison_pairs:
  print(calculate_med(first_word, second_word))
  print(med_nltk(first_word, second_word))

1
1
4
4
8
8
3
3
3
3
0
0


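Our `calculate_med` returns the same distance as NLTK's `edit_distance` (substitution cost 2, no transpositions) for all six test pairs above, including the empty-string cases.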

# Getting the top K words for the first 15 misspelled tokens
The top-k words for a misspelled token are the k dictionary words with the smallest distance to it according to the Minimum Edit Distance (MED) algorithm. Each incorrect word is compared against all the words in the WordNet dictionary. In our assignment, the values of k are {1, 5, 10}.

## Case 1: No parallelization present

In [6]:
# Sequential baseline: look up the top-k words for the first 15 entries one
# after another and time the whole pass.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
results = [get_top_k_words(row) for row in df[:15]]
end = time.perf_counter()

print(f"time taken: {end - start}")

time taken: 42.91403007507324


## Case 2: With Parallelization

In [13]:
def main(processes=16):
    """Time the top-k lookup for the first 15 corpus entries using a process pool.

    Args:
        processes: Number of worker processes in the pool. Defaults to 16,
            the value that was previously hard-coded.

    Returns:
        The list obtained by mapping ``get_top_k_words`` over ``df[:15]``
        (previously computed and then discarded).
    """
    # perf_counter() is monotonic and meant for interval timing.
    start = time.perf_counter()
    with mp.Pool(processes=processes) as p:
        results = p.map(get_top_k_words, df[:15])

    # Measured after the pool context exits, so pool teardown is included,
    # exactly as in the original measurement.
    end = time.perf_counter()

    print(f"time taken: {end - start}")
    return results

if __name__ == "__main__":
    main()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data]   P

time taken: 12.537324666976929


# Getting the top K words for the entire dataset

In [16]:
def main(processes=16, output_path='final_results.json'):
    """Compute the top-k words for every corpus entry in parallel and save them.

    Args:
        processes: Number of worker processes in the pool. Defaults to 16,
            the value that was previously hard-coded.
        output_path: JSON file the results are written to. Defaults to the
            previously hard-coded 'final_results.json'.

    Returns:
        The list obtained by mapping ``get_top_k_words`` over ``df``; the
        same data that is written to ``output_path``.
    """
    # perf_counter() is monotonic and meant for interval timing.
    start = time.perf_counter()
    with mp.Pool(processes=processes) as p:
        results = p.map(get_top_k_words, df)

    # Stop the clock before the file write so only the computation (plus pool
    # teardown) is timed, as in the original measurement.
    end = time.perf_counter()

    # Persist the results so the evaluation cells can reload them without
    # repeating this long-running computation.
    with open(output_path, 'w') as f:
        json.dump(results, f)

    print(f"time taken: {end - start}")
    return results

if __name__ == "__main__":
    main()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dawarwaqar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data]   Package wordnet i

time taken: 311.5308048725128


# Calculating S@K 

In [18]:
# Reload the saved top-k results from disk instead of recomputing them.
# `results` is assigned only once the file has been read successfully, so a
# missing or unreadable file no longer replaces existing results with [].
with open('final_results.json', 'r') as f:
    results = json.load(f)

# Score every misspelled token against its retrieved candidates.
s_at_K = s_at_K_for_every_incorrect_token(results)

# Calculating Average Success at K

In [19]:
# Combine the per-token S@K values into a single summary and show it.
# NOTE(review): the recorded output is a dict with keys 's@1', 's@5' and
# 's@10'; the exact averaging is defined in main.compute_avg_at_K -- confirm there.
avg_succ_at_K = compute_avg_at_K(s_at_K)
print(avg_succ_at_K)

{'s@1': 0.2911392405063291, 's@5': 0.4484629294755877, 's@10': 0.5135623869801085}
