# My own 'did you mean?'-correction tool. A humble beginning...
### By Hans Martin Aannestad

## 1.1 Finding collocations

In [1]:
import nltk
from nltk.collocations import *

# • Consider sequences of 2 words (bigrams).

# Collocation candidates are bigrams over (word, universal POS tag) pairs
# taken from the Brown corpus, so every bigram is a pair of (word, tag) tuples.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(
    nltk.corpus.brown.tagged_words(tagset='universal')
)

# (1) Frequency plus part-of-speech filtering: drop bigrams seen fewer than
#     3 times; only adjective/noun tags are accepted further down.
finder.apply_freq_filter(3)
valid = ['ADJ', 'NOUN']

# (2) Hypothesis testing (see slides for Lecture 2) with the built-in
#     Student's t-test. Only bigrams scoring above 2.645 are kept; adjust the
#     threshold to the confidence level you prefer.
coll = sorted(finder.above_score(bigram_measures.student_t, 2.645))

# Keep bigrams whose two tags are both in `valid`, reduced to lowercase
# (word, word) pairs with the tags stripped.
coll_v = [
    (first[0].lower(), second[0].lower())
    for first, second in coll
    if first[1] in valid and second[1] in valid
]

# • Generate files containing the collocations: one tuple repr per line.
with open('collocations.txt', 'w') as filehandle:
    for item in coll_v:
        filehandle.write('%s\n' % str(item))


## 1.2 Correction tool

In [2]:
## • Suggestion: Use WordNet to detect synonyms.

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import words
from nltk import word_tokenize
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

# Reference vocabulary for the spelling step: the word list of NLTK's `words`
# corpus. correct_spelling() scans this list for every token it corrects.
correct_spellings = words.words()

def correct(s):
    """Return a 'did you mean' message for the phrase *s*.

    The phrase is lowercased and tokenized, every token is passed through
    correct_spelling(), and then each pair of neighbouring tokens is passed
    through correct_collocations().

    Returns "All good!" when the corrected tokens equal the tokenized input,
    otherwise "Did you mean: <corrected phrase>?" with the tokens joined by
    single spaces.
    """
    original_tokens = word_tokenize(s.lower())
    suggestion = correct_spelling(original_tokens)

    # Walk the adjacent pairs left to right. Each pair is written back before
    # the next one is read, so a correction made to a pair's right-hand word
    # is what the following pair sees as its left-hand word.
    for right in range(1, len(suggestion)):
        left = right - 1
        suggestion[left], suggestion[right] = correct_collocations(
            suggestion[left], suggestion[right]
        )

    if suggestion != original_tokens:
        return "Did you mean: " + ' '.join(suggestion) + "?"
    return "All good!"
        
def correct_spelling(raw_in):
    """Map every token to the closest word in ``correct_spellings``.

    For each token the candidates are the vocabulary words that start with the
    same character. The winner is the candidate with the smallest Jaccard
    distance between the two words' character-bigram sets; on ties the
    candidate that comes first in the vocabulary wins.

    Tokens that cannot be compared are passed through unchanged instead of
    raising:

    * tokens shorter than two characters have no bigrams, and comparing two
      empty bigram sets makes the Jaccard distance divide by zero;
    * tokens for which no vocabulary word shares the first character (for
      example punctuation or digits) have no candidate to pick from.

    Args:
        raw_in: iterable of token strings.

    Returns:
        A new list with one string per input token, in input order.
    """
    out = []
    for entry in raw_in:
        if len(entry) < 2:
            # No bigrams to compare; keep the token as typed.
            out.append(entry)
            continue

        # The token's bigram set is the same for every candidate, so build it once.
        entry_bigrams = set(ngrams(entry, 2))
        first_char = entry[0]
        scored = [
            (jaccard_distance(entry_bigrams, set(ngrams(w, 2))), w)
            for w in correct_spellings
            if w.startswith(first_char)
        ]

        if not scored:
            # Nothing in the vocabulary starts with this character.
            out.append(entry)
            continue

        # min() returns the first minimal element, matching a stable sort's head.
        out.append(min(scored, key=lambda val: val[0])[1])
    return out
     
def _shares_synset_lemma(word, candidate):
    """Return True if *candidate* is a lemma name of any WordNet synset of *word*."""
    return any(candidate in ss.lemma_names() for ss in wn.synsets(word))


def correct_collocations(w1, w2):
    """Replace the pair (w1, w2) with a known collocation when one word is a synonym.

    The pair is compared against the collocations in ``coll_v``:

    1. If (w1, w2) is itself a known collocation it is returned unchanged.
       This is checked against the whole list first, so a synonym match on an
       earlier entry can never override a pair that is already valid.
    2. Otherwise the list is scanned in order. For an entry whose first word
       equals w1, the entry is returned if w2 is a WordNet lemma name of one
       of the entry's second word's synsets. For an entry whose second word
       equals w2, the entry is returned if w1 is a lemma name of one of the
       entry's first word's synsets.
    3. If nothing matches, (w1, w2) is returned unchanged.

    Args:
        w1: first word of the pair.
        w2: second word of the pair.

    Returns:
        A 2-tuple of strings: either the input pair or the matching collocation.
    """
    if (w1, w2) in coll_v:
        return (w1, w2)  # already a known collocation, no correction needed

    for first, second in coll_v:
        # First word matches: is w2 a synonym of the collocation's second word?
        if w1 == first and _shares_synset_lemma(second, w2):
            return (first, second)

        # Second word matches: is w1 a synonym of the collocation's first word?
        if w2 == second and _shares_synset_lemma(first, w1):
            return (first, second)

    return (w1, w2)
        

In [3]:
from ipywidgets import widgets
from IPython.display import display

# Build the three widgets up front; `output` must exist before the callbacks
# are defined because they are decorated with output.capture().
text = widgets.Text()
button = widgets.Button(description="Check for corrections")
output = widgets.Output()


@output.capture()
def handle_submit(sender):
    """Text-box submit handler: only clears the output area."""
    output.clear_output()


@output.capture()
def on_button_clicked(b):
    """Button handler: clear the output area, then print correct() for the typed text."""
    output.clear_output()

    print(correct(text.value))


button.on_click(on_button_clicked)
text.on_submit(handle_submit)

# Lay out the UI: prompt, text box, button, then the output area.
print("Enter two words. For example: usual sense")
display(text)
display(button)
display(output)

Enter two words. For example: usual sense


Text(value='')

Button(description='Check for corrections', style=ButtonStyle())

Output()

In [4]:
# Scratch check: number of whitespace-separated tokens in the sample phrases, repeats included.
len("Uninteresting painting A beautiful classic Bad quality Pointless theme beautiful my favourite painting".split())

13

In [5]:
# Scratch check: number of distinct (case-sensitive) tokens in the same sample phrases.
len(set("Uninteresting painting A beautiful classic Bad quality Pointless theme beautiful my favourite painting".split()))

11