# spellchk: default program

In [1]:
from default import *

## Documentation

Read `answer/default.py` starting with the `spellchk` function and see how it solves the task of spell correction using a pre-trained language model that can predict a replacement token for a masked token in the input.

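Every variant in this notebook keeps the masked-language-model proposals returned by `fill_mask` and changes only how `select_correction` picks one of them:

1. **Character overlap** – choose the candidate that shares the most distinct characters with the typo.
2. **Edit distance** – choose the candidate with the smallest Levenshtein distance to the lower-cased typo.
3. **Combined score** – mix a normalised edit-distance score with the language-model score (weight 0.95 on the edit-distance term) and restore an initial capital when the typo starts with one.
4. **Larger candidate pool** – same scoring as 3, but with `top_k` raised from 20 to 1000 so that the intended word is more likely to be among the candidates.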

In [2]:
from io import StringIO
# Feed three hand-written sample lines (locations, a tab, then the sentence)
# through spellchk and print each result back in the same tab-separated layout.
with StringIO("4\tit will put your maind into non-stop learning.\n \
              3\t`` Sad '' wss not the right word , of course .\n \
              5,14	Just before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' .") as f:
    for (locations, spellchk_sent) in spellchk(f):
        # locations are comma-joined; the corrected tokens are space-joined
        print("{locs}\t{sent}".format(
            locs=",".join([str(i) for i in locations]),
            sent=" ".join(spellchk_sent)
        ))

4	it will put your mind into non-stop learning.
3	`` Sad '' s not the right word , of course .
5,14	Just before Myra left -- cathy was saying good-by to Cathy , and she did realize I was near '' .


In [3]:
import numpy as np

def select_correction(typo, predict):
    """Pick the prediction that shares the most distinct characters with the typo.

    Args:
        typo: the misspelled word; it is lower-cased before comparison.
        predict: sequence of prediction dicts, each with a 'token_str' entry.

    Returns:
        The 'token_str' of the prediction with the largest character overlap.
        The earliest prediction wins ties.
    """
    typo_chars = set(typo.lower())
    # The overlap used to be divided by len(typo). That constant factor cannot
    # change which candidate wins, but it raised ZeroDivisionError for an empty
    # typo, so the raw overlap count is compared instead.
    overlap = [len(typo_chars.intersection(p['token_str'])) for p in predict]
    best_ind = np.argmax(overlap)
    return predict[best_ind]['token_str']

def spellchk(fh, top_k=20):
    """Yield (locations, corrected_tokens) for every sentence read from fh.

    Args:
        fh: input handle, passed unchanged to get_typo_locations.
        top_k: number of candidate replacements requested from fill_mask for
            each masked position (default 20, the previously hard-coded value).

    Yields:
        A tuple of the locations reported by get_typo_locations and a new list
        of tokens in which each listed position has been replaced by the token
        chosen by select_correction.
    """
    for (locations, sent) in get_typo_locations(fh):
        # Copy the tokens: a plain assignment only aliased the input, so the
        # list handed out by get_typo_locations was modified in place.
        spellchk_sent = list(sent)
        for i in locations:
            # Mask only position i. The working copy is used as context, so
            # corrections made earlier in this sentence remain visible here,
            # exactly as they were when the input list itself was mutated.
            masked = " ".join(
                mask if j == i else tok for j, tok in enumerate(spellchk_sent)
            )
            predict = fill_mask(masked, top_k=top_k)
            logging.info(predict)
            spellchk_sent[i] = select_correction(spellchk_sent[i], predict)
        yield (locations, spellchk_sent)

In [4]:
# Re-run the same three sample lines so the printed output can be compared
# with the previous run.
with StringIO("4\tit will put your maind into non-stop learning.\n \
              3\t`` Sad '' wss not the right word , of course .\n \
              5,14	Just before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' .") as f:
    for (locations, spellchk_sent) in spellchk(f):
        # locations are comma-joined; the corrected tokens are space-joined
        print("{locs}\t{sent}".format(
            locs=",".join([str(i) for i in locations]),
            sent=" ".join(spellchk_sent)
        ))

4	it will put your mind into non-stop learning.
3	`` Sad '' was not the right word , of course .
5,14	Just before Myra left -- susie was saying good-by to Cathy , and she immediately realize I was near '' .


In [5]:
import Levenshtein

def select_correction(typo, predict):
    """Choose the predicted token closest to the typo in Levenshtein distance.

    The typo is lower-cased before the comparison; candidate tokens are used
    as given. When several candidates share the smallest distance, the one
    that appears first in `predict` is returned.
    """
    target = typo.lower()
    candidates = [entry['token_str'] for entry in predict]
    distances = [Levenshtein.distance(target, cand) for cand in candidates]
    closest = np.argmin(distances)
    return candidates[closest]

def spellchk(fh, top_k=20):
    """Yield (locations, corrected_tokens) for every sentence read from fh.

    Args:
        fh: input handle, passed unchanged to get_typo_locations.
        top_k: number of candidate replacements requested from fill_mask for
            each masked position (default 20, the previously hard-coded value).

    Yields:
        A tuple of the locations reported by get_typo_locations and a new list
        of tokens in which each listed position has been replaced by the token
        chosen by select_correction.
    """
    for (locations, sent) in get_typo_locations(fh):
        # Copy the tokens: a plain assignment only aliased the input, so the
        # list handed out by get_typo_locations was modified in place.
        spellchk_sent = list(sent)
        for i in locations:
            # Mask only position i. The working copy is used as context, so
            # corrections made earlier in this sentence remain visible here,
            # exactly as they were when the input list itself was mutated.
            masked = " ".join(
                mask if j == i else tok for j, tok in enumerate(spellchk_sent)
            )
            predict = fill_mask(masked, top_k=top_k)
            logging.info(predict)
            spellchk_sent[i] = select_correction(spellchk_sent[i], predict)
        yield (locations, spellchk_sent)

In [6]:
# Re-run the same three sample lines so the printed output can be compared
# with the earlier runs.
with StringIO("4\tit will put your maind into non-stop learning.\n \
              3\t`` Sad '' wss not the right word , of course .\n \
              5,14	Just before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' .") as f:
    for (locations, spellchk_sent) in spellchk(f):
        # locations are comma-joined; the corrected tokens are space-joined
        print("{locs}\t{sent}".format(
            locs=",".join([str(i) for i in locations]),
            sent=" ".join(spellchk_sent)
        ))

4	it will put your mind into non-stop learning.
3	`` Sad '' was not the right word , of course .
5,14	Just before Myra left -- she was saying good-by to Cathy , and she did realize I was near '' .


In [9]:
def select_correction(typo, predict, w=0.95):
    """Pick a prediction by mixing edit-distance closeness with the model score.

    Args:
        typo: the misspelled word; it is lower-cased before the distance is
            computed.
        predict: sequence of prediction dicts with 'token_str' and 'score'.
        w: weight of the edit-distance term; the model score gets 1 - w.
            Defaults to 0.95, the previously hard-coded value.

    Returns:
        The winning 'token_str'. If the typo starts with an upper-case letter
        the first character of the token is upper-cased and the rest of the
        token is left as predicted.
    """
    tokens = [p['token_str'] for p in predict]
    lm_score = np.array([p['score'] for p in predict], dtype=float)
    edit_dist = np.array(
        [Levenshtein.distance(typo.lower(), tok) for tok in tokens], dtype=float
    )
    # Scale distances to [0, 1] so that 1 is the closest candidate.
    worst = edit_dist.max()
    if worst > 0:
        edit_score = (worst - edit_dist) / worst
    else:
        # Every candidate is at distance 0; avoid the 0/0 division (NaN plus a
        # RuntimeWarning) and let the model score decide.
        edit_score = np.ones_like(edit_dist)
    score_tot = w * edit_score + (1 - w) * lm_score
    best = tokens[int(np.argmax(score_tot))]
    # typo[:1] is safe for an empty typo, unlike typo[0].
    if typo[:1].isupper():
        # str.capitalize() would also lower-case the remaining characters
        # (e.g. "McDonald" -> "Mcdonald"); only the first letter is changed.
        return best[:1].upper() + best[1:]
    return best
def spellchk(fh, top_k=20):
    """Yield (locations, corrected_tokens) for every sentence read from fh.

    Args:
        fh: input handle, passed unchanged to get_typo_locations.
        top_k: number of candidate replacements requested from fill_mask for
            each masked position (default 20, the previously hard-coded value).

    Yields:
        A tuple of the locations reported by get_typo_locations and a new list
        of tokens in which each listed position has been replaced by the token
        chosen by select_correction.
    """
    for (locations, sent) in get_typo_locations(fh):
        # Copy the tokens: a plain assignment only aliased the input, so the
        # list handed out by get_typo_locations was modified in place.
        spellchk_sent = list(sent)
        for i in locations:
            # Mask only position i. The working copy is used as context, so
            # corrections made earlier in this sentence remain visible here,
            # exactly as they were when the input list itself was mutated.
            masked = " ".join(
                mask if j == i else tok for j, tok in enumerate(spellchk_sent)
            )
            predict = fill_mask(masked, top_k=top_k)
            logging.info(predict)
            spellchk_sent[i] = select_correction(spellchk_sent[i], predict)
        yield (locations, spellchk_sent)

In [11]:
# Same three sample lines as before plus a fourth whose typo is the
# capitalised first word, printed in the same tab-separated layout.
with StringIO("4\tit will put your maind into non-stop learning.\n \
              3\t`` Sad '' wss not the right word , of course .\n \
              5,14	Just before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' .\n \
              0	OOviously the farm should be on an all-weather road .") as f:
    for (locations, spellchk_sent) in spellchk(f):
        # locations are comma-joined; the corrected tokens are space-joined
        print("{locs}\t{sent}".format(
            locs=",".join([str(i) for i in locations]),
            sent=" ".join(spellchk_sent)
        ))

4	it will put your mind into non-stop learning.
3	`` Sad '' was not the right word , of course .
5,14	Just before Myra left -- She was saying good-by to Cathy , and she did realize I was near '' .
0	Normally the farm should be on an all-weather road .


In [14]:
def select_correction(typo, predict, w=0.95):
    """Pick a prediction by mixing edit-distance closeness with the model score.

    Args:
        typo: the misspelled word; it is lower-cased before the distance is
            computed.
        predict: sequence of prediction dicts with 'token_str' and 'score'.
        w: weight of the edit-distance term; the model score gets 1 - w.
            Defaults to 0.95, the previously hard-coded value.

    Returns:
        The winning 'token_str'. If the typo starts with an upper-case letter
        the first character of the token is upper-cased and the rest of the
        token is left as predicted.
    """
    tokens = [p['token_str'] for p in predict]
    lm_score = np.array([p['score'] for p in predict], dtype=float)
    edit_dist = np.array(
        [Levenshtein.distance(typo.lower(), tok) for tok in tokens], dtype=float
    )
    # Scale distances to [0, 1] so that 1 is the closest candidate.
    worst = edit_dist.max()
    if worst > 0:
        edit_score = (worst - edit_dist) / worst
    else:
        # Every candidate is at distance 0; avoid the 0/0 division (NaN plus a
        # RuntimeWarning) and let the model score decide.
        edit_score = np.ones_like(edit_dist)
    score_tot = w * edit_score + (1 - w) * lm_score
    best = tokens[int(np.argmax(score_tot))]
    # typo[:1] is safe for an empty typo, unlike typo[0].
    if typo[:1].isupper():
        # str.capitalize() would also lower-case the remaining characters
        # (e.g. "McDonald" -> "Mcdonald"); only the first letter is changed.
        return best[:1].upper() + best[1:]
    return best
def spellchk(fh, top_k=1000):
    """Yield (locations, corrected_tokens) for every sentence read from fh.

    Args:
        fh: input handle, passed unchanged to get_typo_locations.
        top_k: number of candidate replacements requested from fill_mask for
            each masked position (default 1000, the previously hard-coded
            value).

    Yields:
        A tuple of the locations reported by get_typo_locations and a new list
        of tokens in which each listed position has been replaced by the token
        chosen by select_correction.
    """
    for (locations, sent) in get_typo_locations(fh):
        # Copy the tokens: a plain assignment only aliased the input, so the
        # list handed out by get_typo_locations was modified in place.
        spellchk_sent = list(sent)
        for i in locations:
            # Mask only position i. The working copy is used as context, so
            # corrections made earlier in this sentence remain visible here,
            # exactly as they were when the input list itself was mutated.
            masked = " ".join(
                mask if j == i else tok for j, tok in enumerate(spellchk_sent)
            )
            predict = fill_mask(masked, top_k=top_k)
            logging.info(predict)
            spellchk_sent[i] = select_correction(spellchk_sent[i], predict)
        yield (locations, spellchk_sent)

In [15]:
# Run the four sample lines again so the printed output can be compared with
# the previous four-line run.
with StringIO("4\tit will put your maind into non-stop learning.\n \
              3\t`` Sad '' wss not the right word , of course .\n \
              5,14	Just before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' .\n \
              0	OOviously the farm should be on an all-weather road .") as f:
    for (locations, spellchk_sent) in spellchk(f):
        # locations are comma-joined; the corrected tokens are space-joined
        print("{locs}\t{sent}".format(
            locs=",".join([str(i) for i in locations]),
            sent=" ".join(spellchk_sent)
        ))

4	it will put your mind into non-stop learning.
3	`` Sad '' was not the right word , of course .
5,14	Just before Myra left -- Sue was saying good-by to Cathy , and she did realize I was near '' .
0	Obviously the farm should be on an all-weather road .


## Analysis

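Do some analysis of the results. What ideas did you try? What worked and what did not?

Summary of the sample runs recorded above:

* **Imported default** – fixed `maind` → `mind`, but produced `s` for `wss`, `cathy` for `Sue` and `did` for `didm't`.
* **Character overlap** – fixed `wss` → `was`, but produced `susie` for `Sue` and `immediately` for `didm't`.
* **Edit distance only** – kept `mind` and `was`, but produced `she` for `Sue` and `did` for `didm't`.
* **Edit distance + model score + capitalisation (top_k = 20)** – produced `She` for `Sue` and `Normally` for `OOviously`.
* **Same scoring with top_k = 1000** – recovered `Sue` and `Obviously`; `didm't` was still replaced by `did`.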

## Group work

* username did this.
* username did this.
