In [1]:
import os
import sys

# Put the sibling script directory first on the import path so the local
# helper modules imported below can be found (presumably where `webnlg`
# lives -- confirm). `sys` is imported directly instead of being reached
# through `os.sys`, which is an undocumented attribute of the os module.
sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus

from enchant.checker import SpellChecker

from tqdm import tqdm

from collections import Counter, defaultdict

In [2]:
# One spell checker per English locale (British and American).
# get_spellings_errors() consults both of them for every text.
chkr_GB, chkr_US = (SpellChecker(locale) for locale in ('en_GB', 'en_US'))

def _flagged_lowercase_words(checker, text):
    """Return ``{(word, first suggestion or None)}`` for the lower-case words ``checker`` rejects in ``text``."""
    checker.set_text(text)

    flagged = set()
    # word and suggestion are read while iterating, one error at a time
    for err in checker:
        # words that are not entirely lower case are assumed to be proper nouns
        if not err.word.islower():
            continue
        # suggest() may come back empty; use None rather than indexing blindly
        suggestions = err.suggest()
        flagged.add((err.word, suggestions[0] if suggestions else None))
    return flagged


def get_spellings_errors(t):
    """Return the lower-case words of ``t`` rejected by both ``chkr_GB`` and ``chkr_US``.

    A word is reported only when both checkers flag it and agree on the
    first suggested correction (or both have no suggestion at all).

    Limitations:
      1. Only lower-case words are examined; anything else is assumed to
         be a proper noun.
      2. Wrong capitalisation, like brazil/Brazil, is ignored.

    Parameters
    ----------
    t : str
        Text to check.

    Returns
    -------
    list of str
        The misspelt words, sorted and without duplicates.
    """
    gb_misspellings = _flagged_lowercase_words(chkr_GB, t)
    us_misspellings = _flagged_lowercase_words(chkr_US, t)

    # only consider a word misspelt if it is flagged for both locales
    # with the same first suggestion
    agreed = gb_misspellings & us_misspellings

    # drop the cases where word and suggestion differ only by capitalisation;
    # a word with no suggestion cannot be such a case, so it is kept
    return sorted({
        word for word, suggestion in agreed
        if suggestion is None or word.lower() != suggestion.lower()
    })

In [3]:
# load all datasets: the three named splits are read in one call and kept in `db`
# NOTE(review): 'test_with_lex' is presumably the test split that includes
# reference texts -- confirm against WebNLGCorpus.load

db = WebNLGCorpus.load(['train', 'dev', 'test_with_lex'])

In [4]:
%%time

# one record per corpus entry that has at least one flagged word
result = []

for entry in tqdm(db):

    # union of the words flagged in any of this entry's reference texts
    misspellings = set()
    misspellings.update(*map(get_spellings_errors, entry.lexes()))

    # entries with nothing flagged are left out
    if not misspellings:
        continue

    result.append({'entry': entry, 'misspellings': misspellings})

100%|██████████| 9674/9674 [03:42<00:00, 43.44it/s] 

CPU times: user 3min 25s, sys: 11.3 s, total: 3min 36s
Wall time: 3min 42s





In [5]:
# invert `result`: for each flagged word, collect the entries it was found in
ebm = defaultdict(list)

for record in result:
    flagged_entry = record['entry']
    for word in record['misspellings']:
        ebm[word].append(flagged_entry)

In [6]:
# removes supposed misspellings if they occur in some of the entry's triples

def key_is_in_some_triple(key, entries):
    """Tell whether ``key`` occurs in any text triple of any of ``entries``.

    Each entry is asked for ``entry.triples(kind='text')`` and ``key`` is
    tested against every triple with ``in``; the search stops at the first
    hit. Returns ``False`` when nothing matches, including when ``entries``
    is empty.

    NOTE(review): the ``in`` test is case-sensitive, while the keys passed in
    come from get_spellings_errors and are always lower case, so capitalised
    or camelCase text in a triple never matches -- confirm this is intended.
    """
    return any(
        key in triple
        for entry in entries
        for triple in entry.triples(kind='text')
    )
    

# keep a word only when it appears in none of the triples of the entries it was flagged in
ebm = dict(
    (key, entries)
    for key, entries in ebm.items()
    if not key_is_in_some_triple(key, entries)
)

# Some examples

In [9]:
# every word still held in `ebm` at this point
ebm.keys()

dict_keys(['idenitifier', 'loaction', 'sytle', 'studiies', 'publshed', 'discipine', 'journeal', 'abbreivated', 'ajoblanco', 'ayam', 'bakewell', 'batchoy', 'binignit', 'potatotes', 'bionico', 'apium', 'affilaited', 'bananaman', 'varient', 'arros', 'penet', 'persea', "counrty's", 'originiates', 'menbers', 'legue', 'managerof', 'hardcovered', 'unrelatedly', 'unreleated', 'denonym', 'inagurated', 'thge', 'relgiion', 'locaated', 'joing', 'figter', 'establishedin', 'ffiliated', 'comanded', 'ingridient', 'kethup', 'ingedients', 'straweberries', 'noodels', 'souce', 'confectionary', 'igredient', 'abreviated', 'focussed', 'abbreviatedform', 'homeground', 'constructionof', 'lanka', 'locacted', 'withreference', 'prevously', 'operationg', 'aboce', 'airforce', 'identfier', 'airbas', 'aircaft', 'lqbal', 'lcolo', 'whree', 'affliate', 'inaugrated', 'ethinic', 'panyet', 'sanwich', 'origintes', 'varation', 'bakewall', 'basko', 'penjey', 'retiral', 'univesity', 'sequeled', 'predominently', 'sppoken', 'pun

In [10]:
# entries in which 'idenitifier' was flagged
ebm['idenitifier']

[Triple info: category=Airport eid=Id29
 
 	Modified triples:
 
 Adirondack_Regional_Airport | locationIdentifier | "SLK"
 
 
 	Lexicalizations:
 
 Adirondack Regional Airport location idenitifier is SLK.
 SLK is the I.D. of the Adirondack Regional Airport.
 The Adirondack Regional Airport location identifier is SLK.]

In [11]:
# entries in which 'loaction' was flagged
ebm['loaction']

[Triple info: category=Airport eid=Id100
 
 	Modified triples:
 
 Alpena_County_Regional_Airport | locationIdentifier | "APN"
 
 
 	Lexicalizations:
 
 The location identifier of Alpena County Regional airport is APN.
 The Alpena County Regional Airport's location id is "APN".
 The loaction identifier of Alpena County Regional Airport is APN.]

In [12]:
# entries in which 'whiere' was flagged
ebm['whiere']

[Triple info: category=City eid=Id795
 
 	Modified triples:
 
 Albany,_Georgia | isPartOf | Georgia_(U.S._state)
 United_States | leaderTitle | President_of_the_United_States
 United_States | demonym | Americans
 United_States | ethnicGroup | Asian_Americans
 Albany,_Georgia | country | United_States
 
 
 	Lexicalizations:
 
 The United States, whiere the inhabitants are known as Americans, is led by the President and has Asian Americans as an ethnic group. It is the location of Albany, part of the state of Georgia.
 Albany is located in the US state of Georgia, U.S. With the leader called the President of the United States there are Asian Americans as well as Americans living there.
 Americans are the people occupying the U.S., whose leader is the President of the United States, and Asian Americans are an ethnic group there. Albany is part of the state of Georgia in the United States.]

In [14]:
# flagged by both checkers, but it doesn't look like a real misspelling:
# it reads as 'sea level' written as one word
ebm['sealevel']

[Triple info: category=Airport eid=Id129
 
 	Modified triples:
 
 Appleton_International_Airport | location | Greenville,_Wisconsin
 Appleton_International_Airport | runwayLength | 1982.0
 Appleton_International_Airport | elevationAboveTheSeaLevel_(in_metres) | 280
 
 
 	Lexicalizations:
 
 Appleton International airport, which is located in Greenville, Wisconsin is located at 280 metres above sea level and a runway length of 1982.
 Appleton International airport is located in Greenville, Wisconsin,280 metres above sealevel and its runway is 1.982 km long.]

In [15]:
# entries in which 'carreer' was flagged
ebm['carreer']

[Triple info: category=Politician eid=Id1725
 
 	Modified triples:
 
 Albert_B._White | successor | William_M._O._Dawson
 Albert_B._White | birthPlace | Cleveland
 Albert_B._White | activeYearsEndDate | 1905-03-04
 Albert_B._White | activeYearsStartDate | 1901-03-04
 
 
 	Lexicalizations:
 
 Born in Cleveland, Albert B. White started his career on March 4, 1901 and finished it on March 4th 1905.
 Albert B. White was born in Cleveland and his successor was William M.O. Dawson. He started his career on March 4, 1901 and finished it on March 4th 1905.
 Albert B. White was born in Cleveland. he began his carreer on March 4, 1901 and ended it on March 4, 1905. His successor was William M. O. Dawson.]