In the notebook _hunspell-analysis_ it was observed that some target words are generally NOT in hunspell's dictionary. Nevertheless, hunspell makes suggestions for the corresponding misspellings, and the target word does appear in the list of suggestions anyway.

Therefore it has been decided not to compare the target words against the hunspell dictionary to extract the upper bound, **but to feed all target words into hunspell and see which ones are going to be predicted correctly**.

The code is simply copied from the _hunspell_run_evaluation_ notebook.

In [1]:
# IMPORTS
#%autoreload 2

import re
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from tqdm.notebook import tqdm # A fast, extensible progress bar

import os
import sys

# Make the project-local `Lisa` package importable.
# The checkout location is machine-specific, so it can be overridden through the
# SPELLING_CORRECTION_PATH environment variable. The previously hard-coded path
# is kept as the fallback so existing setups keep working unchanged.
# TODO: Build in a relative path
# --> open an issue or come up with a more elegant folder structure
SPELLING_CORRECTION_PATH = os.environ.get(
    'SPELLING_CORRECTION_PATH',
    '/Users/lisaprepens/_Programmierung/bodu-spell/spelling_correction',
)

# Guard against piling up duplicate entries when the cell is re-run
if SPELLING_CORRECTION_PATH not in sys.path:
    sys.path.insert(0, SPELLING_CORRECTION_PATH)
#sys.path.insert(0, '../../')
# sys.path

from Lisa import litkey_2
#import Lisa.litkey_2


# CONFIGURATION
# Render inline figures in 'retina' format and set the default figure size / dpi
%config InlineBackend.figure_format = 'retina'
mpl.rc('figure', figsize=(8, 6), dpi=100)
# Apply seaborn's defaults, then select the 'darkgrid' style
sns.set()
sns.set_style('darkgrid')

# Set tqdm on pandas (progress bars are labelled with the given description)
tqdm.pandas(desc="Progress so far...")

# Do not truncate the ROWS of a displayed DataFrame
# (note: 'display.max_rows' lifts the row limit; it does not affect columns)
pd.set_option('display.max_rows', None)

ModuleNotFoundError: No module named 'Lisa'

In [2]:
# LOAD LITKEY DATA SET
# The data sets are read in CASE SENSITIVE.
# Two frames are loaded from the same directory:
#   - data_error_types: loaded with the loader's default settings
#     (presumably duplicates are tossed, i.e. types -- TODO confirm in litkey_2.load)
#   - data_error_token: loaded with toss_duplicates=False
litkey_dir = '../../litkey-data/'

data_error_types = litkey_2.load(litkey_data_path=litkey_dir)
data_error_token = litkey_2.load(litkey_data_path=litkey_dir, toss_duplicates=False)

In [3]:
# PREPROCESSING
# Hunspell treats words containing certain punctuation marks as two words
# (e.g. Seil=bahn), which would break the alignment between the input rows and
# hunspell's output lines. Therefore "=" is replaced with "-" and the other
# marks are deleted before the words are analysed with hunspell.

def _strip_hunspell_separators(words):
    """Return `words` with "=" replaced by "-" and the marks " : , ' removed.

    Parameters
    ----------
    words : pandas.Series
        Series of strings (the target words).

    Returns
    -------
    pandas.Series
        A new Series; the input is not modified.

    All replacements are literal (regex=False), so the result does not depend
    on the installed pandas version's default for `regex`.
    """
    cleaned = words.str.replace('=', '-', regex=False)
    for mark in ('"', ':', ',', "'"):
        cleaned = cleaned.str.replace(mark, '', regex=False)
    return cleaned


for frame in (data_error_types, data_error_token):
    # Keep the untouched target word next to the cleaned one
    frame['unchanged_corrected'] = frame['corrected']
    frame['corrected'] = _strip_hunspell_separators(frame['corrected'])

### a) Types

In [4]:
# RUN HUNSPELL
# Create csv-file containing 'CORRECTED'-column (target) data to be analysed with hunspell
data_error_types.corrected.to_csv('corrected_hun.csv', header=False, index=False)

# The beforementioned csv-file is input to hunspell analysis with german dictionary;
# Output of analysis is put to a txt file
# Build a file 'hunspell.txt' with corrections for each word
!cat corrected_hun.csv | hunspell -d de_DE > output_corrected_hun.txt
#!type original_hun.csv | hunspell -d de_DE > hunspell.txt



# Build a list (suggestions-column) of top correction suggestions from hunspell for each word, '' for correct, '?' for unrecognized
# Reading in all the lines
hs = []
with open('output_corrected_hun.txt') as f:
    next(f)
    for index, l in enumerate([l for l in [line.strip() for line in f] if l]):
        # Words recognized as correct
        # (* = dictionary stem (e. g. "man"), + = affixed forms of the following dictionary stem (e. g. "wollt" - wollen))
        # Append nothing to hs
        
        # TODO: Why minus flag?
        if re.match(r'\+|\*|-', l):
            hs.append('')
            
        # Words not recognized/rejected words (# = without suggestions)
        # Append '?' to hs
        elif re.match('#', l):
            hs.append('?')
            
        # Words not recognized/rejected words (& = with suggestions)
        # Append suggested words to hs
        else:
            hs.append(l.split(': ')[1].split(', '))

            
# Add hunspell's corrections as column to data
print(data_error_types.shape[0])
print(len(hs))

data_error_types['suggestions'] = hs

data_error_types.head(40)    # Print excerpt from DataFrame

9484
9484


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_corrected,suggestions
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,bellt,
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kuckt,"[juckt, kickt, duckt, guckt, zuckt]"
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dann,
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekriegt,
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,lecken,
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,fällt,
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wollte,
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,leckt,
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,vom,
9,gawen,kaufen,01-006-2-III-Eis.csv,1,6,1,kaufen,


### b) Token

In [5]:
# Create csv-file containing 'CORRECTED'-column (target) data to be analysed with hunspell
data_error_token.corrected.to_csv('corrected_hun_token.csv', header=False, index=False)

# The beforementioned csv-file is input to hunspell analysis with german dictionary;
# Output of analysis is put to a txt file
# Build a file 'hunspell.txt' with corrections for each word
!cat corrected_hun_token.csv | hunspell -d de_DE > output_corrected_hun_token.txt
#!type original_hun.csv | hunspell -d de_DE > hunspell.txt



# Build a list (suggestions-column) of top correction suggestions from hunspell for each word, '' for correct, '?' for unrecognized
# Reading in all the lines
hs = []
with open('output_corrected_hun_token.txt') as f:
    next(f)
    for index, l in enumerate([l for l in [line.strip() for line in f] if l]):
        # Words recognized as correct
        # (* = dictionary stem (e. g. "man"), + = affixed forms of the following dictionary stem (e. g. "wollt" - wollen))
        # Append nothing to hs
        
        # TODO: Why minus flag?
        if re.match(r'\+|\*|-', l):
            hs.append('')
            
        # Words not recognized/rejected words (# = without suggestions)
        # Append '?' to hs
        elif re.match('#', l):
            hs.append('?')
            
        # Words not recognized/rejected words (& = with suggestions)
        # Append suggested words to hs
        else:
            hs.append(l.split(': ')[1].split(', '))

            
# Add hunspell's corrections as column to data
print(data_error_token.shape[0])
print(len(hs))

data_error_token['suggestions'] = hs

data_error_token.head(40)    # Print excerpt from DataFrame

24601
24601


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_corrected,suggestions
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,bellt,
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kuckt,"[juckt, kickt, duckt, guckt, zuckt]"
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dann,
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekriegt,
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,lecken,
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,fällt,
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wollte,
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,leckt,
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,vom,
9,dan,dann,01-005-2-III-Eis.csv,627,651,621,dann,


In [6]:
# Preview the first 100 rows of both frames.
# A notebook cell only renders its last bare expression, so the types preview
# was silently dropped before; display() renders both explicitly.
display(data_error_types.head(100), data_error_token.head(100))

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_corrected,suggestions
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,bellt,
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kuckt,"[juckt, kickt, duckt, guckt, zuckt]"
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dann,
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekriegt,
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,lecken,
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,fällt,
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wollte,
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,leckt,
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,vom,
9,dan,dann,01-005-2-III-Eis.csv,627,651,621,dann,


In [7]:
# Serialize dataframe, so that analysis does not have to be run every single time
# Dataframe can then be deserialized in another notebook
#data_error_types.to_pickle('target_error_types_hunspell_evaluation.pkl')
#data_error_token.to_pickle('target_error_token_hunspell_evaluation.pkl')

## Get upper bound
_Here: For case insensitive_ <br>
There are 3 cases to be distinguished:
- not recognized
    - Hunspell flag = '#', i.e. the string '?' is stored as the suggestions entry
- recognized as right
    - Hunspell flag = '+', '*' or '-', i.e. an empty string '' is stored (see above)
- recognized as wrong
    - Hunspell flag = '&', i.e. the entry is the list of suggestions

### a) Types

In [3]:
# Deserialize the previously pickled evaluation frames
# (same file names as in the commented-out to_pickle calls of the serialization cell)
types_pickle = 'target_error_types_hunspell_evaluation.pkl'
token_pickle = 'target_error_token_hunspell_evaluation.pkl'

data_error_types = pd.read_pickle(types_pickle)
data_error_token = pd.read_pickle(token_pickle)

In [9]:
# NOT RECOGNIZED

# Where is "Erdbeerlollipop"?
# Can be found at "erdbeerlollipop" since lowercasing in evaluation function (corrected)
# or in column 'corrected_cs'

#display(data_error_types[data_error_types.corrected == 'erdbeerlollipop'])
#display(data_error_types[data_error_types.corrected_cs == 'Erdbeerlollipop'])

In [6]:
# RECOGNIZED AS RIGHT / NOT RIGHT (wrong or not recognized)

# An empty 'suggestions' entry (length 0) means the word was recognized as correct.
# A non-empty entry means it was NOT recognized as correct; this covers both
# "wrong" (suggestions present) and "not recognized" entries.
has_suggestions = data_error_types['suggestions'].map(len) > 0
rejected_types = data_error_types[has_suggestions]

ct_hun_not_right = rejected_types.shape[0]
ct_hun_right = data_error_types[~has_suggestions].shape[0]
display('Hunspell recognizes as not correct...', rejected_types.head(20))

'Hunspell recognizes as not correct...'

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_corrected,suggestions
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kuckt,"[juckt, kickt, duckt, guckt, zuckt]"
14,nahaise,nachhause,01-006-2-III-Eis.csv,1,10,1,nachhause,"[nach hause, nach-hause, nachschaue]"
28,weitagegangen,weitergegangen,01-025-2-III-Eis.csv,1,1,1,weitergegangen,"[weiter gegangen, weiter-gegangen, weitergegeb..."
63,runtergefalen,runtergefallen,01-029-2-III-Eis.csv,12,28,12,runtergefallen,"[runter gefallen, runter-gefallen, -untergefal..."
77,wegetan,wehgetan,01-045-2-III-Eis.csv,11,17,11,wehgetan,"[weggetan, weh getan, weh-getan, angeweht]"
79,rich,riech,01-045-2-III-Eis.csv,2,2,2,riech,"[reich, rieche, riecht, siech]"
84,Nachause,nachhause,01-049-2-III-Eis.csv,1,10,1,nachhause,"[nach hause, nach-hause, nachschaue]"
103,schtoiper,stolper,01-057-2-III-Eis.csv,1,1,1,stolper,"[stolpre, stolpere, stolpern, stolpert, stolpe..."
146,pas,pass,01-065-2-III-Eis.csv,8,10,8,pass,"[Pass, passe, passt, passé, pass-, -pass, nass..."
221,cemast,zermatscht,01-113-2-III-Eis.csv,1,1,1,zermatscht,[schmatze]


In [11]:
# Show the counts: not accepted, accepted, total
display(ct_hun_not_right)
display(ct_hun_right)
display(data_error_types.shape[0])

# Upper bound = share of target words hunspell accepts as correct
ratio = ct_hun_right/data_error_types.shape[0]
# Scale to percent first, then round: rounding before multiplying by 100 can
# print float artefacts (e.g. 56.779999999999994 instead of 56.78)
print('New upper bound for types is', round(ratio * 100, 2),'%')
print(ratio)

718

8766

9484

New upper bound for types is 92.43 %
0.9242935470265711


### b) Token

In [4]:
# A non-empty 'suggestions' entry means hunspell did not accept the target word.
# The mask is computed once and reused for both counts.
token_has_suggestions = data_error_token.suggestions.apply(len).gt(0)
ct_hun_not_right_token = data_error_token[token_has_suggestions].shape[0]
ct_hun_right_token = data_error_token[~token_has_suggestions].shape[0]

# Show the counts: not accepted, accepted, total
display(ct_hun_not_right_token)
display(ct_hun_right_token)
display(data_error_token.shape[0])

# Upper bound = share of target tokens hunspell accepts as correct
ratio_token = ct_hun_right_token/data_error_token.shape[0]
# Scale to percent first, then round: rounding before multiplying by 100 can
# print float artefacts (e.g. 56.779999999999994 instead of 56.78)
print('New upper bound for tokens is', round(ratio_token * 100, 2),'%')
print(ratio_token)

1458

23143

24601

New upper bound for tokens is 94.07 %
0.9407341164993293


***

In [7]:
# Number of token rows whose 'suggestions' entry is non-empty
data_error_token.loc[data_error_token['suggestions'].map(len) > 0].shape[0]

1458

In [8]:
# Keep only the token rows whose 'suggestions' entry is non-empty
token_rejected_mask = data_error_token['suggestions'].map(len) > 0
hun_not_right_token = data_error_token[token_rejected_mask]

In [9]:
# Serialize the rows with a non-empty 'suggestions' entry (hun_not_right_token)
# to a pickle file so they can be reloaded without re-running the analysis
hun_not_right_token.to_pickle('df_hunspell_ub_not_right_token_case_sensitive.pkl')