# Spelling correction with hunspell on litkey data set

Contents:
1. Preparation
    - Imports
    - Load data set
        - **Types**
        - **Tokens**
    - Preprocessing
2. Running hunspell on litkey data set
    - **Types**
    - **Tokens**
3. Evaluation

## 1 - Preparation

In [1]:
# IMPORTS
#%autoreload 2

import re
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from tqdm.notebook import tqdm # A fast, extensible progress bar

import sys
# TODO: Use a relative path here instead of the absolute one below
# --> open an issue or find a cleaner folder structure

sys.path.insert(0, '/Users/lisaprepens/_Programmierung/bodu-spell/spelling_correction')
#sys.path.insert(0, '../../')
# sys.path

from Lisa import litkey_2
#import Lisa.litkey_2

In [2]:
# Configuration
# Plot settings: render inline figures in 'retina' format and set the default figure size / dpi
%config InlineBackend.figure_format = 'retina'
mpl.rc('figure', figsize=(8, 6), dpi=100)
# NOTE(review): sns.set() runs after mpl.rc(...) and may override some rc settings depending on
# the seaborn version -- confirm that the figure size / dpi set above still take effect.
sns.set()
sns.set_style('darkgrid')

# Register tqdm with pandas (progress bars for pandas operations such as progress_apply)
tqdm.pandas(desc="Progress so far...")

# Do not truncate rows when displaying a DataFrame (show all rows)
pd.set_option('display.max_rows', None)

  from pandas import Panel


In [None]:
#tqdm.pandas(desc='my bar')
#import numpy as np
#tqdm.pandas()
#df = pd.DataFrame(np.random.randint(0, 100, (100000, 6)))
#df.progress_apply(lambda x: x**2)

In [15]:
# LOAD LITKEY DATA SET
# The litkey data directory is addressed relative to this notebook (two levels up).
# Types: loaded with the helper's default settings.
# NOTE(review): presumably duplicates are dropped by default (cf. toss_duplicates below) -- confirm in litkey_2.load
data_error_types = litkey_2.load(litkey_data_path ='../../litkey-data/')

# Tokens: duplicate entries are kept (toss_duplicates=False)
data_error_token = litkey_2.load(litkey_data_path ='../../litkey-data/', toss_duplicates=False) 

In [5]:
# PREPROCESSING
#
# Hunspell treats words containing certain punctuation marks as two words (e.g. Seil=bahn),
# which would break the one-result-per-row alignment. Therefore "=" is replaced with "-" and
# the characters " : , ' are deleted before the words are passed to hunspell.

# Character table applied to every word: '=' -> '-', and the listed punctuation is removed.
# A translation table treats every character literally, so the result does not depend on
# pandas' (version-dependent) default for the `regex` argument of str.replace.
_HUNSPELL_CHAR_TABLE = str.maketrans('=', '-', '":,\'')


def _normalize_for_hunspell(frame):
    """Normalize the 'original' column of `frame` in place for the hunspell run.

    The untouched spelling is kept in the column 'unchanged_original'. That column is only
    created if it does not exist yet, and the normalized 'original' column is always derived
    from it. Re-running this cell therefore gives the same result and never overwrites the
    preserved spelling with already-normalized text.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a string column 'original' (the misspelled words).

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'unchanged_original' added (if missing) and
        'original' replaced by its normalized version.
    """
    if 'unchanged_original' not in frame.columns:
        frame['unchanged_original'] = frame['original']
    frame['original'] = frame['unchanged_original'].str.translate(_HUNSPELL_CHAR_TABLE)
    return frame


_normalize_for_hunspell(data_error_types)
_normalize_for_hunspell(data_error_token)

In [16]:
# Show the first 100 rows of the type-level and the token-level frame
display(data_error_types.head(100), data_error_token.head(100))

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73
2,dan,dann,01-005-2-III-Eis.csv,627,651,621
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19
8,fom,vom,01-005-2-III-Eis.csv,13,16,13
9,gawen,kaufen,01-006-2-III-Eis.csv,1,6,1


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73
2,dan,dann,01-005-2-III-Eis.csv,627,651,621
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19
8,fom,vom,01-005-2-III-Eis.csv,13,16,13
9,dan,dann,01-005-2-III-Eis.csv,627,651,621


## 2 - Running hunspell on litkey data set
###  a) Types

In [7]:
# Create csv-file containing 'original'-column (misspelled) data to be analysed with hunspell
data_error_types.original.to_csv('original_hun.csv', header=False, index=False)

# The beforementioned csv-file is input to hunspell analysis with german dictionary;
# Output of analysis is put to a txt file
# Build a file 'hunspell.txt' with corrections for each word
!cat original_hun.csv | hunspell -d de_DE > output_hun.txt
#!type original_hun.csv | hunspell -d de_DE > hunspell.txt



# Build a list (suggestions-column) of top correction suggestions from hunspell for each word, '' for correct, '?' for unrecognized
# Reading in all the lines
hs = []
with open('output_hun.txt') as f:
    next(f)
    for index, l in enumerate([l for l in [line.strip() for line in f] if l]):
        # Words recognized as correct
        # (* = dictionary stem (e. g. "man"), + = affixed forms of the following dictionary stem (e. g. "wollt" - wollen))
        # Append nothing to hs
        
        # TODO: Why minus flag?
        if re.match(r'\+|\*|-', l):
            hs.append('')
            
        # Words not recognized/rejected words (# = without suggestions)
        # Append '?' to hs
        elif re.match('#', l):
            hs.append('?')
            
        # Words not recognized/rejected words (& = with suggestions)
        # Append suggested words to hs
        else:
            hs.append(l.split(': ')[1].split(', '))

            
# Add hunspell's corrections as column to data
print(data_error_types.shape[0])
print(len(hs))

data_error_types['suggestions'] = hs

data_error_types.head(40)    # Print excerpt from DataFrame

9484
9484


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, Welt, Zelt, hebelt, bettelt]"
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]"
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,..."
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]"
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l..."
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ..."
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]"
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, Sekt]"
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, Rom, Tom, Dom, fromm]"
9,gawen,kaufen,01-006-2-III-Eis.csv,1,6,1,gawen,"[wagen, garen, gaben]"


### b) Tokens

In [8]:
# Create csv-file containing 'original'-column (misspelled) data to be analysed with hunspell
data_error_token.original.to_csv('original_hun_token.csv', header=False, index=False)

# The beforementioned csv-file is input to hunspell analysis with german dictionary;
# Output of analysis is put to a txt file
# Build a file 'hunspell.txt' with corrections for each word
!cat original_hun_token.csv | hunspell -d de_DE > output_hun_token.txt
#!type original_hun.csv | hunspell -d de_DE > hunspell.txt



# Build a list (suggestions-column) of top correction suggestions from hunspell for each word, '' for correct, '?' for unrecognized
# Reading in all the lines
hs = []
with open('output_hun_token.txt') as f:
    next(f)
    for index, l in enumerate([l for l in [line.strip() for line in f] if l]):
        # Words recognized as correct
        # (* = dictionary stem (e. g. "man"), + = affixed forms of the following dictionary stem (e. g. "wollt" - wollen))
        # Append nothing to hs
        
        # TODO: Why minus flag?
        if re.match(r'\+|\*|-', l):
            hs.append('')
            
        # Words not recognized/rejected words (# = without suggestions)
        # Append '?' to hs
        elif re.match('#', l):
            hs.append('?')
            
        # Words not recognized/rejected words (& = with suggestions)
        # Append suggested words to hs
        else:
            hs.append(l.split(': ')[1].split(', '))

            
# Add hunspell's corrections as column to data
print(data_error_token.shape[0])
print(len(hs))

data_error_token['suggestions'] = hs

data_error_token.head(40)    # Print excerpt from DataFrame

24601
24601


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, Welt, Zelt, hebelt, bettelt]"
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]"
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,..."
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]"
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l..."
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ..."
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]"
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, Sekt]"
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, Rom, Tom, Dom, fromm]"
9,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,..."


In [9]:
# Persist both frames (now including the 'suggestions' column) as pickle files,
# so the hunspell run does not have to be repeated before the evaluation.
data_error_types.to_pickle('data_error_types_hunspell_suggestions.pkl')
data_error_token.to_pickle('data_error_token_hunspell_suggestions.pkl')

## 3 - Evaluation

- **Baseline**:<br>
The baseline performance is determined by checking whether the **target word** (= _**'corrected'**_) is contained in the **dictionary** used by hunspell itself (= _**'in_dict'**_), i.e. whether hunspell can find the target word at all.


- **Furthermore, the following variables are created**:
    - Whether the target word is among the suggested words by hunspell (= _**'in_sugg'**_).
        - The index at which it appears in hunspell's list of suggestions (= _**'sugg_idx'**_).
    - Whether the target word is ranked at index 0 by hunspell (= _**'idx0'**_).
    - 'original_cs', 'corrected_cs', 'suggestions_cs': case-sensitive versions of these three columns (upper and lower case are distinguished)

In [2]:
# Restore the frames with hunspell's suggestions from the pickle files written in section 2.
# (Only unpickle files you created yourself -- loading a pickle can execute arbitrary code.)
data_error_types = pd.read_pickle('data_error_types_hunspell_suggestions.pkl')
data_error_token = pd.read_pickle('data_error_token_hunspell_suggestions.pkl')

In [23]:
# First 20 rows of the token-level and the type-level frame.
# Rows for which hunspell made no suggestion show an empty 'suggestions' cell.
display(data_error_token.head(20), data_error_types.head(20))

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, Welt, Zelt, hebelt, bettelt]"
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]"
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,..."
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]"
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l..."
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ..."
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]"
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, Sekt]"
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, Rom, Tom, Dom, fromm]"
9,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,..."


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, Welt, Zelt, hebelt, bettelt]"
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]"
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,..."
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]"
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l..."
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ..."
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]"
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, Sekt]"
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, Rom, Tom, Dom, fromm]"
9,gawen,kaufen,01-006-2-III-Eis.csv,1,6,1,gawen,"[wagen, garen, gaben]"


In [5]:
# Check if empty suggestion cell is nan
#data_error_token.info()
#data_error_token[data_error_token.suggestions.isna()]
# EMPTY CELL CONTAINS EMPTY STRING, SEE ABOVE

In [10]:
# Spot check: at which position does hunspell list the target word for row 2?
# The displayed result 2 means the target word 'dann' is the third suggestion for 'dan'.
data_error_token.suggestions.iloc[2].index(data_error_token.corrected.iloc[2])

2

In [3]:
# Evaluate hunspell's suggestions for both frames; the results are kept in new frames.
# Default call -> results stored with the suffix _lc.
# NOTE(review): presumably the default lower-cases the words/suggestions before comparing
# (the *_lc frames displayed below show lower-cased suggestions) -- confirm in litkey_2.evaluate
data_error_types_lc = litkey_2.evaluate(data_error_types)
data_error_token_lc = litkey_2.evaluate(data_error_token)

# lower_case=False -> results stored with the suffix _cs
# (presumably a case-sensitive comparison; the *_cs frames keep the suggestions' capitalization)
data_error_types_cs = litkey_2.evaluate(data_error_types, lower_case=False)
data_error_token_cs = litkey_2.evaluate(data_error_token, lower_case=False)

HBox(children=(FloatProgress(value=0.0, max=9484.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=24601.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9484.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=24601.0), HTML(value='')))

In [4]:
# Type-level evaluation: the _lc frame first, then the _cs frame
display(data_error_types_lc.head(20), data_error_types_cs.head(20))

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions,in_sugg,sugg_idx,idx0
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, welt, zelt, hebelt, bettelt]",True,1.0,False
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]",False,,False
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,...",True,2.0,False
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]",False,,False
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l...",True,1.0,False
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ...",False,,False
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]",True,0.0,True
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, sekt]",True,1.0,False
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, rom, tom, dom, fromm]",True,0.0,True
9,gawen,kaufen,01-006-2-III-Eis.csv,1,6,1,gawen,"[wagen, garen, gaben]",False,,False


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions,in_sugg,sugg_idx,idx0
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, Welt, Zelt, hebelt, bettelt]",True,1.0,False
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]",False,,False
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,...",True,2.0,False
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]",False,,False
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l...",True,1.0,False
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ...",False,,False
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]",True,0.0,True
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, Sekt]",True,1.0,False
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, Rom, Tom, Dom, fromm]",True,0.0,True
9,gawen,kaufen,01-006-2-III-Eis.csv,1,6,1,gawen,"[wagen, garen, gaben]",False,,False


In [6]:
# Token-level evaluation: the _lc frame first, then the _cs frame
display(data_error_token_lc.head(20), data_error_token_cs.head(20))

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions,in_sugg,sugg_idx,idx0
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, welt, zelt, hebelt, bettelt]",True,1.0,False
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]",False,,False
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,...",True,2.0,False
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]",False,,False
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l...",True,1.0,False
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ...",False,,False
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]",True,0.0,True
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, sekt]",True,1.0,False
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, rom, tom, dom, fromm]",True,0.0,True
9,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,...",True,2.0,False


Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,unchanged_original,suggestions,in_sugg,sugg_idx,idx0
0,belt,bellt,01-005-2-III-Eis.csv,91,138,91,belt,"[lebt, bellt, bebt, Welt, Zelt, hebelt, bettelt]",True,1.0,False
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,kukt,"[kurt, bukt]",False,,False
2,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,...",True,2.0,False
3,gekricht,gekriegt,01-005-2-III-Eis.csv,2,15,2,gekricht,"[gekracht, gekreischt, gerichtet, gerichtlich]",False,,False
4,leken,lecken,01-005-2-III-Eis.csv,14,17,14,leken,"[lenken, lecken, ekeln, lenke, lesen, legen, l...",True,1.0,False
5,felt,fällt,01-005-2-III-Eis.csv,93,198,90,felt,"[fehlt, feilt, fielt, elft, fest, fett, fegt, ...",False,,False
6,wolte,wollte,01-005-2-III-Eis.csv,173,201,173,wolte,"[wollte, walte, wolle, holte]",True,0.0,True
7,lekt,leckt,01-005-2-III-Eis.csv,20,42,19,lekt,"[lenkt, leckt, lest, legt, lebt, Sekt]",True,1.0,False
8,fom,vom,01-005-2-III-Eis.csv,13,16,13,fom,"[vom, Rom, Tom, Dom, fromm]",True,0.0,True
9,dan,dann,01-005-2-III-Eis.csv,627,651,621,dan,"[da, an, dann, dran, dank, den, das, dar, ran,...",True,2.0,False


In [7]:
# Store the four evaluated frames as pickle files so the evaluation does not have to be
# re-run every time; another notebook can load them again from these files.
_evaluation_pickles = (
    (data_error_token_lc, 'data_error_token_hunspell_evaluation_lc.pkl'),
    (data_error_types_lc, 'data_error_types_hunspell_evaluation_lc.pkl'),
    (data_error_token_cs, 'data_error_token_hunspell_evaluation_cs.pkl'),
    (data_error_types_cs, 'data_error_types_hunspell_evaluation_cs.pkl'),
)
for _evaluated, _pkl_name in _evaluation_pickles:
    _evaluated.to_pickle(_pkl_name)

In [None]:
# Reload pickle
# Load df from pickle

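#data_error_token_lc = pd.read_pickle('data_error_token_hunspell_evaluation_lc.pkl')
#data_error_types_lc = pd.read_pickle('data_error_types_hunspell_evaluation_lc.pkl')
#data_error_token_cs = pd.read_pickle('data_error_token_hunspell_evaluation_cs.pkl')
#data_error_types_cs = pd.read_pickle('data_error_types_hunspell_evaluation_cs.pkl')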