In this notebook, the list of misspellings (i.e. data_error_types) is analysed with LanguageTool, each word separately. This results in an analysis comparable to those of the other spell-checkers (hunspell, boyd).

In [1]:
import pickle

import language_tool_python
import pandas as pd

In [2]:
# Create the LanguageTool checker object for the language code 'de-DE'
LANGUAGE_CODE = 'de-DE'
tool = language_tool_python.LanguageTool(LANGUAGE_CODE)

In [4]:
# Read the list of original (misspelled) words from the text file.
# readlines() returns a list of strings and keeps the line breaks.
# The encoding is passed explicitly because the words contain German
# umlauts and the platform default encoding is not the same everywhere.
# NOTE(review): assumes the input file is stored as UTF-8 — confirm.
with open('Input/data_error_types_origs.txt', 'r', encoding='utf-8') as f:
    lines_n = f.readlines()

# lines_n[1] --> 'kukt\n'
# lines_n[9483] --> 'momen\n'; 'UPPERCASE_SENTENCE_START'

In [5]:
# Keep only the text before the line break (\n) of every entry and trim
# surrounding whitespace.
# With the line break left in, each string is reported as a sentence start
# (error: 'UPPERCASE_SENTENCE_START'), possibly because of the line break.
lines = [raw.partition('\n')[0].strip() for raw in lines_n]

In [13]:
#tool.check(lines[9483])

In [7]:
# Check every original word on its own; each call yields that word's list of
# matches. The result holds one match list per word, in input order.
suggestions = [tool.check(word) for word in lines]

In [8]:
# Number of checked words: 9484 / 9484
print(len(suggestions))

# Some match lists are empty, i.e. nothing was reported for that word.
# Keep only the non-empty match lists.
suggestions_1 = [matches for matches in suggestions if matches]
len(suggestions_1) # 8771 / 9484

9484


8771

In [16]:
# Persist the complete list of match lists (including the empty ones) to disk.
# `pickle` is imported in the import cell at the top of the notebook.
with open('matches_errorlist.pkl', 'wb') as f:
    # The pickle protocol is pinned explicitly to version 3
    pickle.dump(suggestions, f, protocol=3)

In [86]:
#for s in suggestions_1:
#    print(s[0].ruleId)
#suggestions_1.value_counts()
#sugestions_1[i][0].replacements

In [9]:
# Keep the entries whose first match was raised by the German spelling rule
suggestions_1_spell = list(
    filter(lambda matches: matches[0].ruleId == 'GERMAN_SPELLER_RULE', suggestions_1)
)
len(suggestions_1_spell) # 8725

8725

In [10]:
# Keep the entries whose first match was raised by any rule other than the
# German spelling rule
suggestions_1_other = [
    matches
    for matches in suggestions_1
    if not matches[0].ruleId == 'GERMAN_SPELLER_RULE'
]
len(suggestions_1_other) # 46

46

In [11]:
# Inspect the non-spelling entries: checked text, rule id and replacements
# of the first match of each entry
for matches in suggestions_1_other:
    first = matches[0]
    print(first.sentence, first.ruleId, first.replacements)

lasen LASEN_LASSEN ['Lassen']
halo HALO_HALLO ['Hallo']
Weder WEDER_OHNE_NOCH []
paster's UPPERCASE_SENTENCE_START ['Paster']
Halo HALO_HALLO ['Hallo']
weder WEDER_OHNE_NOCH []
"Freund,in" TYPOGRAFISCHE_ANFUEHRUNGSZEICHEN ['„']
halo HALO_HALLO ['Hallo']
rase= UPPERCASE_SENTENCE_START ['Rase']
bes=chlosen UPPERCASE_SENTENCE_START ['Bes']
gluglis=ch UPPERCASE_SENTENCE_START ['Gluglis']
get's UPPERCASE_SENTENCE_START ['Get']
wenigsten WENIGSTEN_VS_WENIGSTENS ['wenigstens', 'zumindest']
"lost,läst" TYPOGRAFISCHE_ANFUEHRUNGSZEICHEN ['„']
schaf=en UPPERCASE_SENTENCE_START ['Schaf']
sonder SONDER ['sondern']
weg.gerant UPPERCASE_SENTENCE_START ['Weg']
staupsauger=beutel UPPERCASE_SENTENCE_START ['Staupsauger']
weiss WEISS ['weiß']
dm PRP_DM ['dem']
telefphoniert TEST_F_ANSTATT_PH ['teleffoniert']
Telephon TEST_F_ANSTATT_PH ['Telefon']
liebling'sfutter UPPERCASE_SENTENCE_START ['Liebling']
ko KO ['K.\xa0o.', 'k.\xa0o.']
'nen TYPOGRAFISCHE_ANFUEHRUNGSZEICHEN ['’', '‚', '‘', '′']
gesprun.gen UPP

In [17]:
# Collect, from the first match of every non-empty match list, the checked
# text (mistake), the proposed replacements (corrections) and the rule id.
first_matches = [matches[0] for matches in suggestions_1]

my_mistakes = [match.sentence for match in first_matches]
my_corrections = [match.replacements for match in first_matches]
rule_id = [match.ruleId for match in first_matches]

In [18]:
# Combine the three parallel lists into a list of
# (mistake, corrections, rule id) tuples
df_prep = [
    (mistake, corrections, rule)
    for mistake, corrections, rule in zip(my_mistakes, my_corrections, rule_id)
]

In [19]:
# Build the data frame from the prepared tuples and show the first two rows
column_names = ['original', 'suggestions', 'rule_id']
df = pd.DataFrame(df_prep, columns=column_names)
df.head(2)

Unnamed: 0,original,suggestions,rule_id
0,belt,"[Belt, Welt, bald, hält, Geld, Feld, Held, Zel...",GERMAN_SPELLER_RULE
1,kukt,"[rückt, zückt, bückt, guckt, kickt, kackt, gut...",GERMAN_SPELLER_RULE


In [20]:
# Export
#df.to_pickle('./data_error_types_language_tool_RULEID.pkl')

In [85]:
# Number of rows in the data frame.
# NA-values will be merged back in postprocessing process, resulting in 9484 values
len(df.index)

8771