Aim: Define upper bound for language tool
- Prepare "corrected" types for analysis
    - Drop duplicates
    - Export to processable format
- Run analysis
- Postprocess results, i. e. come up with ratios

In [1]:
import pandas as pd
import sys 
import os
#print(sys.path)
#print(os.getcwd())

sys.path.insert(0, '../../Lisa')
import litkey_2

# Do not truncate columns of DataFrame
pd.set_option('display.max_rows', None)

## 1 - Prepare 'corrected' types for analysis
- case sensitive
- case insensitive

### Case sensitive

In [2]:
data_error_types = litkey_2.load(litkey_data_path = "../../litkey-data/")

In [3]:
# Drop duplicates (corrected)
data_error_types_corr = data_error_types.corrected
display(data_error_types_corr.shape[0])
data_error_types_corr.drop_duplicates(keep='first', inplace=True)
display(data_error_types_corr.shape[0])

9484

3154

In [4]:
data_error_types_corr.head(20)

0         bellt
1         kuckt
2          dann
3      gekriegt
4        lecken
5         fällt
6        wollte
7         leckt
8           vom
9        kaufen
10          ein
11         Geld
12         Mann
13        gehen
14    nachhause
15        essen
16         aber
17        geben
21         will
22         gibt
Name: corrected, dtype: object

In [5]:
# Export
#data_error_types_corr.to_csv("data_error_types_corrected_no_duplicates_UB.txt", header=False, index=False)

### Case insensitive
XXX

In [7]:
#data_error_types_lc = litkey_2.load(litkey_data_path = "../../litkey-data/", lower_case=True)

In [9]:
#data_error_types_lc.shape[0]

8594

❗️❗️❗️
- As done with Hunspell. Titlecase all corrected words, use types case sensitive as base
- ( This is because of reconstruction purpose for evaluation!!! )

❗️❗️❗️

In [51]:
# To merge with
data_error_types.original = data_error_types.original.str.title()
data_error_types.corrected = data_error_types.corrected.str.title()

#data_error_types_corr_CI = data_error_types_corr.str.title()

In [52]:
data_error_types.head(10)

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup
0,Belt,Bellt,01-005-2-III-Eis.csv,91,138,91
1,Kukt,Kuckt,01-005-2-III-Eis.csv,73,152,73
2,Dan,Dann,01-005-2-III-Eis.csv,627,651,621
3,Gekricht,Gekriegt,01-005-2-III-Eis.csv,2,15,2
4,Leken,Lecken,01-005-2-III-Eis.csv,14,17,14
5,Felt,Fällt,01-005-2-III-Eis.csv,93,198,90
6,Wolte,Wollte,01-005-2-III-Eis.csv,173,201,173
7,Lekt,Leckt,01-005-2-III-Eis.csv,20,42,19
8,Fom,Vom,01-005-2-III-Eis.csv,13,16,13
9,Gawen,Kaufen,01-006-2-III-Eis.csv,1,6,1


In [53]:
data_error_types_corr = data_error_types.corrected
display(data_error_types_corr.shape[0])
data_error_types_corr.drop_duplicates(keep='first', inplace=True)
display(data_error_types_corr.shape[0])

9484

2960

In [16]:
# Drop duplicates (corrected)
#data_error_types_lc_corr = data_error_types_lc.corrected
#display(data_error_types_lc_corr.shape[0])
#data_error_types_corr_CI.drop_duplicates(keep='first', inplace=True)
#display(data_error_types_corr_CI.shape[0])

2960

In [17]:
# Export
#data_error_types_corr_CI.to_csv("data_error_types__corrected_CI_no_duplicates_UB.txt", header=False, index=False)

## 2 - Run analysis

In [18]:
import language_tool_python

In [19]:
# Initialize language tool object
tool = language_tool_python.LanguageTool('de-DE')

In [20]:
# Open and read text file
# readlines method returning list of strings

#with open('data_error_types_corrected_no_duplicates_UB.txt', 'r') as f:
with open('data_error_types__corrected_CI_no_duplicates_UB.txt', 'r') as f:
    lines_n = f.readlines()

In [21]:
# Cut off linebreak (\n) at end of each String
# Otherwise each String is interpreted as sentence start (error: 'UPPERCASE_SENTENCE_START'), possibly bc of linebreak
lines = [e.split('\n')[0].strip() for e in lines_n]

In [22]:
# Take one line each (i. e. one original word) and generate list of matches
# Append each match(-list) to list of suggestions

suggestions = []
for line in lines:
    
    # If any suggestion, append
    if len(tool.check(line))>0:
        suggestions.append(tool.check(line))
        
    # If no suggestion (i. e. correct word), append word (To merge back after)
    elif len(tool.check(line))==0:
        suggestions.append(line)

In [23]:
suggestions
type(suggestions[0]) == str # type str
#type(suggestions[1]) == list # type list
#suggestions[1][0]

True

In [32]:
len(suggestions[0])
#suggestions[1]

5

In [24]:
# Filter out 0 suggetstions
suggestions_1 = [s for s in suggestions if len(s)>0]
print(len(suggestions_1)) # 221/3154 # 2960

2960


In [46]:
# Have a look at errors
#for s in suggestions_1:
#    print(s[0].sentence, s[0].ruleId, s[0].replacements)

## 3 - Postprocess results
- Transform into right format
    - Reconstruct for types
    - Reconstruct for token
- Ratios
    - Types
    - Token

In [33]:
# Extract mistakes and corrections
my_mistakes = []
my_corrections = []

for line in suggestions:
    if type(line) == list:
        my_mistakes.append(line[0].sentence)
        my_corrections.append(line[0].replacements)
    elif type(line) == str:
        my_mistakes.append(line)
        my_corrections.append('')

In [34]:
df_prep = list(zip(my_mistakes,my_corrections))
df = pd.DataFrame(data=df_prep, columns=['corrected', 'suggestions'])

In [47]:
df.head(10)

Unnamed: 0,corrected,suggestions
0,Bellt,
1,Kuckt,"[Rückt, Guckt, Zuckt, Zückt, Knickt, Bückt, Gl..."
2,Dann,
3,Gekriegt,
4,Lecken,
5,Fällt,
6,Wollte,
7,Leckt,
8,Vom,
9,Kaufen,


In [35]:
# Serialize
#df.to_pickle('/pickles/language_tool_analysis_corrected_suggestions_upper_bound.pkl')
df.to_pickle('./pickles/language_tool_analysis_corrected_suggestions_upper_bound_case_insensitive.pkl')

In [113]:
# Nan value is type float
# data_error_types_suggs[data_error_types_suggs.suggestions.apply(type) == float] # 9186, 9189
# type(df.suggestions[9186])
#df.suggestions.fillna('', inplace=True)

In [54]:
data_error_types.head(10)

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup
0,Belt,Bellt,01-005-2-III-Eis.csv,91,138,91
1,Kukt,Kuckt,01-005-2-III-Eis.csv,73,152,73
2,Dan,Dann,01-005-2-III-Eis.csv,627,651,621
3,Gekricht,Gekriegt,01-005-2-III-Eis.csv,2,15,2
4,Leken,Lecken,01-005-2-III-Eis.csv,14,17,14
5,Felt,Fällt,01-005-2-III-Eis.csv,93,198,90
6,Wolte,Wollte,01-005-2-III-Eis.csv,173,201,173
7,Lekt,Leckt,01-005-2-III-Eis.csv,20,42,19
8,Fom,Vom,01-005-2-III-Eis.csv,13,16,13
9,Gawen,Kaufen,01-006-2-III-Eis.csv,1,6,1


In [55]:
# Reconstruct original df (data_error_types); TYPES
data_error_types_suggs = pd.merge(data_error_types, df, how='left', on='corrected')

In [56]:
data_error_types_suggs.head(10)

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,suggestions
0,Belt,Bellt,01-005-2-III-Eis.csv,91,138,91,
1,Kukt,Kuckt,01-005-2-III-Eis.csv,73,152,73,"[Rückt, Guckt, Zuckt, Zückt, Knickt, Bückt, Gl..."
2,Dan,Dann,01-005-2-III-Eis.csv,627,651,621,
3,Gekricht,Gekriegt,01-005-2-III-Eis.csv,2,15,2,
4,Leken,Lecken,01-005-2-III-Eis.csv,14,17,14,
5,Felt,Fällt,01-005-2-III-Eis.csv,93,198,90,
6,Wolte,Wollte,01-005-2-III-Eis.csv,173,201,173,
7,Lekt,Leckt,01-005-2-III-Eis.csv,20,42,19,
8,Fom,Vom,01-005-2-III-Eis.csv,13,16,13,
9,Gawen,Kaufen,01-006-2-III-Eis.csv,1,6,1,


In [57]:
# Fillna
# Nan value is type float
# data_error_types_suggs[data_error_types_suggs.suggestions.apply(type) == float] # 9186, 9189
# type(df.suggestions[9186])
data_error_types_suggs.suggestions.fillna('', inplace=True)

In [58]:
# Reconstruct original df; TOKEN
data_error_token = litkey_2.load(litkey_data_path = "../../litkey-data/", toss_duplicates=False)
data_error_token.original = data_error_token.original.str.title()
data_error_token.corrected = data_error_token.corrected.str.title()

In [59]:
data_error_token_suggs = pd.merge(data_error_token, df, how='left', on='corrected')

In [60]:
# Fillna
data_error_token_suggs.suggestions.fillna('', inplace=True)

In [61]:
#data_error_types_suggs.shape

(9484, 7)

### Get ratios (upper bound)
- Types
- Token

In [109]:
# Types (case sensitive)
ct_not_right = data_error_types_suggs[data_error_types_suggs.suggestions.apply(len).gt(0)].shape[0] # check for length greater than 0, i. e. list not empty
ct_right = data_error_types_suggs[~data_error_types_suggs.suggestions.apply(len).gt(0)].shape[0] # not right is either false or not recognized
display('Language Tool recognizes as not correct...', data_error_types_suggs[data_error_types_suggs.suggestions.apply(len).gt(0)].head(20))

'Language Tool recognizes as not correct...'

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,suggestions
1,kukt,kuckt,01-005-2-III-Eis.csv,73,152,73,"[guckt, rückt, knickt, glückt, zuckt, zückt, b..."
63,runtergefalen,runtergefallen,01-029-2-III-Eis.csv,12,28,12,[heruntergefallen]
79,rich,riech,01-045-2-III-Eis.csv,2,2,2,"[Sieg, reich, riecht, bieg, lieg, rieche, siec..."
233,Eerber,Erdbeer,01-122-2-III-Eis.csv,1,3,1,"[Erdbeere, Erdmeer, Erdheer, Erdteer, Erdiger,..."
237,sis,sies,01-122-2-III-Eis.csv,2,1,1,"[sie, dies, ließ, hieß, wies, Kies, Ries, lies..."
259,vars,wars,01-127-2-III-Eis.csv,1,2,1,"[war's, war es, warst, wärs, war, was, März, w..."
260,Trupf,tropf,01-128-2-III-Eis.csv,1,1,1,"[Tropf, traf, tropft, Kropf, doof, triff, trof..."
261,Schupfs,schups,01-128-2-III-Eis.csv,1,1,1,"[schubs, Schubs, schubse, schubst, schwups, sc..."
267,runterfale,runterfallen,01-128-2-III-Eis.csv,1,2,1,[herunterfallen]
308,rumtergefalen,runtergefallen,01-139-2-III-Eis.csv,1,28,1,[heruntergefallen]


In [110]:
display(ct_not_right)
display(ct_right)
display(data_error_types_suggs.shape[0])

ratio = ct_right/data_error_types_suggs.shape[0]
print('New upper bound for types is', round(ratio,4)*100,'%')
print(ratio)

335

9149

9484

New upper bound for types is 96.47 %
0.9646773513285534


In [62]:
# Output for Token case sensitive overwritten with output below
# Token (case insensitive)
ct_not_right_2 = data_error_token_suggs[data_error_token_suggs.suggestions.apply(len).gt(0)].shape[0] # check for length greater than 0, i. e. list not empty
ct_right_2 = data_error_token_suggs[~data_error_token_suggs.suggestions.apply(len).gt(0)].shape[0] # not right is either false or not recognized
display('Language Tool recognizes as not correct...', data_error_token_suggs[data_error_token_suggs.suggestions.apply(len).gt(0)].head(20))

'Language Tool recognizes as not correct...'

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,suggestions
1,Kukt,Kuckt,01-005-2-III-Eis.csv,73,152,73,"[Rückt, Guckt, Zuckt, Zückt, Knickt, Bückt, Gl..."
85,Runtergefalen,Runtergefallen,01-029-2-III-Eis.csv,12,28,12,[Heruntergefallen]
105,Rich,Riech,01-045-2-III-Eis.csv,2,2,2,"[Sieg, Riege, Reich, Rigg, Riecht, Bieg, Rieke..."
179,Runtergefalen,Runtergefallen,01-063-2-III-Eis.csv,12,28,12,[Heruntergefallen]
309,Runtergefalen,Runtergefallen,01-108-2-III-Eis.csv,12,28,12,[Heruntergefallen]
363,Eerber,Erdbeer,01-122-2-III-Eis.csv,1,3,1,"[Erdbeere, Erdmeer, Erdheer, Erdteer, Erdiger,..."
369,Sis,Sies,01-122-2-III-Eis.csv,2,1,1,"[Dies, Sie, Ließ, Sitz, Hieß, Sieg, Wies, Kies..."
403,Vars,Wars,01-127-2-III-Eis.csv,1,2,1,"[März, Wärs, Harz, War, Mars, Was, Lars, Walz,..."
405,Schupfs,Schups,01-128-2-III-Eis.csv,1,1,1,"[Schubs, Schutz, Schulz, Schuss, Schubse, Schu..."
411,Runterfale,Runterfallen,01-128-2-III-Eis.csv,1,2,1,[Herunterfallen]


In [64]:
# Token CASE INSENSITIVE
display(ct_not_right_2)
display(ct_right_2)
display(data_error_token_suggs.shape[0])

ratio_2 = ct_right_2/data_error_token_suggs.shape[0]
print('New upper bound for types is', round(ratio_2,4)*100,'%')
print(ratio_2)

606

23995

24601

New upper bound for types is 97.54 %
0.975366855005894


In [115]:
# Token, case sensitive
display(ct_not_right_2)
display(ct_right_2)
display(data_error_token_suggs.shape[0])

ratio_2 = ct_right_2/data_error_token_suggs.shape[0]
print('New upper bound for types is', round(ratio_2,4)*100,'%')
print(ratio_2)

620

23981

24601

New upper bound for types is 97.48 %
0.9747977724482745


In [39]:
# Types CASE INSENSITIVE
# Types
ct_not_right = data_error_types_suggs[data_error_types_suggs.suggestions.apply(len).gt(0)].shape[0] # check for length greater than 0, i. e. list not empty
ct_right = data_error_types_suggs[~data_error_types_suggs.suggestions.apply(len).gt(0)].shape[0] # not right is either false or not recognized
display('Language Tool recognizes as not correct...', data_error_types_suggs[data_error_types_suggs.suggestions.apply(len).gt(0)].head(20))

'Language Tool recognizes as not correct...'

Unnamed: 0,original,corrected,filename,freq_ori,freq_cor,freq_tup,suggestions
233,Eerber,Erdbeer,01-122-2-III-Eis.csv,1,3,1,"[Erdbeere, Erdmeer, Erdheer, Erdteer, Erdiger,..."
518,Schockokugel,Schokokugel,01-265-2-III-Eis.csv,1,1,1,"[Schoko kugel, Schoko Kugel, Schokokuchen, Gol..."
542,ertber,Erdbeer,01-275-2-III-Eis.csv,1,3,1,"[Erdbeere, Erdmeer, Erdheer, Erdteer, Erdiger,..."
547,Etber,Erdbeer,01-277-2-III-Eis.csv,1,3,1,"[Erdbeere, Erdmeer, Erdheer, Erdteer, Erdiger,..."
548,blauber,Blaubeer,01-277-2-III-Eis.csv,1,1,1,"[Blaubär, Blaumeer, Blaubeere, Blauer, Blauhee..."
856,HodogStand,Hotdogstand,01-493-2-III-Eis.csv,2,2,2,[Hotdog stand]
1359,Te,The,02-218-2-IV-Weg.csv,1,7,1,"[TV, Die, Ehe, De, TC, Tee, Tue, Theo, Che, TB..."
1522,vermiszetel,Vermisstzettel,02-289-2-IV-Weg.csv,1,1,1,"[Vermisst zettel, Vermisst Zettel]"
1650,freßchen,Fresschen,02-342-2-IV-Weg.csv,1,1,1,"[Prägen, Versuchen, Kreischen, Fresken, Fresse..."
1686,Dindon,Dingdong,02-371-2-IV-Weg.csv,1,1,1,"[Pingpong, Sington]"


In [65]:
display(ct_not_right)
display(ct_right)
display(data_error_types_suggs.shape[0])

ratio = ct_right/data_error_types_suggs.shape[0]
print('New upper bound for types (case insensitive) is', round(ratio,4)*100,'%')
print(ratio)

118

9366

9484

New upper bound for types (case insensitive) is 98.76 %
0.9875579924082666
