- Run through on server with script language_tool_texts.py (**_before_**)
- <span style='background:yellow'> **Important note: Since errors have been analyzed within a full text, these are only analysed on token level. No types are to be reconstructed!** </span>

1. Read in results from server (**_preparation_**)
2. Extract misspellings (originals) + replacements (suggestions)
3. Get reference df to merge with
    - Load original texts
        - Preprocess as before
        - Get offsets
4. Merge back to original df
    - data_error_types
    - data_error_token
5. Drop words that are not examined in analysis
    - Remove words/characters that are not desired in analysis
    - Drop obsolete columns
    
    
- EVALUATION
    - TOKEN ONLY
    - Case in- and -sensitive

In [1]:
import pandas as pd
import pickle
import glob
import csv
import os

import sys
sys.path.insert(0, '../../Lisa')
import litkey_2
import helpers_litkey_2 as helpers

import helpers_lt

In [2]:
# Do not truncate rows of DataFrame
pd.set_option('display.max_rows', None)
# Do not truncate column width
pd.set_option('display.max_colwidth', None)

## 1 - Read in results from server

In [3]:
x = pd.read_pickle('./Output_LT/matches_texts.pkl') # list

In [4]:
# Multidimensional list
# Access attribute of element/match x[0][0].replacements

print(len(x)) # 1922 texts
print(len(x[1000])) # 20 matches in text 1000
print(type(x[1000])) # list
print(type(x[1000][19])) # language_tool_python.match.Match

1922
22
<class 'list'>
<class 'language_tool_python.match.Match'>


In [5]:
x[1000][19].errorLength

4

In [6]:
#x[0:1]

## 2 - Extract misspellings (originals) + replacements (suggestions)

In [4]:
# Get whole texts
whole_texts = []

for csv_file in sorted(glob.glob(r'./Input_texts_csv/'+'*.csv')):
    with open(csv_file, 'r', encoding='utf-8') as f:
        # not readlines
        # read file as a whole, i. e. all texts
       
        text = f.read()
        #csv.reader(f, delimiter)
        text_n = text.replace('\n', ' ')
        #text_prep = text_n.replace(' .', '.')

        whole_texts.append(text_n)

In [6]:
#whole_texts

In [22]:
# each text as list element
type(whole_texts) # list
type(whole_texts[0]) # str

str

In [5]:
# Extract mistakes and corrections
start_positions = []
end_positions = []
num_texts = []
#offsets = []
#error_lengths = []

my_mistakes = []
my_corrections = []

rule_id = []

text_count = 0

for text in x:
    
    for match in text:
    
        #print(type(i))
        #print(i)
        start_positions.append(match.offset)
        end_positions.append(match.errorLength+match.offset)
        num_texts.append(text_count)
        
        my_mistakes.append(whole_texts[text_count][match.offset:match.errorLength+match.offset])
        my_corrections.append(match.replacements)
        
        rule_id.append(match.ruleId)
        
    text_count+=1    

In [6]:
error_list_full = list(zip(my_mistakes,my_corrections,rule_id,num_texts,start_positions,end_positions))

In [7]:
# Convert to df
df_server = pd.DataFrame(data=error_list_full, columns=['original','suggestions','rule_id','num_texts','start_position','end_position'])

In [8]:
df_server.head(10)
#df_server.tail(20)

Unnamed: 0,original,suggestions,rule_id,num_texts,start_position,end_position
0,Dodo,"[Foto, Tot, Voodoo, Dato, Bodo, Dojo, Tote, Toto, Tod, Töte, Dorf, Dito, Otto, Bot, Gott, Dort, Motor, Doch, Tode, Rot]",GERMAN_SPELLER_RULE,0,17,21
1,Dodo,"[Foto, Tot, Voodoo, Dato, Bodo, Dojo, Tote, Toto, Tod, Töte, Dorf, Dito, Otto, Bot, Gott, Dort, Motor, Doch, Tode, Rot]",GERMAN_SPELLER_RULE,0,41,45
2,belt,"[Belt, Welt, bald, hält, Geld, Feld, Held, Zelt, bebt, bellt, perlt, pell, best, meld, peilt, pellt, pult, zelt, bei, seit]",GERMAN_SPELLER_RULE,0,46,50
3,kukt,"[rückt, zückt, bückt, guckt, kickt, kackt, gut, rückte, deckt, fügt, Lücke, lockt, weckt, packt, Mücke, kund, kürt, lügt, hackt, hockt]",GERMAN_SPELLER_RULE,0,80,84
4,Dodo,"[Foto, Tot, Voodoo, Dato, Bodo, Dojo, Tote, Toto, Tod, Töte, Dorf, Dito, Otto, Bot, Gott, Dort, Motor, Doch, Tode, Rot]",GERMAN_SPELLER_RULE,0,85,89
5,dan,"[dann, Dan, den, an, das, man, da, dar, van, San, tun, Jan, dank, tat, dran, ran, tank, Fan, Idan, Pan]",GERMAN_SPELLER_RULE,0,95,98
6,Ihren Eis,"[Ihr Eis, Ihrem Eis, Ihren Eiern, Ihres Eis]",DE_AGREEMENT,0,109,118
7,gekricht,"[gebricht, gegeigt, gekickt, gekracht, gebracht, geprägt, gezeigt, Gericht, Bericht, gefragt, gerecht, gekocht, geneigt, gereicht, Rücksicht, gerückt, geklagt, geeicht, gegrillt, geknickt]",GERMAN_SPELLER_RULE,0,119,127
8,leken,"[legen, locken, lägen, Säcken, lecken, gegen, wegen, leben, lagen, lesen, Becken, decken, dicken, legten, lenken, locker, regen, rücken, Lücken, Nacken]",GERMAN_SPELLER_RULE,0,136,141
9,felt,"[Welt, fest, fällt, hält, Geld, fehlt, Feld, Held, fett, Zelt, fegt, Belt, feilt, feit, fielt, meld, pfählt, zelt, seit]",GERMAN_SPELLER_RULE,0,164,168


In [9]:
df_server.tail(10)

Unnamed: 0,original,suggestions,rule_id,num_texts,start_position,end_position
42550,einen,[einem],DEN_DEM,1921,770,775
42551,zur Tor,[dem Tor],DE_AGREEMENT,1921,816,823
42552,umarte,"[um arte, smarte, umarmte, umarme, Karte, harte, malte, uralte, arte, klärte, marode, sparte, umbaute, zarte, Quarte, matte, smart, starte, umarmen, umarmt]",GERMAN_SPELLER_RULE,1921,828,834
42553,Lar,"[War, Dar, Gar, Lag, Bar, Zar, Klar, Bär, Lars, Las, Lahr, Lara, Lärm, Rar, Wär, Lear, Lau, Lax, Aar, Lab]",GERMAN_SPELLER_RULE,1921,835,838
42554,nieht,"[nicht, sieht, zieht, niet, lieht, niest, liegt, geht, steht, dient, hielt, nichts, nie, nimmt, siehe, Licht, Sicht, dicht, dreht, liebt]",GERMAN_SPELLER_RULE,1921,886,891
42555,Dodo,"[Foto, Tot, Voodoo, Dato, Bodo, Dojo, Tote, Toto, Tod, Töte, Dorf, Dito, Otto, Bot, Gott, Dort, Motor, Doch, Tode, Rot]",GERMAN_SPELLER_RULE,1921,894,898
42556,ass,"[Ass, aß, als, aus, dass, ans, saß, Hass, Fass, nass, bass, lass, pass, Bass, Pass, aßt, iss, äse, äst, des]",GERMAN_SPELLER_RULE,1921,912,915
42557,mutzte,"[musste, nutzte, müsste, nützte, mutete, bützte, motzte, münzte, putzte, setzte, letzte, mussten, müsse, nutzt, Hütte, Künste, meiste, müssten, nutzten, wusste]",GERMAN_SPELLER_RULE,1921,923,929
42558,momen,"[mögen, Domen, mimen, zoomen, Nomen, boomen, Namen, kamen, hohen, oben, Polen, Roman, hören, roten, zogen, Boden, lösen, töten, Böhmen, Löwen]",GERMAN_SPELLER_RULE,1921,934,939
42559,Lutscha,"[Lutsch, Luigi, Lutsche, Lunge, Lutscht, Putsch, Kutsche, Rutscht, Ludger, Lusaka, Putsche, Luchs, Rutsche, Luchse, Rutsch, Latsche, Blutschau, Datscha, Dougga, Hodscha]",GERMAN_SPELLER_RULE,1921,962,969


In [10]:
df_server.shape[0] 
# Before troubleshooting 42038 token
# After troubleshooting 42560 token

42560

## 3 - Get reference df to merge with
- Load original texts
- Preprocess (... as before)
- Get offsets

In [12]:
# Get original texts, i. e. data_raw_token
# In data_raw_token, NA-cases and headline markers are already removed
#original_texts = litkey_2.load_raw(litkey_data_path = "../../litkey-data/")

In [11]:
def combineCsvsToDataframe_x(data_path):
    df=[] # type: list
    
    text_count = 0
    
    for csv_file in sorted(glob.glob(data_path+"*.csv")):
        frame = pd.read_csv(filepath_or_buffer=csv_file, sep='\t', quoting=csv.QUOTE_NONE, names=['original', 'corrected']) # Get columns (o & c)
        frame['filename'] = os.path.basename(csv_file) # Get column filename
        frame['num_texts'] = text_count
        df.append(frame)
        
        text_count+=1
        
    #display(type(frame)) : DataFrame
    #display(type(df)) : List
    df = pd.concat(df)
    return df

In [12]:
def load_raw_x(data):

    # REMOVE NA-CASES, i. e. line-ends (^) (no other relevant NA-cases)
    data.dropna(subset=['original','corrected'], inplace=True)
    
    # REMOVE HEADLINE MARKERS, i. e. \h
    data = data[~data.original.str.contains(r'\\h')]
    
    # Reset index
    data.reset_index(drop=True, inplace=True)
    
    return data

In [13]:
original_texts_prep = combineCsvsToDataframe_x("../../litkey-data/")
original_texts = load_raw_x(original_texts_prep)

In [14]:
# Preprocessing like in language_tool_prep.ipynb
# Remove "|" and "_" characters, i. e. original is one/two-word whereas target is two/one-word
original_texts['original']=original_texts['original'].str.replace(r'[\|\_]', '', regex=True)
original_texts['corrected']=original_texts['corrected'].str.replace(r'[\|\_]', '', regex=True)
    
# Remove intended line-break characters '^', '-^', '^-'
pattern = r'(\-\^)|(\^\-)|(\^)'
original_texts['original'] = original_texts['original'].str.replace(pattern, '', regex=True)
    
# Remove words containing illegible character(s), i. e. containing '*' (original)
original_texts = original_texts[~original_texts.original.str.contains(r'\*')]
    
# Remove non-words ('?' target not identifiable or no standardized spelling | '~' target non existing word form)
original_texts = original_texts[~original_texts.corrected.str.contains(r'\?|\~')]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_texts['original']=original_texts['original'].str.replace(r'[\|\_]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_texts['corrected']=original_texts['corrected'].str.replace(r'[\|\_]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_texts['o

In [7]:
original_texts.reset_index(drop=True, inplace=True)
#original_texts.tail(100)

In [8]:
# Get list of filenames
filename_list = list(original_texts.filename.unique())

In [9]:
# Initialize empty columns
original_texts['length'] = ''
original_texts['start_position'] = ''
original_texts['end_position'] = ''
original_texts['ct'] = ''

In [10]:
original_texts.head(2)

Unnamed: 0,original,corrected,filename,num_texts,length,start_position,end_position,ct
0,Lea,Lea,01-005-2-III-Eis.csv,0,,,,
1,und,und,01-005-2-III-Eis.csv,0,,,,


In [11]:
# GET START POSITION
def get_starts(df):
    for i in range(len(df)):
        if df.loc[i, 'ct'] == 0:
            df.loc[i, 'start_position'] = 0
        #elif df.loc[i, 'original'] == '.':
        #    df.loc[i, 'start_position'] = df.loc[i-1, 'start_position'] + df.loc[i-1, 'length']
        else:
            df.loc[i, 'start_position'] = df.loc[i-1, 'start_position'] + df.loc[i-1, 'length'] + 1
            # if punkt, dann nicht +1
    return df        

In [12]:
# GET END POSITION
def get_ends(df):
    for i in range(len(df)):
        df.loc[i, 'end_position'] =  df.loc[i, 'start_position'] + df.loc[i, 'length']
    return df    

In [13]:
ref_list = []

for file in filename_list:
    # Get part of the dataframe that matches filename
    
    df = original_texts.loc[original_texts.filename == file]
    df.reset_index(inplace=True)
    df['ct'] = df.index
    
    df['length'] = df.apply(lambda x: len(x.original), axis=1)
   
    # ASSIGN RETURN VALUE LOL
    df = get_starts(df)
    df = get_ends(df)
    
    ref_list.append(df)

# RETURN VALUES ARE NOW IN DF NOT IN ORIGINAL_TEXTS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ct'] = df.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['length'] = df.apply(lambda x: len(x.original), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [14]:
# Convert list to dataframe
ref_list = pd.concat(ref_list)

In [15]:
ref_list.shape[0] # 210771 token

210771

In [45]:
ref_list.tail(10)

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct
196,212493,zu,zu,10-693-4-IV-Weg.csv,1921,2,970,972,196
197,212494,schenken,schenken,10-693-4-IV-Weg.csv,1921,8,973,981,197
198,212495,.,.,10-693-4-IV-Weg.csv,1921,1,982,983,198
199,212496,P.S.,P.S.,10-693-4-IV-Weg.csv,1921,4,984,988,199
200,212497,Lars,Lars,10-693-4-IV-Weg.csv,1921,4,989,993,200
201,212498,ist,ist,10-693-4-IV-Weg.csv,1921,3,994,997,201
202,212499,in,in,10-693-4-IV-Weg.csv,1921,2,998,1000,202
203,212500,Lea,Lea,10-693-4-IV-Weg.csv,1921,3,1001,1004,203
204,212501,verknallt,verknallt,10-693-4-IV-Weg.csv,1921,9,1005,1014,204
205,212502,.,.,10-693-4-IV-Weg.csv,1921,1,1015,1016,205


In [17]:
# Zwischenstand
#ref_list.to_pickle('data_raw_token_whole_texts_offsets.pkl')
#ref_list.to_pickle('data_raw_token_whole_texts_offsets_OTHERINDEX_20211208.pkl')

In [15]:
#ref_list = pd.read_pickle('./pickles/data_raw_token_whole_texts_offsets.pkl')
ref_list = pd.read_pickle('./pickles/data_raw_token_whole_texts_offsets_OTHERINDEX_20211208.pkl')

In [16]:
#ref_list.head(10)
ref_list.tail(10)
ref_list.shape[0]

210771

## 4 - Merge back to original df
- data_error_token

In [17]:
# MERGE (original_texts, df_server)
suggs = ref_list.merge(df_server, how='left', on=['original', 'num_texts', 'start_position', 'end_position'])

In [18]:
# CHECK FOR DUPLICATES; Mark all duplicates as True
# Server results
df_server[df_server.duplicated(subset=['original', 'num_texts', 'start_position', 'end_position'], keep=False)] # 10.390 / 42.038

Unnamed: 0,original,suggestions,rule_id,num_texts,start_position,end_position


In [19]:
# Reference dataframe
ref_list[ref_list.duplicated(subset=['original', 'num_texts', 'start_position', 'end_position'], keep=False)] # 116.691 / 210.771

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct


In [20]:
suggs.shape[0] # equals ref_list

210771

In [21]:
suggs.tail(10) # 210770 with resetted index

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id
210761,210761,zu,zu,10-693-4-IV-Weg.csv,1921,2,970,972,196,,
210762,210762,schenken,schenken,10-693-4-IV-Weg.csv,1921,8,973,981,197,,
210763,210763,.,.,10-693-4-IV-Weg.csv,1921,1,982,983,198,,
210764,210764,P.S.,P.S.,10-693-4-IV-Weg.csv,1921,4,984,988,199,,
210765,210765,Lars,Lars,10-693-4-IV-Weg.csv,1921,4,989,993,200,,
210766,210766,ist,ist,10-693-4-IV-Weg.csv,1921,3,994,997,201,,
210767,210767,in,in,10-693-4-IV-Weg.csv,1921,2,998,1000,202,,
210768,210768,Lea,Lea,10-693-4-IV-Weg.csv,1921,3,1001,1004,203,,
210769,210769,verknallt,verknallt,10-693-4-IV-Weg.csv,1921,9,1005,1014,204,,
210770,210770,.,.,10-693-4-IV-Weg.csv,1921,1,1015,1016,205,,


In [22]:
suggs.tail(10) # 210770 with non-resetted index

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id
210761,210761,zu,zu,10-693-4-IV-Weg.csv,1921,2,970,972,196,,
210762,210762,schenken,schenken,10-693-4-IV-Weg.csv,1921,8,973,981,197,,
210763,210763,.,.,10-693-4-IV-Weg.csv,1921,1,982,983,198,,
210764,210764,P.S.,P.S.,10-693-4-IV-Weg.csv,1921,4,984,988,199,,
210765,210765,Lars,Lars,10-693-4-IV-Weg.csv,1921,4,989,993,200,,
210766,210766,ist,ist,10-693-4-IV-Weg.csv,1921,3,994,997,201,,
210767,210767,in,in,10-693-4-IV-Weg.csv,1921,2,998,1000,202,,
210768,210768,Lea,Lea,10-693-4-IV-Weg.csv,1921,3,1001,1004,203,,
210769,210769,verknallt,verknallt,10-693-4-IV-Weg.csv,1921,9,1005,1014,204,,
210770,210770,.,.,10-693-4-IV-Weg.csv,1921,1,1015,1016,205,,


## 5 - Drop words that are not examined in analysis
- Remove words/characters that are not desired in analysis (**_in alignment with language_tool_prep Notebook_**)
- Drop obsolete columns
- Get other information to complete df

- Get Types

In [23]:
# Get clean data
suggs_clean_token = helpers_lt.removeCharactersWords_x(suggs)

In [24]:
suggs_clean_token.head(2)
suggs_clean_token.tail(2)

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id
210767,210767,in,in,10-693-4-IV-Weg.csv,1921,2,998,1000,202,,
210769,210769,verknallt,verknallt,10-693-4-IV-Weg.csv,1921,9,1005,1014,204,,


In [25]:
# Get error data
suggs_error_token = helpers.removeCorrectWords(suggs_clean_token)

In [26]:
suggs_error_token.head(2)
suggs_error_token.tail(2)

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id
210754,210754,momen,Moment,10-693-4-IV-Weg.csv,1921,5,934,939,189,"[mögen, Domen, mimen, zoomen, Nomen, boomen, Namen, kamen, hohen, oben, Polen, Roman, hören, roten, zogen, Boden, lösen, töten, Böhmen, Löwen]",GERMAN_SPELLER_RULE
210760,210760,Lutscha,Lutscher,10-693-4-IV-Weg.csv,1921,7,962,969,195,"[Lutsch, Luigi, Lutsche, Lunge, Lutscht, Putsch, Kutsche, Rutscht, Ludger, Lusaka, Putsche, Luchs, Rutsche, Luchse, Rutsch, Latsche, Blutschau, Datscha, Dougga, Hodscha]",GERMAN_SPELLER_RULE


In [27]:
# Drop obsolete columns
#suggs_error_token_x = suggs_error_token.drop(columns=['index', 'num_texts', 'length', 'start_position', 'end_position', 'ct'])
suggs_error_token_x = suggs_error_token

In [28]:
suggs_error_token_x.head(2)
#suggs_error_token_x.shape[0]

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id
10,10,belt,bellt,01-005-2-III-Eis.csv,0,4,46,50,10,"[Belt, Welt, bald, hält, Geld, Feld, Held, Zelt, bebt, bellt, perlt, pell, best, meld, peilt, pellt, pult, zelt, bei, seit]",GERMAN_SPELLER_RULE
17,17,kukt,kuckt,01-005-2-III-Eis.csv,0,4,80,84,17,"[rückt, zückt, bückt, guckt, kickt, kackt, gut, rückte, deckt, fügt, Lücke, lockt, weckt, packt, Mücke, kund, kürt, lügt, hackt, hockt]",GERMAN_SPELLER_RULE


In [29]:
# Get other information to complete data (token tupel counts)
def get_tupel_counts(data):
    # Tupel frequency is returned as a multi-indexed Series, therefore merging is different
    data['freq_ori'] = data.groupby('original').size()[data.original].values
    data['freq_cor'] = data.groupby('corrected').size()[data.corrected].values
    
    freq_tupel = data.groupby(['original','corrected']).size()
    data = data.merge(freq_tupel.to_frame(), how='left', left_on=['original', 'corrected'], right_on=['original', 'corrected'])
    data.rename({0: 'freq_tup'}, axis='columns', inplace = True)
    
    return data

In [30]:
suggs_error_token_x = get_tupel_counts(suggs_error_token_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['freq_ori'] = data.groupby('original').size()[data.original].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['freq_cor'] = data.groupby('corrected').size()[data.corrected].values


In [36]:
suggs_error_token_x.head(2)

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id,freq_ori,freq_cor,freq_tup
0,10,belt,bellt,01-005-2-III-Eis.csv,0,4,46,50,10,"[Belt, Welt, bald, hält, Geld, Feld, Held, Zelt, bebt, bellt, perlt, pell, best, meld, peilt, pellt, pult, zelt, bei, seit]",GERMAN_SPELLER_RULE,91,138,91
1,17,kukt,kuckt,01-005-2-III-Eis.csv,0,4,80,84,17,"[rückt, zückt, bückt, guckt, kickt, kackt, gut, rückte, deckt, fügt, Lücke, lockt, weckt, packt, Mücke, kund, kürt, lügt, hackt, hockt]",GERMAN_SPELLER_RULE,73,152,73


In [32]:
# Reset index
suggs_error_token_x.reset_index(drop=True, inplace=True)

# Evaluation
- Token
    - lc
    - cs
- Before
    - Get value counts
    - Reset index
    - Replace NaNs

In [33]:
suggs_error_token_x.suggestions.fillna(value='', inplace=True)

In [34]:
# TOKEN
token_eval_lc = litkey_2.evaluate(suggs_error_token_x)
token_eval_cs = litkey_2.evaluate(suggs_error_token_x, lower_case=False)

  0%|          | 0/24601 [00:00<?, ?it/s]

  0%|          | 0/24601 [00:00<?, ?it/s]

In [37]:
token_eval_lc.shape[0] # 24601
token_eval_cs.shape[0] # 24601

24601

In [42]:
token_eval_lc.head(2)

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id,freq_ori,freq_cor,freq_tup,in_sugg,sugg_idx,idx0
0,10,belt,bellt,01-005-2-III-Eis.csv,0,4,46,50,10,"[belt, welt, bald, hält, geld, feld, held, zelt, bebt, bellt, perlt, pell, best, meld, peilt, pellt, pult, zelt, bei, seit]",GERMAN_SPELLER_RULE,91,138,91,True,9.0,False
1,17,kukt,kuckt,01-005-2-III-Eis.csv,0,4,80,84,17,"[rückt, zückt, bückt, guckt, kickt, kackt, gut, rückte, deckt, fügt, lücke, lockt, weckt, packt, mücke, kund, kürt, lügt, hackt, hockt]",GERMAN_SPELLER_RULE,73,152,73,False,,False


In [30]:
# EXPORT TOKEN
#token_eval_lc.to_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_lc.pkl')
#token_eval_cs.to_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_cs.pkl')

In [38]:
# Export pickle with ALL COLUMNS; ESP. OFFSETS
#token_eval_lc.to_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_lc_ALL_COLUMNS_OFFSETS.pkl')
#token_eval_lc.to_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_lc_ALL_COLUMNS_OFFSETS_OTHERINDEX_20211208.pkl')

# Export also for case sensitive; Language Tool with all text information
token_eval_cs.to_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_cs_ALL_COLUMNS_OFFSETS_OTHERINDEX_20211208.pkl')

In [43]:
# Export pickle with ALL COLUMNS; RULEID
#token_eval_lc.to_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_lc_ALL_COLUMNS_OFFSETS_RULEID.pkl')

***

In [31]:
token_eval_lc = pd.read_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_lc.pkl')
token_eval_cs = pd.read_pickle('./pickles/pickles_evaluation/data_error_token_language_tool_whole_texts_evaluation_cs.pkl')

In [32]:
token_eval_lc[token_eval_lc.suggestions == ''].shape[0] # 11070

5009

In [33]:
token_eval_lc.tail(10)

Unnamed: 0,original,corrected,filename,suggestions,freq_ori,freq_cor,freq_tup,in_sugg,sugg_idx,idx0
24591,nieht,nicht,10-693-4-IV-Weg.csv,"[nicht, sieht, zieht, niet, lieht, niest, liegt, geht, steht, dient, hielt, nichts, nie, nimmt, siehe, licht, sicht, dicht, dreht, liebt]",2,135,2,True,0.0,True
24592,vessen,fassen,10-693-4-IV-Weg.csv,"[dessen, hessen, festen, fassen, fressen, messen, essen, fesseln, wessen, pässen, fußen, bässen, säßen, fetzen, messen, nässen, fessel, kessen, mäßen, nässen]",1,2,1,True,3.0,False
24593,entlich,endlich,10-693-4-IV-Weg.csv,"[endlich, endliche, entwich, entlieh, eidlich, ältlich, deutlich, südlich, westlich, englisch, etliche, seitlich, tödlich, zeitlich, amtlich, endlichen, entließ, entzog, rötlich, erblich]",10,20,10,True,0.0,True
24594,tor,tür,10-693-4-IV-Weg.csv,,1,15,1,False,,False
24595,umarte,umarmte,10-693-4-IV-Weg.csv,"[um arte, smarte, umarmte, umarme, karte, harte, malte, uralte, arte, klärte, marode, sparte, umbaute, zarte, quarte, matte, smart, starte, umarmen, umarmt]",2,16,2,True,2.0,False
24596,nieht,nicht,10-693-4-IV-Weg.csv,"[nicht, sieht, zieht, niet, lieht, niest, liegt, geht, steht, dient, hielt, nichts, nie, nimmt, siehe, licht, sicht, dicht, dreht, liebt]",2,135,2,True,0.0,True
24597,ass,aß,10-693-4-IV-Weg.csv,"[ass, aß, als, aus, dass, ans, saß, hass, fass, nass, bass, lass, pass, bass, pass, aßt, iss, äse, äst, des]",3,7,3,True,1.0,False
24598,mutzte,nutzte,10-693-4-IV-Weg.csv,"[musste, nutzte, müsste, nützte, mutete, bützte, motzte, münzte, putzte, setzte, letzte, mussten, müsse, nutzt, hütte, künste, meiste, müssten, nutzten, wusste]",1,1,1,True,1.0,False
24599,momen,moment,10-693-4-IV-Weg.csv,"[mögen, domen, mimen, zoomen, nomen, boomen, namen, kamen, hohen, oben, polen, roman, hören, roten, zogen, boden, lösen, töten, böhmen, löwen]",1,4,1,False,,False
24600,lutscha,lutscher,10-693-4-IV-Weg.csv,"[lutsch, luigi, lutsche, lunge, lutscht, putsch, kutsche, rutscht, ludger, lusaka, putsche, luchs, rutsche, luchse, rutsch, latsche, blutschau, datscha, dougga, hodscha]",8,56,8,False,,False


In [74]:
token_eval_lc.tail(10)

Unnamed: 0,index,original,corrected,filename,num_texts,length,start_position,end_position,ct,suggestions,rule_id,freq_ori,freq_cor,freq_tup,in_sugg,sugg_idx,idx0
24591,210710,nieht,nicht,10-693-4-IV-Weg.csv,1921,5,721,726,145,"[nicht, sieht, zieht, niet, lieht, niest, liegt, geht, steht, dient, hielt, nichts, nie, nimmt, siehe, licht, sicht, dicht, dreht, liebt]",GERMAN_SPELLER_RULE,2,135,2,True,0.0,True
24592,210711,vessen,fassen,10-693-4-IV-Weg.csv,1921,6,727,733,146,"[dessen, hessen, festen, fassen, fressen, messen, essen, fesseln, wessen, pässen, fußen, bässen, säßen, fetzen, messen, nässen, fessel, kessen, mäßen, nässen]",GERMAN_SPELLER_RULE,1,2,1,True,3.0,False
24593,210715,entlich,endlich,10-693-4-IV-Weg.csv,1921,7,744,751,150,"[endlich, endliche, entwich, entlieh, eidlich, ältlich, deutlich, südlich, westlich, englisch, etliche, seitlich, tödlich, zeitlich, amtlich, endlichen, entließ, entzog, rötlich, erblich]",GERMAN_SPELLER_RULE,10,20,10,True,0.0,True
24594,210730,tor,tür,10-693-4-IV-Weg.csv,1921,3,820,823,165,,,1,15,1,False,,False
24595,210732,umarte,umarmte,10-693-4-IV-Weg.csv,1921,6,828,834,167,"[um arte, smarte, umarmte, umarme, karte, harte, malte, uralte, arte, klärte, marode, sparte, umbaute, zarte, quarte, matte, smart, starte, umarmen, umarmt]",GERMAN_SPELLER_RULE,2,16,2,True,2.0,False
24596,210743,nieht,nicht,10-693-4-IV-Weg.csv,1921,5,886,891,178,"[nicht, sieht, zieht, niet, lieht, niest, liegt, geht, steht, dient, hielt, nichts, nie, nimmt, siehe, licht, sicht, dicht, dreht, liebt]",GERMAN_SPELLER_RULE,2,135,2,True,0.0,True
24597,210749,ass,aß,10-693-4-IV-Weg.csv,1921,3,912,915,184,"[ass, aß, als, aus, dass, ans, saß, hass, fass, nass, bass, lass, pass, bass, pass, aßt, iss, äse, äst, des]",GERMAN_SPELLER_RULE,3,7,3,True,1.0,False
24598,210752,mutzte,nutzte,10-693-4-IV-Weg.csv,1921,6,923,929,187,"[musste, nutzte, müsste, nützte, mutete, bützte, motzte, münzte, putzte, setzte, letzte, mussten, müsse, nutzt, hütte, künste, meiste, müssten, nutzten, wusste]",GERMAN_SPELLER_RULE,1,1,1,True,1.0,False
24599,210754,momen,moment,10-693-4-IV-Weg.csv,1921,5,934,939,189,"[mögen, domen, mimen, zoomen, nomen, boomen, namen, kamen, hohen, oben, polen, roman, hören, roten, zogen, boden, lösen, töten, böhmen, löwen]",GERMAN_SPELLER_RULE,1,4,1,False,,False
24600,210760,lutscha,lutscher,10-693-4-IV-Weg.csv,1921,7,962,969,195,"[lutsch, luigi, lutsche, lunge, lutscht, putsch, kutsche, rutscht, ludger, lusaka, putsche, luchs, rutsche, luchse, rutsch, latsche, blutschau, datscha, dougga, hodscha]",GERMAN_SPELLER_RULE,8,56,8,False,,False
