In [19]:
import re
from Levenshtein import distance
from tqdm import tqdm
import textwrap
import evaluate
import datasets
from difflib import SequenceMatcher
from difflib import Differ
from sklearn.metrics import precision_recall_fscore_support
import shutil
from evaluate_functions import *

In [20]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

## Gold data

In [21]:
# Gold data (human annotated)
with open('gold_data/Multatuli_MaxHavelaar_gold.txt', 'r') as g1:
    gold_data1 = g1.readlines()
    
with open('gold_data/ConanDoyle_SherlockHolmesDeAgraSchat_gold.txt', 'r') as g2:
    gold_data2 = g2.readlines()
    
with open('gold_data/Nescio_Titaantjes_gold.txt', 'r') as g3:
    gold_data3 = g3.readlines()

## Rule-based predictions

In [22]:
# Silver data from Rule-Based
with open('silver_data/Multatuli_MaxHavelaar_silver.txt', 'r') as s1:
    silver_data1 = s1.readlines()
    
with open('silver_data/ConanDoyle_SherlockHolmesDeAgraSchat_silver.txt', 'r') as s2:
    silver_data2 = s2.readlines()
    
with open('silver_data/Nescio_Titaantjes_silver.txt', 'r') as s3:
    silver_data3 = s3.readlines()

## Predictions Flan-T5 for 5k and 10k train data

In [23]:
# Predictions from Flan-T5 not pretrained 5K
with open('FlanT5_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r') as flan_p1_5k:
    flan_pred1_5k = flan_p1_5k.readlines()

with open('FlanT5_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as flan_p2_5k:
    flan_pred2_5k = flan_p2_5k.readlines()
    
with open('FlanT5_pred5k/Nescio_Titaantjes_pred.txt', 'r') as flan_p3_5k:
    flan_pred3_5k = flan_p3_5k.readlines()

In [24]:
# Predictions from Flan-T5 not pretrained 10K
with open('FlanT5_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r') as flan_p1_10k:
    flan_pred1_10k = flan_p1_10k.readlines()

with open('FlanT5_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as flan_p2_10k:
    flan_pred2_10k = flan_p2_10k.readlines()
    
with open('FlanT5_pred10k/Nescio_Titaantjes_pred.txt', 'r') as flan_p3_10k:
    flan_pred3_10k = flan_p3_10k.readlines()

## Predictions ByT5 original (not pretrained) for 5k and 10k train data

In [25]:
# Predictions from ByT5 not pretrained 5K
with open('ByT5_orig_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r') as byt5_p1_5k:
    byt5_pred1_5k = byt5_p1_5k.readlines()

with open('ByT5_orig_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as byt5_p2_5k:
    byt5_pred2_5k = byt5_p2_5k.readlines()
    
with open('ByT5_orig_pred5k/Nescio_Titaantjes_pred.txt', 'r') as byt5_p3_5k:
    byt5_pred3_5k = byt5_p3_5k.readlines()

In [26]:
# Predictions from ByT5 not pretrained 5K
with open('ByT5_orig_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r') as byt5_p1_10k:
    byt5_pred1_10k = byt5_p1_10k.readlines()

with open('ByT5_orig_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as byt5_p2_10k:
    byt5_pred2_10k = byt5_p2_10k.readlines()
    
with open('ByT5_orig_pred10k/Nescio_Titaantjes_pred.txt', 'r') as byt5_p3_10k:
    byt5_pred3_10k = byt5_p3_10k.readlines()

## Predictions ByT5 pretrained Books for 5k and 10k train data

In [27]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Books 5K
with open('ByT5_pre_books_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r') as books1_5k:
    byt5_books_pred1_5k = books1_5k.readlines()

with open('ByT5_pre_books_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as books2_5k:
    byt5_books_pred2_5k = books2_5k.readlines()
    
with open('ByT5_pre_books_pred5k/Nescio_Titaantjes_pred.txt', 'r') as books3_5k:
    byt5_books_pred3_5k = books3_5k.readlines()

In [28]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Books 5K
with open('ByT5_pre_books_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r') as books1_10k:
    byt5_books_pred1_10k = books1_10k.readlines()

with open('ByT5_pre_books_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as books2_10k:
    byt5_books_pred2_10k = books2_10k.readlines()
    
with open('ByT5_pre_books_pred10k/Nescio_Titaantjes_pred.txt', 'r') as books3_10k:
    byt5_books_pred3_10k = books3_10k.readlines()

## Predictions ByT5 pretrained Sonar for 5k and 10k train data

In [29]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Sonar 5K
with open('ByT5_pre_sonar_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r') as sonar1_5k:
    byt5_sonar_pred1_5k = sonar1_5k.readlines()

with open('ByT5_pre_sonar_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as sonar2_5k:
    byt5_sonar_pred2_5k = sonar2_5k.readlines()
    
with open('ByT5_pre_sonar_pred5k/Nescio_Titaantjes_pred.txt', 'r') as sonar3_5k:
    byt5_sonar_pred3_5k = sonar3_5k.readlines()

In [30]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Sonar 5K
with open('ByT5_pre_sonar_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r') as sonar1_10k:
    byt5_sonar_pred1_10k = sonar1_10k.readlines()

with open('ByT5_pre_sonar_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r') as sonar2_10k:
    byt5_sonar_pred2_10k = sonar2_10k.readlines()
    
with open('ByT5_pre_sonar_pred10k/Nescio_Titaantjes_pred.txt', 'r') as sonar3_10k:
    byt5_sonar_pred3_10k = sonar3_10k.readlines()

In [31]:
'''
ERR, Chrf++ scores:
1. FlanT5 
2. ByT5 original
3. ByT5 pretrained Sonar 
4. ByT5 pretrained Books 
5. Rule-based

Best results for each novel:

1. Max Havelaar Multatuli: 1. Rule-Based (ERR: 68.35%)
2. Conan Doyle Sherlock Holmes De Agra Schat: 4. ByT5 pretrained sonar (ERR: 71.71%)
3. Nescio Titaantjes: 3. ByT5 pretrained books (ERR: 83.79%%)


1. Max Havelaar Multatuli: 1. Rule-Based (ChrF++: 68.35%)
2. Conan Doyle Sherlock Holmes De Agra Schat: 4. ByT5 pretrained sonar (ChrF++: 71.71%)
3. Nescio Titaantjes: 3. ByT5 pretrained books (ChrF++: 83.79%%)
'''

'\nERR, Chrf++ scores:\n1. FlanT5 \n2. ByT5 original\n3. ByT5 pretrained Sonar \n4. ByT5 pretrained Books \n5. Rule-based\n\nBest results for each novel:\n\n1. Max Havelaar Multatuli: 1. Rule-Based (ERR: 68.35%)\n2. Conan Doyle Sherlock Holmes De Agra Schat: 4. ByT5 pretrained sonar (ERR: 71.71%)\n3. Nescio Titaantjes: 3. ByT5 pretrained books (ERR: 83.79%%)\n\n\n1. Max Havelaar Multatuli: 1. Rule-Based (ChrF++: 68.35%)\n2. Conan Doyle Sherlock Holmes De Agra Schat: 4. ByT5 pretrained sonar (ChrF++: 71.71%)\n3. Nescio Titaantjes: 3. ByT5 pretrained books (ChrF++: 83.79%%)\n'

## 1. Flan T5 (not pretrained) 5K

In [32]:
print('Max Havelaar Multatuli (FlanT5):')
_, _ = evaluate_T5(gold_data1, flan_pred1_5k, 'Flan-T5')

Max Havelaar Multatuli (FlanT5):
Baseline Accuracy: 96.04%
Accuracy: 98.24%
Error Reduction Rate: 55.44%
Avg Precision: 94.19%
Avg Recall: 94.76%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 97.16
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 97.78
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 98.14
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.43
   char_order: 6
   word_order: 0
   beta: 2


In [33]:
print('Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):')
_, _ = evaluate_T5(gold_data2, flan_pred2_5k, 'Flan-T5')

Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):
Baseline Accuracy: 94.83%
Accuracy: 97.87%
Error Reduction Rate: 58.72%
Avg Precision: 94.92%
Avg Recall: 95.97%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 97.86
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.29
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 97.98
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.4
   char_order: 6
   word_order: 0
   beta: 2


In [34]:
print('Nescio Titaantjes (FlanT5):') 
_, _ = evaluate_T5(gold_data3, flan_pred3_5k, 'Flan-T5')

Nescio Titaantjes (FlanT5):
Baseline Accuracy: 96.28%
Accuracy: 98.95%
Error Reduction Rate: 71.69%
Avg Precision: 96.32%
Avg Recall: 97.10%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 98.64
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.06
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 99.0
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.19
   char_order: 6
   word_order: 0
   beta: 2


## 1. Flan T5 (not pretrained) 10K

In [35]:
print('Max Havelaar Multatuli (FlanT5):')
_T, _T1 = evaluate_T5(gold_data1, flan_pred1_10k, 'Flan-T5')

Max Havelaar Multatuli (FlanT5):
Baseline Accuracy: 96.04%
Accuracy: 98.48%
Error Reduction Rate: 61.52%
Avg Precision: 94.85%
Avg Recall: 95.21%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 97.96
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.58
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 98.34
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.58
   char_order: 6
   word_order: 0
   beta: 2


In [36]:
print('Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):')
_, _ = evaluate_T5(gold_data2, flan_pred2_10k, 'Flan-T5')

Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):
Baseline Accuracy: 94.83%
Accuracy: 98.04%
Error Reduction Rate: 62.02%
Avg Precision: 95.50%
Avg Recall: 96.29%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 98.03
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.44
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 98.13
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.52
   char_order: 6
   word_order: 0
   beta: 2


In [37]:
print('Nescio Titaantjes (FlanT5):') 
_, _ = evaluate_T5(gold_data3, flan_pred3_10k, 'Flan-T5')

Nescio Titaantjes (FlanT5):
Baseline Accuracy: 96.28%
Accuracy: 99.14%
Error Reduction Rate: 76.94%
Avg Precision: 96.85%
Avg Recall: 97.46%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 98.93
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.33
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 99.17
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.33
   char_order: 6
   word_order: 0
   beta: 2


## 2. ByT5 original (not pretrained) 5K

In [38]:
print('Max Havelaar Multatuli (ByT5 orig):')
byt5_orig_pred1_5K, byt5_orig_gold1_5K = evaluate_T5(gold_data1, byt5_pred1_5k, 'ByT5 orig')

Max Havelaar Multatuli (ByT5 orig):
Baseline Accuracy: 96.04%
Accuracy: 98.72%
Error Reduction Rate: 67.59%
Avg Precision: 95.73%
Avg Recall: 95.99%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.16
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.73
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.59
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.79
   char_order: 6
   word_order: 0
   beta: 2


In [39]:
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):')
byt5_orig_pred2_5K, byt5_orig_gold2_5K = evaluate_T5(gold_data2, byt5_pred2_5k, 'ByT5 orig')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):
Baseline Accuracy: 94.83%
Accuracy: 98.53%
Error Reduction Rate: 71.51%
Avg Precision: 96.91%
Avg Recall: 97.54%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.54
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.85
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.57
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.85
   char_order: 6
   word_order: 0
   beta: 2


In [40]:
print('Nescio Titaantjes (ByT5 orig):') 
byt5_orig_pred3_5K, byt5_orig_gold3_5K = evaluate_T5(gold_data3, byt5_pred3_5k, 'ByT5 orig')

Nescio Titaantjes (ByT5 orig):
Baseline Accuracy: 96.28%
Accuracy: 99.32%
Error Reduction Rate: 81.74%
Avg Precision: 97.65%
Avg Recall: 98.12%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 99.07
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.43
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 99.32
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.43
   char_order: 6
   word_order: 0
   beta: 2


## 2. ByT5 original (not pretrained) 10K

In [41]:
print('Max Havelaar Multatuli (ByT5 orig):')
byt5_orig_pred1_10K, byt5_orig_gold1_10K = evaluate_T5(gold_data1, byt5_pred1_10k, 'ByT5 orig')

Max Havelaar Multatuli (ByT5 orig):
Baseline Accuracy: 96.04%
Accuracy: 98.72%
Error Reduction Rate: 67.59%
Avg Precision: 95.74%
Avg Recall: 95.99%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.22
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.8
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.6
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.8
   char_order: 6
   word_order: 0
   beta: 2


In [42]:
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):')
byt5_orig_pred2_10K, byt5_orig_gold2_10K = evaluate_T5(gold_data2, byt5_pred2_10k, 'ByT5 orig')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):
Baseline Accuracy: 94.83%
Accuracy: 98.54%
Error Reduction Rate: 71.71%
Avg Precision: 96.91%
Avg Recall: 97.55%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.56
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.87
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.59
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.87
   char_order: 6
   word_order: 0
   beta: 2


In [43]:
print('Nescio Titaantjes (ByT5 orig):') 
byt5_orig_pred3_10K, byt5_orig_gold3_10K = evaluate_T5(gold_data3, byt5_pred3_10k, 'ByT5 orig')

Nescio Titaantjes (ByT5 orig):
Baseline Accuracy: 96.28%
Accuracy: 99.35%
Error Reduction Rate: 82.65%
Avg Precision: 97.76%
Avg Recall: 98.31%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 99.1
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.45
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 99.35
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.45
   char_order: 6
   word_order: 0
   beta: 2


## 3. ByT5 pretrained with 2 million sentences from BERTje Books 5K

In [44]:
print('Max Havelaar Multatuli (ByT5 pretrained books):')
books_pred1_5k, books_gold1_5k = evaluate_T5(gold_data1, byt5_books_pred1_5k, 'ByT5 pretrained books')

Max Havelaar Multatuli (ByT5 pretrained books):
Baseline Accuracy: 96.04%
Accuracy: 98.70%
Error Reduction Rate: 67.09%
Avg Precision: 96.03%
Avg Recall: 96.28%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.22
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.79
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.6
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.8
   char_order: 6
   word_order: 0
   beta: 2


In [45]:
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):')
books_pred2_5k, books_gold2_5k = evaluate_T5(gold_data2, byt5_books_pred2_5k, 'ByT5 pretrained books')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):
Baseline Accuracy: 94.83%
Accuracy: 98.49%
Error Reduction Rate: 70.74%
Avg Precision: 96.78%
Avg Recall: 97.54%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.5
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.81
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.53
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.81
   char_order: 6
   word_order: 0
   beta: 2


In [46]:
print('Nescio Titaantjes (ByT5 pretrained books):') 
books_pred3_5k, books_gold3_5k = evaluate_T5(gold_data3, byt5_books_pred3_5k, 'ByT5 pretrained books')

Nescio Titaantjes (ByT5 pretrained books):
Baseline Accuracy: 96.28%
Accuracy: 99.40%
Error Reduction Rate: 83.79%
Avg Precision: 98.00%
Avg Recall: 98.47%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 99.15
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.49
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 99.41
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.51
   char_order: 6
   word_order: 0
   beta: 2


## 3. ByT5 pretrained with 2 million sentences from BERTje Books 10K

In [47]:
print('Max Havelaar Multatuli (ByT5 pretrained books):')
books_pred1_10k, books_gold1_10k = evaluate_T5(gold_data1, byt5_books_pred1_10k, 'ByT5 pretrained books')

Max Havelaar Multatuli (ByT5 pretrained books):
Baseline Accuracy: 96.04%
Accuracy: 98.87%
Error Reduction Rate: 71.39%
Avg Precision: 96.04%
Avg Recall: 96.39%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.37
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.92
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.74
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.92
   char_order: 6
   word_order: 0
   beta: 2


In [48]:
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):')
books_pred2_10k, books_gold2_10k = evaluate_T5(gold_data2, byt5_books_pred2_10k, 'ByT5 pretrained books')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):
Baseline Accuracy: 94.83%
Accuracy: 98.63%
Error Reduction Rate: 73.45%
Avg Precision: 97.25%
Avg Recall: 97.83%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.62
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.9
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.65
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.9
   char_order: 6
   word_order: 0
   beta: 2


In [49]:
print('Nescio Titaantjes (ByT5 pretrained books):') 
books_pred3_10k, books_gold3_10k = evaluate_T5(gold_data3, byt5_books_pred3_10k, 'ByT5 pretrained books')

Nescio Titaantjes (ByT5 pretrained books):
Baseline Accuracy: 96.28%
Accuracy: 99.44%
Error Reduction Rate: 84.93%
Avg Precision: 98.22%
Avg Recall: 98.57%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 99.14
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.47
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 99.44
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.54
   char_order: 6
   word_order: 0
   beta: 2


## 4. ByT5 pretrained with 2 million sentences from BERTje Sonar 5K

In [50]:
print('Max Havelaar Multatuli (ByT5 pretrained sonar):')
sonar_pred1_5k, sonar_gold1_5k = evaluate_T5(gold_data1, byt5_sonar_pred1_5k, 'ByT5 pretrained sonar')

Max Havelaar Multatuli (ByT5 pretrained sonar):
Baseline Accuracy: 96.04%
Accuracy: 98.70%
Error Reduction Rate: 67.09%
Avg Precision: 95.67%
Avg Recall: 95.92%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.19
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.77
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.56
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.77
   char_order: 6
   word_order: 0
   beta: 2


In [51]:
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):')
sonar_pred2_5k, sonar_gold2_5k = evaluate_T5(gold_data2, byt5_sonar_pred2_5k, 'ByT5 pretrained sonar')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):
Baseline Accuracy: 94.83%
Accuracy: 98.54%
Error Reduction Rate: 71.71%
Avg Precision: 96.94%
Avg Recall: 97.48%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.53
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.83
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.56
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.83
   char_order: 6
   word_order: 0
   beta: 2


In [52]:
print('Nescio Titaantjes (ByT5 pretrained sonar):') 
sonar_pred3_5k, sonar_gold3_5k = evaluate_T5(gold_data3, byt5_sonar_pred3_5k, 'ByT5 pretrained sonar')

Nescio Titaantjes (ByT5 pretrained sonar):
Baseline Accuracy: 96.28%
Accuracy: 99.39%
Error Reduction Rate: 83.56%
Avg Precision: 98.04%
Avg Recall: 98.30%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.97
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.25
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 99.21
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.24
   char_order: 6
   word_order: 0
   beta: 2


## 4. ByT5 pretrained with 2 million sentences from BERTje Sonar 10K

In [53]:
print('Max Havelaar Multatuli (ByT5 pretrained sonar):')
sonar_pred1_10k, sonar_gold1_10k = evaluate_T5(gold_data1, byt5_sonar_pred1_10k, 'ByT5 pretrained sonar')

Max Havelaar Multatuli (ByT5 pretrained sonar):
Baseline Accuracy: 96.04%
Accuracy: 98.79%
Error Reduction Rate: 69.37%
Avg Precision: 95.91%
Avg Recall: 96.20%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.13
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.82
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.67
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.86
   char_order: 6
   word_order: 0
   beta: 2


In [54]:
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):')
sonar_pred2_10k, sonar_gold2_10k = evaluate_T5(gold_data2, byt5_sonar_pred2_10k, 'ByT5 pretrained sonar')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):
Baseline Accuracy: 94.83%
Accuracy: 98.54%
Error Reduction Rate: 71.71%
Avg Precision: 96.86%
Avg Recall: 97.53%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.52
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.82
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.55
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.82
   char_order: 6
   word_order: 0
   beta: 2


In [55]:
print('Nescio Titaantjes (ByT5 pretrained sonar):') 
sonar_pred3_10k, sonar_gold3_10k = evaluate_T5(gold_data3, byt5_sonar_pred3_10k, 'ByT5 pretrained sonar')

Nescio Titaantjes (ByT5 pretrained sonar):
Baseline Accuracy: 96.28%
Accuracy: 99.45%
Error Reduction Rate: 85.16%
Avg Precision: 98.17%
Avg Recall: 98.59%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 99.17
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.5
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 99.43
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.52
   char_order: 6
   word_order: 0
   beta: 2


## 5. Rule-based 

In [56]:
print('Max Havelaar Multatuli (rule-based):')
_, silver_target1 = create_data(silver_data1)
silver_pred1, silver_gold1 = evaluate_rulebased(gold_data1, silver_target1, 'rule-based')

Max Havelaar Multatuli (rule-based):
Baseline Accuracy: 96.04%
Accuracy: 98.84%
Error Reduction Rate: 70.63%
Avg Precision: 96.02%
Avg Recall: 96.20%
ChrF scores with original predictions:
ChrF++: rule-based
   score: 98.68
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.86
   char_order: 6
   word_order: 0
   beta: 2
ChrF scores with aligned predictions:
ChrF++: rule-based
   score: 98.68
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.86
   char_order: 6
   word_order: 0
   beta: 2


In [57]:
print('Conan Doyle Sherlock Holmes De Agra Schat (rule-based):')
_, silver_target2 = create_data(silver_data2)
silver_pred2, silver_gold2 = evaluate_rulebased(gold_data2, silver_target2, 'rule-based')

Conan Doyle Sherlock Holmes De Agra Schat (rule-based):
Baseline Accuracy: 94.83%
Accuracy: 98.41%
Error Reduction Rate: 69.19%
Avg Precision: 96.52%
Avg Recall: 96.96%
ChrF scores with original predictions:
ChrF++: rule-based
   score: 98.5
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.82
   char_order: 6
   word_order: 0
   beta: 2
ChrF scores with aligned predictions:
ChrF++: rule-based
   score: 98.5
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.81
   char_order: 6
   word_order: 0
   beta: 2


In [58]:
print('Nescio Titaantjes (rule-based):') 
_, silver_target3 = create_data(silver_data3)
silver_pred3, silver_gold3 = evaluate_rulebased(gold_data3, silver_target3, 'rule-based')

Nescio Titaantjes (rule-based):
Baseline Accuracy: 96.28%
Accuracy: 98.60%
Error Reduction Rate: 62.41%
Avg Precision: 97.89%
Avg Recall: 98.27%
ChrF scores with original predictions:
ChrF++: rule-based
   score: 98.11
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.38
   char_order: 6
   word_order: 0
   beta: 2
ChrF scores with aligned predictions:
ChrF++: rule-based
   score: 98.1
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.38
   char_order: 6
   word_order: 0
   beta: 2


## Error Analysis

In [59]:
from collections import Counter

In [60]:
def get_misstakes(pred, gold):
    misstakes = []
    for x, y in zip(pred, gold):
        for x_word, y_word in zip(x, y):
            if x_word.strip() != y_word.strip():
                misstakes.append('Pred: ' + x_word + ' ' + 'Gold: ' + y_word)
    return Counter(misstakes).most_common(10), "Total misstakes: {}".format(len(misstakes))

## Rule-based top 10 spelling misstakes

In [61]:
get_misstakes(silver_pred1, silver_gold1)

([('Pred: zo -iets Gold: zoiets', 5),
  ('Pred: op-eens Gold: opeens', 4),
  ('Pred: korrespondentie Gold: correspondentie', 3),
  ('Pred: optemerken Gold: op te merken', 3),
  ('Pred: moeielijker Gold: moeilijker', 3),
  ('Pred: luî Gold: lui', 2),
  ('Pred: - Gold: –', 2),
  ('Pred: ouden Gold: oude', 2),
  ('Pred: konnexie Gold: connectie', 2),
  ('Pred: duitsch Gold: Duits', 2)],
 'Total misstakes: 116')

In [62]:
get_misstakes(books_pred1_10k, books_gold1_10k)

([('Pred: meisjen Gold: meisje', 5),
  ('Pred: op-eens Gold: opeens', 4),
  ('Pred: korrespondentie Gold: correspondentie', 3),
  ('Pred: optemerken Gold: op te merken', 3),
  ('Pred: moeielijker Gold: moeilijker', 3),
  ('Pred: luî Gold: lui', 2),
  ('Pred: konnexie Gold: connectie', 2),
  ('Pred: duitsche Gold: Duitse', 2),
  ('Pred: enigen Gold: enige', 2),
  ('Pred: solieden Gold: solide', 2)],
 'Total misstakes: 113')

In [63]:
get_misstakes(books_pred1_10k, books_gold1_10k)

([('Pred: meisjen Gold: meisje', 5),
  ('Pred: op-eens Gold: opeens', 4),
  ('Pred: korrespondentie Gold: correspondentie', 3),
  ('Pred: optemerken Gold: op te merken', 3),
  ('Pred: moeielijker Gold: moeilijker', 3),
  ('Pred: luî Gold: lui', 2),
  ('Pred: konnexie Gold: connectie', 2),
  ('Pred: duitsche Gold: Duitse', 2),
  ('Pred: enigen Gold: enige', 2),
  ('Pred: solieden Gold: solide', 2)],
 'Total misstakes: 113')

## Best ByT5 model (ByT5 pretrained books 10K) top 10 spelling misstakes

In [64]:
get_misstakes(books_pred1_10k, books_gold1_10k)

([('Pred: meisjen Gold: meisje', 5),
  ('Pred: op-eens Gold: opeens', 4),
  ('Pred: korrespondentie Gold: correspondentie', 3),
  ('Pred: optemerken Gold: op te merken', 3),
  ('Pred: moeielijker Gold: moeilijker', 3),
  ('Pred: luî Gold: lui', 2),
  ('Pred: konnexie Gold: connectie', 2),
  ('Pred: duitsche Gold: Duitse', 2),
  ('Pred: enigen Gold: enige', 2),
  ('Pred: solieden Gold: solide', 2)],
 'Total misstakes: 113')

In [65]:
get_misstakes(books_pred2_10k, books_gold2_10k)

([('Pred: zei Gold: zeide', 24),
  ('Pred: ene Gold: een', 13),
  ('Pred: enigen Gold: enige', 7),
  ('Pred: Neen Gold: Nee', 4),
  ('Pred: dezen Gold: deze', 4),
  ('Pred: bizonder Gold: bijzonder', 3),
  ('Pred: uwe Gold: uw', 3),
  ('Pred: uwen Gold: uw', 3),
  ('Pred: oudsten Gold: oudste', 3),
  ('Pred: gene Gold: geen', 3)],
 'Total misstakes: 137')

In [66]:
get_misstakes(books_pred3_10k, books_gold3_10k)

([('Pred: - Gold: de', 8),
  ('Pred: der Gold: van', 7),
  ('Pred: appelboomen Gold: appelbomen', 3),
  ('Pred: onzen Gold: onze', 2),
  ('Pred: Doch Gold: Maar', 2),
  ('Pred: verten Gold: verte', 2),
  ('Pred: anderen Gold: andere', 2),
  ('Pred: opaan Gold: op aan', 1),
  ('Pred: koeienoogen Gold: koeienogen', 1),
  ('Pred: der Gold: er', 1)],
 'Total misstakes: 66')