In [15]:
import re
from Levenshtein import distance
from tqdm import tqdm
import textwrap
import evaluate
import datasets
from difflib import SequenceMatcher
from difflib import Differ
from sklearn.metrics import precision_recall_fscore_support
import shutil
from evaluate_functions import *

In [16]:
from IPython.display import display, HTML
# Inject a CSS rule that sets elements with class "container" to 75% width.
# NOTE(review): presumably this targets the classic-notebook page container — confirm it still applies in JupyterLab.
display(HTML("<style>.container { width:75% !important; }</style>"))

## Gold data

In [3]:
# Gold data (human annotated): one list of raw lines per book.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('gold_data/Multatuli_MaxHavelaar_gold.txt', 'r', encoding='utf-8') as g1:
    gold_data1 = g1.readlines()

with open('gold_data/ConanDoyle_SherlockHolmesDeAgraSchat_gold.txt', 'r', encoding='utf-8') as g2:
    gold_data2 = g2.readlines()

with open('gold_data/Nescio_Titaantjes_gold.txt', 'r', encoding='utf-8') as g3:
    gold_data3 = g3.readlines()

## Rule-based predictions

In [78]:
# Silver data produced by the rule-based system: one list of raw lines per book.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('RuleBased_pred/Multatuli_MaxHavelaar_silver.txt', 'r', encoding='utf-8') as s1:
    silver_data1 = s1.readlines()

with open('RuleBased_pred/ConanDoyle_SherlockHolmesDeAgraSchat_silver.txt', 'r', encoding='utf-8') as s2:
    silver_data2 = s2.readlines()

with open('RuleBased_pred/Nescio_Titaantjes_silver.txt', 'r', encoding='utf-8') as s3:
    silver_data3 = s3.readlines()

## Predictions Flan-T5 for 5k and 10k train data

In [5]:
# Predictions from Flan-T5 (not pretrained), 5K training set: one list of raw lines per book.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('FlanT5_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as flan_p1_5k:
    flan_pred1_5k = flan_p1_5k.readlines()

with open('FlanT5_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as flan_p2_5k:
    flan_pred2_5k = flan_p2_5k.readlines()

with open('FlanT5_pred5k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as flan_p3_5k:
    flan_pred3_5k = flan_p3_5k.readlines()

In [6]:
# Predictions from Flan-T5 (not pretrained), 10K training set: one list of raw lines per book.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('FlanT5_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as flan_p1_10k:
    flan_pred1_10k = flan_p1_10k.readlines()

with open('FlanT5_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as flan_p2_10k:
    flan_pred2_10k = flan_p2_10k.readlines()

with open('FlanT5_pred10k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as flan_p3_10k:
    flan_pred3_10k = flan_p3_10k.readlines()

## Predictions ByT5 original (not pretrained) for 5k and 10k train data

In [7]:
# Predictions from ByT5 (not pretrained), 5K training set: one list of raw lines per book.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('ByT5_orig_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as byt5_p1_5k:
    byt5_pred1_5k = byt5_p1_5k.readlines()

with open('ByT5_orig_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as byt5_p2_5k:
    byt5_pred2_5k = byt5_p2_5k.readlines()

with open('ByT5_orig_pred5k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as byt5_p3_5k:
    byt5_pred3_5k = byt5_p3_5k.readlines()

In [8]:
# Predictions from ByT5 (not pretrained), 10K training set: one list of raw lines per book.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('ByT5_orig_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as byt5_p1_10k:
    byt5_pred1_10k = byt5_p1_10k.readlines()

with open('ByT5_orig_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as byt5_p2_10k:
    byt5_pred2_10k = byt5_p2_10k.readlines()

with open('ByT5_orig_pred10k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as byt5_p3_10k:
    byt5_pred3_10k = byt5_p3_10k.readlines()

## Predictions ByT5 pretrained Books for 5k and 10k train data

In [9]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Books, 5K training set.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('ByT5_pre_books_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as books1_5k:
    byt5_books_pred1_5k = books1_5k.readlines()

with open('ByT5_pre_books_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as books2_5k:
    byt5_books_pred2_5k = books2_5k.readlines()

with open('ByT5_pre_books_pred5k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as books3_5k:
    byt5_books_pred3_5k = books3_5k.readlines()

In [10]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Books, 10K training set.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('ByT5_pre_books_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as books1_10k:
    byt5_books_pred1_10k = books1_10k.readlines()

with open('ByT5_pre_books_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as books2_10k:
    byt5_books_pred2_10k = books2_10k.readlines()

with open('ByT5_pre_books_pred10k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as books3_10k:
    byt5_books_pred3_10k = books3_10k.readlines()

## Predictions ByT5 pretrained Sonar for 5k and 10k train data

In [11]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Sonar, 5K training set.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('ByT5_pre_sonar_pred5k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as sonar1_5k:
    byt5_sonar_pred1_5k = sonar1_5k.readlines()

with open('ByT5_pre_sonar_pred5k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as sonar2_5k:
    byt5_sonar_pred2_5k = sonar2_5k.readlines()

with open('ByT5_pre_sonar_pred5k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as sonar3_5k:
    byt5_sonar_pred3_5k = sonar3_5k.readlines()

In [12]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Sonar, 10K training set.
# The encoding is pinned so decoding does not depend on the platform's locale default
# (assumes the files are stored as UTF-8 — confirm).
with open('ByT5_pre_sonar_pred10k/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as sonar1_10k:
    byt5_sonar_pred1_10k = sonar1_10k.readlines()

with open('ByT5_pre_sonar_pred10k/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as sonar2_10k:
    byt5_sonar_pred2_10k = sonar2_10k.readlines()

with open('ByT5_pre_sonar_pred10k/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as sonar3_10k:
    byt5_sonar_pred3_10k = sonar3_10k.readlines()

## 1. Flan T5 (not pretrained) 5K

In [22]:
# Flan-T5, 5k training set, Max Havelaar: evaluate against the gold data; the return value is discarded.
print('Max Havelaar Multatuli (FlanT5):')
_ = evaluate_T5(gold_data1, flan_pred1_5k, 'Flan-T5')

Max Havelaar Multatuli (FlanT5):
Baseline Accuracy: 96.04%
Accuracy: 98.24%
Error Reduction Rate: 55.44%
Avg Precision: 94.19%
Avg Recall: 94.76%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 97.16
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 97.78
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 98.14
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.43
   char_order: 6
   word_order: 0
   beta: 2


In [23]:
# Flan-T5, 5k training set, De Agra Schat: evaluate against the gold data; the return value is discarded.
print('Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):')
_ = evaluate_T5(gold_data2, flan_pred2_5k, 'Flan-T5')

Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):
Baseline Accuracy: 94.87%
Accuracy: 97.91%
Error Reduction Rate: 59.18%
Avg Precision: 94.94%
Avg Recall: 96.00%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 97.89
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.31
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 98.01
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.42
   char_order: 6
   word_order: 0
   beta: 2


In [24]:
# Flan-T5, 5k training set, Titaantjes: evaluate against the gold data; the return value is discarded.
print('Nescio Titaantjes (FlanT5):') 
_ = evaluate_T5(gold_data3, flan_pred3_5k, 'Flan-T5')

Nescio Titaantjes (FlanT5):
Baseline Accuracy: 96.29%
Accuracy: 98.96%
Error Reduction Rate: 72.02%
Avg Precision: 96.36%
Avg Recall: 97.11%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 98.66
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.08
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 99.01
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.21
   char_order: 6
   word_order: 0
   beta: 2


## 1. Flan T5 (not pretrained) 10K

In [25]:
# Flan-T5, 10k training set, Max Havelaar (the printed label is the same as for the 5k run).
print('Max Havelaar Multatuli (FlanT5):')
_ = evaluate_T5(gold_data1, flan_pred1_10k, 'Flan-T5')

Max Havelaar Multatuli (FlanT5):
Baseline Accuracy: 96.04%
Accuracy: 98.48%
Error Reduction Rate: 61.52%
Avg Precision: 94.85%
Avg Recall: 95.21%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 97.96
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.58
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 98.34
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.58
   char_order: 6
   word_order: 0
   beta: 2


In [26]:
# Flan-T5, 10k training set, De Agra Schat (the printed label is the same as for the 5k run).
print('Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):')
_ = evaluate_T5(gold_data2, flan_pred2_10k, 'Flan-T5')

Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):
Baseline Accuracy: 94.87%
Accuracy: 98.08%
Error Reduction Rate: 62.50%
Avg Precision: 95.53%
Avg Recall: 96.32%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 98.06
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.46
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 98.16
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 98.54
   char_order: 6
   word_order: 0
   beta: 2


In [27]:
# Flan-T5, 10k training set, Titaantjes (the printed label is the same as for the 5k run).
print('Nescio Titaantjes (FlanT5):') 
_ = evaluate_T5(gold_data3, flan_pred3_10k, 'Flan-T5')

Nescio Titaantjes (FlanT5):
Baseline Accuracy: 96.29%
Accuracy: 99.16%
Error Reduction Rate: 77.29%
Avg Precision: 96.89%
Avg Recall: 97.46%


ChrF scores with original predictions:
ChrF++: Flan-T5
   score: 98.95
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.35
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: Flan-T5
   score: 99.19
   char_order: 6
   word_order: 2
   beta: 2
ChrF: Flan-T5
   score: 99.34
   char_order: 6
   word_order: 0
   beta: 2


## 2. ByT5 original (not pretrained) 5K

In [28]:
# ByT5 original, 5k training set, Max Havelaar: evaluate against the gold data; the return value is discarded.
print('Max Havelaar Multatuli (ByT5 orig):')
_ = evaluate_T5(gold_data1, byt5_pred1_5k, 'ByT5 orig')

Max Havelaar Multatuli (ByT5 orig):
Baseline Accuracy: 96.04%
Accuracy: 98.72%
Error Reduction Rate: 67.59%
Avg Precision: 95.73%
Avg Recall: 95.99%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.16
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.73
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.59
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.79
   char_order: 6
   word_order: 0
   beta: 2


In [29]:
# ByT5 original, 5k training set, De Agra Schat: evaluate against the gold data; the return value is discarded.
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):')
_ = evaluate_T5(gold_data2, byt5_pred2_5k, 'ByT5 orig')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):
Baseline Accuracy: 94.87%
Accuracy: 98.57%
Error Reduction Rate: 72.07%
Avg Precision: 96.93%
Avg Recall: 97.58%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.57
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.87
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.6
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.87
   char_order: 6
   word_order: 0
   beta: 2


In [30]:
# ByT5 original, 5k training set, Titaantjes: evaluate against the gold data; the return value is discarded.
print('Nescio Titaantjes (ByT5 orig):') 
_ = evaluate_T5(gold_data3, byt5_pred3_5k, 'ByT5 orig')

Nescio Titaantjes (ByT5 orig):
Baseline Accuracy: 96.29%
Accuracy: 99.34%
Error Reduction Rate: 82.11%
Avg Precision: 97.69%
Avg Recall: 98.13%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 99.09
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.45
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 99.34
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.45
   char_order: 6
   word_order: 0
   beta: 2


## 2. ByT5 original (not pretrained) 10K

In [31]:
# ByT5 original, 10k training set, Max Havelaar (the printed label is the same as for the 5k run).
print('Max Havelaar Multatuli (ByT5 orig):')
_ = evaluate_T5(gold_data1, byt5_pred1_10k, 'ByT5 orig')

Max Havelaar Multatuli (ByT5 orig):
Baseline Accuracy: 96.04%
Accuracy: 98.72%
Error Reduction Rate: 67.59%
Avg Precision: 95.74%
Avg Recall: 95.99%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.22
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.8
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.6
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.8
   char_order: 6
   word_order: 0
   beta: 2


In [32]:
# ByT5 original, 10k training set, De Agra Schat (the printed label is the same as for the 5k run).
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):')
_ = evaluate_T5(gold_data2, byt5_pred2_10k, 'ByT5 orig')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):
Baseline Accuracy: 94.87%
Accuracy: 98.58%
Error Reduction Rate: 72.27%
Avg Precision: 96.93%
Avg Recall: 97.59%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 98.59
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.89
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 98.62
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 98.89
   char_order: 6
   word_order: 0
   beta: 2


In [33]:
# ByT5 original, 10k training set, Titaantjes (the printed label is the same as for the 5k run).
print('Nescio Titaantjes (ByT5 orig):') 
_ = evaluate_T5(gold_data3, byt5_pred3_10k, 'ByT5 orig')

Nescio Titaantjes (ByT5 orig):
Baseline Accuracy: 96.29%
Accuracy: 99.37%
Error Reduction Rate: 83.03%
Avg Precision: 97.81%
Avg Recall: 98.31%


ChrF scores with original predictions:
ChrF++: ByT5 orig
   score: 99.12
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.47
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 orig
   score: 99.36
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 orig
   score: 99.47
   char_order: 6
   word_order: 0
   beta: 2


## 3. ByT5 pretrained with 2 million sentences from BERTje Books 5K

In [34]:
# ByT5 pretrained on Books, 5k training set, Max Havelaar: evaluate against the gold data; the return value is discarded.
print('Max Havelaar Multatuli (ByT5 pretrained books):')
_ = evaluate_T5(gold_data1, byt5_books_pred1_5k, 'ByT5 pretrained books')

Max Havelaar Multatuli (ByT5 pretrained books):
Baseline Accuracy: 96.04%
Accuracy: 98.70%
Error Reduction Rate: 67.09%
Avg Precision: 96.03%
Avg Recall: 96.28%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.22
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.79
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.6
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.8
   char_order: 6
   word_order: 0
   beta: 2


In [35]:
# ByT5 pretrained on Books, 5k training set, De Agra Schat: evaluate against the gold data; the return value is discarded.
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):')
_ = evaluate_T5(gold_data2, byt5_books_pred2_5k, 'ByT5 pretrained books')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):
Baseline Accuracy: 94.87%
Accuracy: 98.53%
Error Reduction Rate: 71.29%
Avg Precision: 96.81%
Avg Recall: 97.58%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.53
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.83
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.55
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.83
   char_order: 6
   word_order: 0
   beta: 2


In [36]:
# ByT5 pretrained on Books, 5k training set, Titaantjes: evaluate against the gold data; the return value is discarded.
print('Nescio Titaantjes (ByT5 pretrained books):') 
_ = evaluate_T5(gold_data3, byt5_books_pred3_5k, 'ByT5 pretrained books')

Nescio Titaantjes (ByT5 pretrained books):
Baseline Accuracy: 96.29%
Accuracy: 99.41%
Error Reduction Rate: 84.17%
Avg Precision: 98.04%
Avg Recall: 98.47%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 99.16
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.51
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 99.43
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.53
   char_order: 6
   word_order: 0
   beta: 2


## 3. ByT5 pretrained with 2 million sentences from BERTje Books 10K

In [37]:
# ByT5 pretrained on Books, 10k training set, Max Havelaar.
# The three returned values are kept: books_pred1_10k and books_gold1_10k are reused in the
# error-analysis cells further down (judging by the names: predictions, gold and raw input — TODO confirm).
print('Max Havelaar Multatuli (ByT5 pretrained books):')
books_pred1_10k, books_gold1_10k, books_raw1_10k = evaluate_T5(gold_data1, byt5_books_pred1_10k, 'ByT5 pretrained books')

Max Havelaar Multatuli (ByT5 pretrained books):
Baseline Accuracy: 96.04%
Accuracy: 98.87%
Error Reduction Rate: 71.39%
Avg Precision: 96.04%
Avg Recall: 96.39%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.37
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.92
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.74
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.92
   char_order: 6
   word_order: 0
   beta: 2


In [38]:
# ByT5 pretrained on Books, 10k training set, De Agra Schat.
# The three returned values are kept: books_pred2_10k and books_gold2_10k are reused in the
# error-analysis cells further down (judging by the names: predictions, gold and raw input — TODO confirm).
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):')
books_pred2_10k, books_gold2_10k, books_raw2_10k = evaluate_T5(gold_data2, byt5_books_pred2_10k, 'ByT5 pretrained books')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):
Baseline Accuracy: 94.87%
Accuracy: 98.67%
Error Reduction Rate: 74.02%
Avg Precision: 97.28%
Avg Recall: 97.87%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 98.65
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.92
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 98.67
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 98.92
   char_order: 6
   word_order: 0
   beta: 2


In [39]:
# ByT5 pretrained on Books, 10k training set, Titaantjes (the printed label is the same as for the 5k run).
print('Nescio Titaantjes (ByT5 pretrained books):') 
_ = evaluate_T5(gold_data3, byt5_books_pred3_10k, 'ByT5 pretrained books')

Nescio Titaantjes (ByT5 pretrained books):
Baseline Accuracy: 96.29%
Accuracy: 99.46%
Error Reduction Rate: 85.32%
Avg Precision: 98.26%
Avg Recall: 98.57%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained books
   score: 99.16
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.49
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained books
   score: 99.46
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained books
   score: 99.56
   char_order: 6
   word_order: 0
   beta: 2


## 4. ByT5 pretrained with 2 million sentences from BERTje Sonar 5K

In [40]:
# ByT5 pretrained on Sonar, 5k training set, Max Havelaar: evaluate against the gold data; the return value is discarded.
print('Max Havelaar Multatuli (ByT5 pretrained sonar):')
_ = evaluate_T5(gold_data1, byt5_sonar_pred1_5k, 'ByT5 pretrained sonar')

Max Havelaar Multatuli (ByT5 pretrained sonar):
Baseline Accuracy: 96.04%
Accuracy: 98.70%
Error Reduction Rate: 67.09%
Avg Precision: 95.67%
Avg Recall: 95.92%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.19
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.77
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.56
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.77
   char_order: 6
   word_order: 0
   beta: 2


In [41]:
# ByT5 pretrained on Sonar, 5k training set, De Agra Schat: evaluate against the gold data; the return value is discarded.
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):')
_ = evaluate_T5(gold_data2, byt5_sonar_pred2_5k, 'ByT5 pretrained sonar')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):
Baseline Accuracy: 94.87%
Accuracy: 98.58%
Error Reduction Rate: 72.27%
Avg Precision: 96.97%
Avg Recall: 97.52%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.56
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.85
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.58
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.85
   char_order: 6
   word_order: 0
   beta: 2


In [42]:
# ByT5 pretrained on Sonar, 5k training set, Titaantjes: evaluate against the gold data; the return value is discarded.
print('Nescio Titaantjes (ByT5 pretrained sonar):') 
_ = evaluate_T5(gold_data3, byt5_sonar_pred3_5k, 'ByT5 pretrained sonar')

Nescio Titaantjes (ByT5 pretrained sonar):
Baseline Accuracy: 96.29%
Accuracy: 99.40%
Error Reduction Rate: 83.94%
Avg Precision: 98.08%
Avg Recall: 98.30%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.99
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.26
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 99.23
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.26
   char_order: 6
   word_order: 0
   beta: 2


## 4. ByT5 pretrained with 2 million sentences from BERTje Sonar 10K

In [43]:
# ByT5 pretrained on Sonar, 10k training set, Max Havelaar (the printed label is the same as for the 5k run).
print('Max Havelaar Multatuli (ByT5 pretrained sonar):')
_ = evaluate_T5(gold_data1, byt5_sonar_pred1_10k, 'ByT5 pretrained sonar')

Max Havelaar Multatuli (ByT5 pretrained sonar):
Baseline Accuracy: 96.04%
Accuracy: 98.79%
Error Reduction Rate: 69.37%
Avg Precision: 95.91%
Avg Recall: 96.20%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.13
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.82
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.67
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.86
   char_order: 6
   word_order: 0
   beta: 2


In [44]:
# ByT5 pretrained on Sonar, 10k training set, De Agra Schat (the printed label is the same as for the 5k run).
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):')
_ = evaluate_T5(gold_data2, byt5_sonar_pred2_10k, 'ByT5 pretrained sonar')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):
Baseline Accuracy: 94.87%
Accuracy: 98.58%
Error Reduction Rate: 72.27%
Avg Precision: 96.89%
Avg Recall: 97.56%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.55
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.84
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 98.58
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 98.84
   char_order: 6
   word_order: 0
   beta: 2


In [45]:
# ByT5 pretrained on Sonar, 10k training set, Titaantjes.
# The three returned values are kept: they feed the "Combining RB + ByT5" cell and the
# error-analysis cells further down (judging by the names: predictions, gold and raw input — TODO confirm).
print('Nescio Titaantjes (ByT5 pretrained sonar):') 
sonar_pred3_10k, sonar_gold3_10k, sonar_raw3_10k = evaluate_T5(gold_data3, byt5_sonar_pred3_10k, 'ByT5 pretrained sonar')

Nescio Titaantjes (ByT5 pretrained sonar):
Baseline Accuracy: 96.29%
Accuracy: 99.46%
Error Reduction Rate: 85.55%
Avg Precision: 98.21%
Avg Recall: 98.59%


ChrF scores with original predictions:
ChrF++: ByT5 pretrained sonar
   score: 99.19
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.52
   char_order: 6
   word_order: 0
   beta: 2


ChrF scores with aligned predictions:
ChrF++: ByT5 pretrained sonar
   score: 99.45
   char_order: 6
   word_order: 2
   beta: 2
ChrF: ByT5 pretrained sonar
   score: 99.54
   char_order: 6
   word_order: 0
   beta: 2


## 5. Rule-based 

In [46]:
# Rule-based system, Max Havelaar.
print('Max Havelaar Multatuli (rule-based):')
# create_data returns a pair; only its second element is used as the rule-based output here.
_, silver_target1 = create_data(silver_data1)
# The returned values are kept: silver_pred1 and silver_gold1 are reused in the error-analysis cells below.
silver_pred1, silver_gold1, silver_raw1 = evaluate_rulebased(gold_data1, silver_target1, 'rule-based')

Max Havelaar Multatuli (rule-based):
Baseline Accuracy: 96.04%
Accuracy: 98.84%
Error Reduction Rate: 70.63%
Avg Precision: 96.02%
Avg Recall: 96.20%
ChrF scores with original predictions:
ChrF++: rule-based
   score: 98.68
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.86
   char_order: 6
   word_order: 0
   beta: 2
ChrF scores with aligned predictions:
ChrF++: rule-based
   score: 98.68
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.86
   char_order: 6
   word_order: 0
   beta: 2


In [47]:
# Rule-based system, De Agra Schat.
print('Conan Doyle Sherlock Holmes De Agra Schat (rule-based):')
# create_data returns a pair; only its second element is used as the rule-based output here.
_, silver_target2 = create_data(silver_data2)
# The returned values are kept: silver_pred2 and silver_gold2 are reused in the error-analysis cells below.
silver_pred2, silver_gold2, silver_raw2 = evaluate_rulebased(gold_data2, silver_target2, 'rule-based')

Conan Doyle Sherlock Holmes De Agra Schat (rule-based):
Baseline Accuracy: 94.87%
Accuracy: 98.45%
Error Reduction Rate: 69.73%
Avg Precision: 96.54%
Avg Recall: 97.00%
ChrF scores with original predictions:
ChrF++: rule-based
   score: 98.53
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.84
   char_order: 6
   word_order: 0
   beta: 2
ChrF scores with aligned predictions:
ChrF++: rule-based
   score: 98.52
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.83
   char_order: 6
   word_order: 0
   beta: 2


In [48]:
# Rule-based system, Titaantjes.
print('Nescio Titaantjes (rule-based):') 
# create_data returns a pair; only its second element is used as the rule-based output here.
_, silver_target3 = create_data(silver_data3)
# The returned values are kept: they feed the "Combining RB + ByT5" cell and the error-analysis cells below.
silver_pred3, silver_gold3, silver_raw3 = evaluate_rulebased(gold_data3, silver_target3, 'rule-based')

Nescio Titaantjes (rule-based):
Baseline Accuracy: 96.30%
Accuracy: 98.62%
Error Reduction Rate: 62.70%
Avg Precision: 97.93%
Avg Recall: 98.27%
ChrF scores with original predictions:
ChrF++: rule-based
   score: 98.12
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.4
   char_order: 6
   word_order: 0
   beta: 2
ChrF scores with aligned predictions:
ChrF++: rule-based
   score: 98.12
   char_order: 6
   word_order: 2
   beta: 2
ChrF: rule-based
   score: 98.39
   char_order: 6
   word_order: 0
   beta: 2


## Combining RB + ByT5

In [76]:
# Sentence-by-sentence merge of the rule-based output and the ByT5 (Sonar, 10k) output for book 3.
# Results are collected in pred_com / gold_com / raw_com, one entry per sentence.
pred_com, gold_com, raw_com = [], [], []
rule_based_sentences = zip(silver_pred3, silver_gold3, silver_raw3)
byt5_sentences = zip(sonar_pred3_10k, sonar_gold3_10k, sonar_raw3_10k)
for (rb, rb_gold, rb_raw), (by, by_gold, by_raw) in zip(rule_based_sentences, byt5_sentences):
    if len(rb) != len(by):
        # Different token counts: the systems cannot be compared position by position,
        # so the rule-based sentence is taken over unchanged.
        pred_com.append(rb)
        gold_com.append(rb_gold)
        raw_com.append(rb_raw)
        continue
    # Walk all six sequences in lockstep (stopping at the shortest one).
    # Whether or not the two systems agree on a token, the rule-based
    # prediction / gold / raw triple is the one that is kept.
    token_rows = list(zip(rb, rb_gold, rb_raw, by, by_gold, by_raw))
    pred_com.append([row[0] for row in token_rows])
    gold_com.append([row[1] for row in token_rows])
    raw_com.append([row[2] for row in token_rows])

### Favoring ByT5

In [67]:
# Multatuli (favoring ByT5 for disagreement)
# NOTE(review): raw_com / gold_com / pred_com hold whatever the "Combining RB + ByT5" cell last produced.
# As saved, that cell uses the book-3 variables and keeps the rule-based token in both branches, so
# reproducing this output presumably requires editing and re-running it for this book/mode — verify.
print(cal_err(raw_com, gold_com, pred_com))

Baseline Accuracy: 96.04%
Accuracy: 98.86%
Error Reduction Rate: 71.14%


In [69]:
# Sherlock Holmes (favoring ByT5 for disagreement)
# NOTE(review): raw_com / gold_com / pred_com hold whatever the "Combining RB + ByT5" cell last produced.
# As saved, that cell uses the book-3 variables and keeps the rule-based token in both branches, so
# reproducing this output presumably requires editing and re-running it for this book/mode — verify.
print(cal_err(raw_com, gold_com, pred_com))

Baseline Accuracy: 94.87%
Accuracy: 98.65%
Error Reduction Rate: 73.63%


In [71]:
# Titaantjes (favoring ByT5 for disagreement)
# NOTE(review): raw_com / gold_com / pred_com hold whatever the "Combining RB + ByT5" cell last produced.
# As saved, that cell keeps the rule-based token in both branches, so reproducing this output
# presumably requires editing and re-running it to take the ByT5 token on disagreement — verify.
print(cal_err(raw_com, gold_com, pred_com))

Baseline Accuracy: 96.30%
Accuracy: 99.42%
Error Reduction Rate: 84.44%


### Favoring Rule-based

In [73]:
# Multatuli (favoring RB for disagreement)
# NOTE(review): raw_com / gold_com / pred_com hold whatever the "Combining RB + ByT5" cell last produced.
# As saved, that cell uses the book-3 variables, so reproducing this output presumably requires
# re-running it with the book-1 variables first — verify.
print(cal_err(raw_com, gold_com, pred_com))

Baseline Accuracy: 96.04%
Accuracy: 98.84%
Error Reduction Rate: 70.63%


In [75]:
# Sherlock Holmes (favoring RB for disagreement)
# NOTE(review): raw_com / gold_com / pred_com hold whatever the "Combining RB + ByT5" cell last produced.
# As saved, that cell uses the book-3 variables, so reproducing this output presumably requires
# re-running it with the book-2 variables first — verify.
print(cal_err(raw_com, gold_com, pred_com))

Baseline Accuracy: 94.87%
Accuracy: 98.45%
Error Reduction Rate: 69.73%


In [77]:
# Titaantjes (favoring RB for disagreement)
# This is the configuration the saved "Combining RB + ByT5" cell builds:
# book-3 variables, rule-based token kept in both branches.
print(cal_err(raw_com, gold_com, pred_com))

Baseline Accuracy: 96.30%
Accuracy: 98.62%
Error Reduction Rate: 62.70%


## Error Analysis

In [171]:
from collections import Counter

In [215]:
def get_mistakes(pred, gold, top_n=12):
    """Count the token pairs on which a prediction differs from the gold standard.

    Args:
        pred: iterable of sentences, each an iterable of predicted token strings.
        gold: iterable of sentences, each an iterable of gold token strings,
            paired position by position with ``pred`` (extra items on either
            side are ignored, as with ``zip``).
        top_n: how many of the most frequent mismatches to report (default 12).

    Returns:
        A tuple ``(most_common, summary)``: ``most_common`` is a list of
        ``('Pred: <pred> Gold: <gold>', count)`` pairs ordered by descending
        count, and ``summary`` is the string ``'Total misstakes: <n>'`` with
        the total number of mismatching token pairs.
    """
    mistakes = Counter()
    for pred_sentence, gold_sentence in zip(pred, gold):
        for pred_word, gold_word in zip(pred_sentence, gold_sentence):
            # Surrounding whitespace is ignored for the comparison, but the
            # unstripped tokens are what ends up in the report.
            if pred_word.strip() != gold_word.strip():
                mistakes['Pred: ' + pred_word + ' ' + 'Gold: ' + gold_word] += 1
    # The summary text is kept verbatim so new output matches the saved runs.
    return mistakes.most_common(top_n), "Total misstakes: {}".format(sum(mistakes.values()))


# The cells below call the function under this spelling; without the alias
# they raise NameError on a fresh kernel.
get_misstakes = get_mistakes

### Rule-based: 12 most frequent spelling mistakes

In [223]:
# Rule-based system, book 1 (call uses the name the function is actually defined under).
get_mistakes(silver_pred1, silver_gold1)

([('Pred: zo -iets Gold: zoiets', 5),
  ('Pred: op-eens Gold: opeens', 4),
  ('Pred: korrespondentie Gold: correspondentie', 3),
  ('Pred: optemerken Gold: op te merken', 3),
  ('Pred: moeielijker Gold: moeilijker', 3),
  ('Pred: luî Gold: lui', 2),
  ('Pred: - Gold: –', 2),
  ('Pred: ouden Gold: oude', 2),
  ('Pred: konnexie Gold: connectie', 2),
  ('Pred: duitsch Gold: Duits', 2),
  ('Pred: duitsche Gold: Duitse', 2),
  ('Pred: enigen Gold: enige', 2)],
 'Total misstakes: 116')

In [191]:
# Rule-based system, book 2 (call uses the name the function is actually defined under).
get_mistakes(silver_pred2, silver_gold2)

([('Pred: zei Gold: zeide', 24),
  ('Pred: ene Gold: een', 13),
  ('Pred: enigen Gold: enige', 7),
  ('Pred: dezen Gold: deze', 4),
  ('Pred: uwe Gold: uw', 3),
  ('Pred: uwen Gold: uw', 3),
  ('Pred: gene Gold: geen', 3),
  ('Pred: haren Gold: haar', 3),
  ('Pred: onzen Gold: onze', 3),
  ('Pred: te zamen Gold: tezamen', 2),
  ('Pred: zoëven Gold: zo-even', 2),
  ('Pred: Bij voorbeeld Gold: Bijvoorbeeld', 2)],
 'Total misstakes: 155')

In [192]:
# Rule-based system, book 3 (call uses the name the function is actually defined under).
get_mistakes(silver_pred3, silver_gold3)

([('Pred: hij Gold: -ie', 93),
  ('Pred: - Gold: de', 8),
  ('Pred: der Gold: van', 7),
  ('Pred: onzen Gold: onze', 2),
  ('Pred: zooiets Gold: zoiets', 2),
  ('Pred: verten Gold: verte', 2),
  ('Pred: anderen Gold: andere', 2),
  ('Pred: opaan Gold: op aan', 1),
  ('Pred: koeienoogen Gold: koeienogen', 1),
  ('Pred: der Gold: er', 1),
  ('Pred: effe Gold: effen', 1),
  ('Pred: geprakkizeerd Gold: geprakkiseerd', 1)],
 'Total misstakes: 163')

### Best ByT5 models (ByT5 pretrained on Books and on Sonar): 12 most frequent spelling mistakes

In [225]:
# ByT5 pretrained on Books (10k), book 1 (call uses the name the function is actually defined under).
get_mistakes(books_pred1_10k, books_gold1_10k)

([('Pred: meisjen Gold: meisje', 5),
  ('Pred: op-eens Gold: opeens', 4),
  ('Pred: korrespondentie Gold: correspondentie', 3),
  ('Pred: optemerken Gold: op te merken', 3),
  ('Pred: moeielijker Gold: moeilijker', 3),
  ('Pred: luî Gold: lui', 2),
  ('Pred: konnexie Gold: connectie', 2),
  ('Pred: duitsche Gold: Duitse', 2),
  ('Pred: enigen Gold: enige', 2),
  ('Pred: solieden Gold: solide', 2),
  ("Pred: m'nheer Gold: mijnheer", 2),
  ('Pred: ter-zijde Gold: terzijde', 2)],
 'Total misstakes: 113')

In [177]:
# ByT5 pretrained on Books (10k), book 2 (call uses the name the function is actually defined under).
get_mistakes(books_pred2_10k, books_gold2_10k)

([('Pred: zei Gold: zeide', 24),
  ('Pred: ene Gold: een', 13),
  ('Pred: enigen Gold: enige', 7),
  ('Pred: dezen Gold: deze', 4),
  ('Pred: bizonder Gold: bijzonder', 3),
  ('Pred: uwe Gold: uw', 3),
  ('Pred: uwen Gold: uw', 3),
  ('Pred: oudsten Gold: oudste', 3),
  ('Pred: gene Gold: geen', 3),
  ('Pred: onzen Gold: onze', 3),
  ('Pred: zoëven Gold: zo-even', 2),
  ('Pred: ongelukkigen Gold: ongelukkige', 2)],
 'Total misstakes: 133')

In [197]:
# ByT5 pretrained on Sonar (10k), book 3 (call uses the name the function is actually defined under).
get_mistakes(sonar_pred3_10k, sonar_gold3_10k)

([('Pred: - Gold: de', 8),
  ('Pred: der Gold: van', 7),
  ('Pred: onzen Gold: onze', 2),
  ('Pred: verten Gold: verte', 2),
  ('Pred: anderen Gold: andere', 2),
  ('Pred: opaan Gold: op aan', 1),
  ('Pred: der Gold: er', 1),
  ('Pred: effe Gold: effen', 1),
  ('Pred: geprakkizeerd Gold: geprakkiseerd', 1),
  ('Pred: metdertijd Gold: mettertijd', 1),
  ('Pred: bizonder Gold: bijzonder', 1),
  ('Pred: ruggemerg Gold: ruggenmerg', 1)],
 'Total misstakes: 63')

## Extended Analysis

In [264]:
# Print every book-1 sentence on which the rule-based and ByT5 (Books, 10k) outputs differ,
# prefixed with its 1-based position.
# NOTE(review): FlanT5_prediction1 is not assigned in any cell visible here — presumably left over
# from an earlier kernel session; confirm where it comes from before a Restart & Run All.
original1, _ = create_data(gold_data1)
aligned_sentences = zip(original1, silver_gold1, silver_pred1, books_pred1_10k, FlanT5_prediction1)
for sent_no, (orig1, gold1, rule1, byt5_1, flant5_1) in enumerate(aligned_sentences, start=1):
    rule_text = ' '.join(rule1)
    byt5_text = ' '.join(byt5_1)
    if rule_text == byt5_text:
        continue
    print(sent_no, 'ORIG:', ''.join(orig1))
    print(sent_no, 'GOLD:', ' '.join(gold1))
    print(sent_no, 'RULE:', rule_text)
    print(sent_no, 'FLANT5:', ' '.join(flant5_1))
    print(sent_no, 'BYT5:', byt5_text)
    print('\n')

3 ORIG: Het is mijn gewoonte niet , romans te schrijven , of zulke dingen , en het heeft dan ook lang geduurd , voor ik er toe overging een paar riem papier extra te bestellen , en het werk aantevangen , dat gij , lieve lezer , zoo-even in de hand hebt genomen , en dat ge lezen moet als ge makelaar in koffie zijt , of als ge wat anders zijt .
3 GOLD: Het is mijn gewoonte niet , romans te schrijven , of zulke dingen , en het heeft dan ook lang geduurd , voor ik er toe overging een paar riem papier extra te bestellen , en het werk aantevangen , dat gij , lieve lezer , zo-even in de hand hebt genomen , en dat ge lezen moet als ge makelaar in koffie zijt , of als ge wat anders zijt .
3 RULE: Het is mijn gewoonte niet , romans te schrijven , of zulke dingen , en het heeft dan ook lang geduurd , voor ik er toe overging een paar riem papier extra te bestellen , en het werk aantevangen , dat gij , lieve lezer , zoo-even in de hand hebt genomen , en dat ge lezen moet als ge makelaar in koffie z

In [265]:
# Print every book-2 sentence on which the rule-based and ByT5 (Books, 10k) outputs differ,
# prefixed with its 1-based position.
# NOTE(review): FlanT5_prediction2 is not assigned in any cell visible here — presumably left over
# from an earlier kernel session; confirm where it comes from before a Restart & Run All.
original2, _ = create_data(gold_data2)
aligned_sentences = zip(original2, silver_gold2, silver_pred2, books_pred2_10k, FlanT5_prediction2)
for sent_no, (orig2, gold2, rule2, byt5_2, flant5_2) in enumerate(aligned_sentences, start=1):
    rule_text = ' '.join(rule2)
    byt5_text = ' '.join(byt5_2)
    if rule_text == byt5_text:
        continue
    print(sent_no, 'ORIG:', ''.join(orig2))
    print(sent_no, 'GOLD:', ' '.join(gold2))
    print(sent_no, 'RULE:', rule_text)
    print(sent_no, 'FLANT5:', ' '.join(flant5_2))
    print(sent_no, 'BYT5:', byt5_text)
    print('\n')

3 ORIG: Sherlock Holmes nam zijn flesch van den schoorsteenmantel en zijn werktuig voor onderhuidsche inspuitingen uit zijn marokijnen foudraal .
3 GOLD: Sherlock Holmes nam zijn fles van de schoorsteenmantel en zijn werktuig voor onderhuidse inspuitingen uit zijn marokijnen foudraal .
3 RULE: Sherlock Holmes nam zijn fles van de schoorsteenmantel en zijn werktuig voor onderhuidsche inspuitingen uit zijn marokijnen foudraal .
3 FLANT5: Sherlock Holmes nam zijn fles van de schoorsteenmantel en zijn werktuig voor onderhuidsche inspuitingen uit zijn marokijnen foudraal .
3 BYT5: Sherlock Holmes nam zijn fles van de schoorsteenmantel en zijn werktuig voor onderhuidse inspuitingen uit zijn marokijnen foudraal .


8 ORIG: Integendeel , van dag tot dag stond mij het gezicht ervan meer tegen , en elken avond verweet ik mij mijn gebrek aan moed , om mij ertegen te verzetten .
8 GOLD: Integendeel , van dag tot dag stond mij het gezicht ervan meer tegen , en elke avond verweet ik mij mijn gebrek 

In [266]:
# Print every book-3 sentence on which the rule-based and ByT5 (Sonar, 10k) outputs differ,
# prefixed with its 1-based position.
# NOTE(review): FlanT5_prediction3 is not assigned in any cell visible here — presumably left over
# from an earlier kernel session; confirm where it comes from before a Restart & Run All.
original3, _ = create_data(gold_data3)
aligned_sentences = zip(original3, silver_gold3, silver_pred3, sonar_pred3_10k, FlanT5_prediction3)
for sent_no, (orig3, gold3, rule3, byt5_3, flant5_3) in enumerate(aligned_sentences, start=1):
    rule_text = ' '.join(rule3)
    byt5_text = ' '.join(byt5_3)
    if rule_text == byt5_text:
        continue
    print(sent_no, 'ORIG:', ''.join(orig3))
    print(sent_no, 'GOLD:', ' '.join(gold3))
    print(sent_no, 'RULE:', rule_text)
    print(sent_no, 'FLANT5:', ' '.join(flant5_3))
    print(sent_no, 'BYT5:', byt5_text)
    print('\n')

19 ORIG: En Kees Ploeger praat van die rare kerels die 'm op den slechten weg brachten .
19 GOLD: En Kees Ploeger praat van die rare kerels die 'm op de slechte weg brachten .
19 RULE: En Kees Ploeger praat van die rare kerels die 'm op de slechte weg brachten .
19 FLANT5: En Kees Ploeger praat van die rare kerels die'm op de slechten weg brachten .
19 BYT5: En Kees Ploeger praat van die rare kerels die'm op de slechte weg brachten .


25 ORIG: Bekker had een vaag besef dat -ie alle kantoren wilde afbreken , Ploeger wilde zijn baas z'n eigen klokken laten inpakken en er bij gaan staan met een sigaar in z'n hoofd en vloeken op die kerels die nooit iets goed konden doen .
25 GOLD: Bekker had een vaag besef dat -ie alle kantoren wilde afbreken , Ploeger wilde zijn baas z'n eigen klokken laten inpakken en er bij gaan staan met een sigaar in z'n hoofd en vloeken op die kerels die nooit iets goed konden doen .
25 RULE: Bekker had een vaag besef dat hij alle kantoren wilde afbreken , Ploeger 