In [11]:
import re
from Levenshtein import distance
from tqdm import tqdm
import textwrap
import evaluate
import datasets
from difflib import SequenceMatcher
from difflib import Differ
from sklearn.metrics import precision_recall_fscore_support
import shutil
from evaluate_functions import *

In [3]:
from IPython.display import display, HTML
# Cosmetic only: inject a CSS rule that sets elements with class `container`
# to 75% width. It has no effect on any of the scores computed below.
display(HTML("<style>.container { width:75% !important; }</style>"))

In [4]:
# Silver data: output of the rule-based system, kept as raw lines per novel.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which would decode the Dutch text differently per machine.
# NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open('silver_data/Multatuli_MaxHavelaar_silver.txt', 'r', encoding='utf-8') as s1:
    silver_data1 = s1.readlines()

with open('silver_data/ConanDoyle_SherlockHolmesDeAgraSchat_silver.txt', 'r', encoding='utf-8') as s2:
    silver_data2 = s2.readlines()

with open('silver_data/Nescio_Titaantjes_silver.txt', 'r', encoding='utf-8') as s3:
    silver_data3 = s3.readlines()

In [5]:
# Gold data (human annotated), kept as raw lines per novel.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which would decode the Dutch text differently per machine.
# NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open('gold_data/Multatuli_MaxHavelaar_gold.txt', 'r', encoding='utf-8') as g1:
    gold_data1 = g1.readlines()

with open('gold_data/ConanDoyle_SherlockHolmesDeAgraSchat_gold.txt', 'r', encoding='utf-8') as g2:
    gold_data2 = g2.readlines()

with open('gold_data/Nescio_Titaantjes_gold.txt', 'r', encoding='utf-8') as g3:
    gold_data3 = g3.readlines()

In [6]:
# Predictions from Flan-T5 (not pretrained), kept as raw lines per novel.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which would decode the Dutch text differently per machine.
# NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open('FlanT5_pred/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as flan_p1:
    flan_pred1 = flan_p1.readlines()

with open('FlanT5_pred/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as flan_p2:
    flan_pred2 = flan_p2.readlines()

with open('FlanT5_pred/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as flan_p3:
    flan_pred3 = flan_p3.readlines()

In [7]:
# Predictions from ByT5 (not pretrained), kept as raw lines per novel.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which would decode the Dutch text differently per machine.
# NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open('ByT5_orig_predV2/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as byt5_p1:
    byt5_pred1 = byt5_p1.readlines()

with open('ByT5_orig_predV2/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as byt5_p2:
    byt5_pred2 = byt5_p2.readlines()

with open('ByT5_orig_predV2/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as byt5_p3:
    byt5_pred3 = byt5_p3.readlines()

In [8]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Books,
# kept as raw lines per novel.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which would decode the Dutch text differently per machine.
# NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open('ByT5_pre_books_predV4/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as books1:
    byt5_books_pred1 = books1.readlines()

with open('ByT5_pre_books_predV4/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as books2:
    byt5_books_pred2 = books2.readlines()

with open('ByT5_pre_books_predV4/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as books3:
    byt5_books_pred3 = books3.readlines()

In [9]:
# Predictions from ByT5 pretrained with 2 million sentences from BERTje Sonar,
# kept as raw lines per novel.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform locale, which would decode the Dutch text differently per machine.
# NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open('ByT5_pre_sonar_predV2/Multatuli_MaxHavelaar_pred.txt', 'r', encoding='utf-8') as sonar1:
    byt5_sonar_pred1 = sonar1.readlines()

with open('ByT5_pre_sonar_predV2/ConanDoyle_SherlockHolmesDeAgraSchat_pred.txt', 'r', encoding='utf-8') as sonar2:
    byt5_sonar_pred2 = sonar2.readlines()

with open('ByT5_pre_sonar_predV2/Nescio_Titaantjes_pred.txt', 'r', encoding='utf-8') as sonar3:
    byt5_sonar_pred3 = sonar3.readlines()

In [None]:
'''
Accuracy ERR scores:
1. FlanT5 
2. ByT5 original
3. ByT5 pretrained Books 
4. ByT5 pretrained Sonar 
5. Rule-based

Best results for each novel:

1. Max Havelaar Multatuli: 5. Rule-Based (Acc ERR: 68.35%)
2. Conan Doyle Sherlock Holmes De Agra Schat: 4. ByT5 pretrained sonar (Acc ERR: 71.71%)
3. Nescio Titaantjes: 3. ByT5 pretrained books (Acc ERR: 83.79%)
'''

## 1. Flan-T5 (not pretrained)

In [12]:
# Score the FlanT5 predictions for Max Havelaar against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Max Havelaar Multatuli (FlanT5):')
evaluate_err(gold_data1, flan_pred1)

Max Havelaar Multatuli (FlanT5):
Baseline Accuracy: 96.04%
Accuracy: 98.24%
Error Reduction Rate: 55.44%
Avg Precision: 98.45%
Avg Recall: 98.46%


In [13]:
# Score the FlanT5 predictions for De Agra Schat against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):')
evaluate_err(gold_data2, flan_pred2)

Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):
Baseline Accuracy: 94.83%
Accuracy: 97.87%
Error Reduction Rate: 58.72%
Avg Precision: 97.95%
Avg Recall: 97.99%


In [14]:
# Score the FlanT5 predictions for Titaantjes against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Nescio Titaantjes (FlanT5):') 
evaluate_err(gold_data3, flan_pred3)

Nescio Titaantjes (FlanT5):
Baseline Accuracy: 96.28%
Accuracy: 98.95%
Error Reduction Rate: 71.69%
Avg Precision: 99.23%
Avg Recall: 99.28%


## 2. ByT5 original (not pretrained)

In [15]:
# Score the original ByT5 predictions for Max Havelaar against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Max Havelaar Multatuli (ByT5 orig):')
evaluate_err(gold_data1, byt5_pred1)

Max Havelaar Multatuli (ByT5 orig):
Baseline Accuracy: 96.04%
Accuracy: 98.72%
Error Reduction Rate: 67.59%
Avg Precision: 98.73%
Avg Recall: 98.73%


In [16]:
# Score the original ByT5 predictions for De Agra Schat against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):')
evaluate_err(gold_data2, byt5_pred2)

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):
Baseline Accuracy: 94.83%
Accuracy: 98.53%
Error Reduction Rate: 71.51%
Avg Precision: 98.51%
Avg Recall: 98.54%


In [261]:
# Score the original ByT5 predictions for Titaantjes against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
# TODO(review): the recorded execution count for this cell (In [261]) is out of
# sequence with its neighbours; re-run the notebook top to bottom to confirm.
print('Nescio Titaantjes (ByT5 orig):') 
evaluate_err(gold_data3, byt5_pred3)

Nescio Titaantjes (ByT5 orig):
Baseline Accuracy: 96.28%
Accuracy: 99.32%
Error Reduction Rate: 81.74%
Avg Precision: 99.48%
Avg Recall: 99.51%


## 3. ByT5 pretrained with 2 million sentences from BERTje Books

In [17]:
# Score the Books-pretrained ByT5 predictions for Max Havelaar against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Max Havelaar Multatuli (ByT5 pretrained books):')
evaluate_err(gold_data1, byt5_books_pred1)

Max Havelaar Multatuli (ByT5 pretrained books):
Baseline Accuracy: 96.04%
Accuracy: 98.70%
Error Reduction Rate: 67.09%
Avg Precision: 98.76%
Avg Recall: 98.76%


In [18]:
# Score the Books-pretrained ByT5 predictions for De Agra Schat against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):')
evaluate_err(gold_data2, byt5_books_pred2)

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):
Baseline Accuracy: 94.83%
Accuracy: 98.49%
Error Reduction Rate: 70.74%
Avg Precision: 98.46%
Avg Recall: 98.50%


In [19]:
# Score the Books-pretrained ByT5 predictions for Titaantjes against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Nescio Titaantjes (ByT5 pretrained books):') 
evaluate_err(gold_data3, byt5_books_pred3)

Nescio Titaantjes (ByT5 pretrained books):
Baseline Accuracy: 96.28%
Accuracy: 99.40%
Error Reduction Rate: 83.79%
Avg Precision: 99.47%
Avg Recall: 99.51%


## 4. ByT5 pretrained with 2 million sentences from BERTje Sonar

In [20]:
# Score the Sonar-pretrained ByT5 predictions for Max Havelaar against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Max Havelaar Multatuli (ByT5 pretrained sonar):')
evaluate_err(gold_data1, byt5_sonar_pred1)

Max Havelaar Multatuli (ByT5 pretrained sonar):
Baseline Accuracy: 96.04%
Accuracy: 98.70%
Error Reduction Rate: 67.09%
Avg Precision: 98.80%
Avg Recall: 98.80%


In [21]:
# Score the Sonar-pretrained ByT5 predictions for De Agra Schat against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):')
evaluate_err(gold_data2, byt5_sonar_pred2)

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):
Baseline Accuracy: 94.83%
Accuracy: 98.54%
Error Reduction Rate: 71.71%
Avg Precision: 98.46%
Avg Recall: 98.50%


In [22]:
# Score the Sonar-pretrained ByT5 predictions for Titaantjes against the gold annotation.
# evaluate_err prints baseline accuracy, accuracy, error reduction rate and
# average precision/recall (see the recorded output below).
print('Nescio Titaantjes (ByT5 pretrained sonar):') 
evaluate_err(gold_data3, byt5_sonar_pred3)

Nescio Titaantjes (ByT5 pretrained sonar):
Baseline Accuracy: 96.28%
Accuracy: 99.39%
Error Reduction Rate: 83.56%
Avg Precision: 99.19%
Avg Recall: 99.23%


## 5. Rule-based 

In [23]:
# Score the rule-based output for Max Havelaar against the gold annotation.
# create_data yields a pair; the first element is discarded and the second
# (presumably the rule-based target text - confirm in evaluate_functions) is
# what evaluate_err compares with the gold data.
print('Max Havelaar Multatuli (rule-based):')
_, silver_target1 = create_data(silver_data1)
evaluate_err(gold_data1, silver_target1)

Max Havelaar Multatuli (rule-based):
Baseline Accuracy: 96.04%
Accuracy: 98.75%
Error Reduction Rate: 68.35%
Avg Precision: 98.90%
Avg Recall: 98.90%


In [24]:
# Score the rule-based output for De Agra Schat against the gold annotation.
# create_data yields a pair; the first element is discarded and the second
# (presumably the rule-based target text - confirm in evaluate_functions) is
# what evaluate_err compares with the gold data.
print('Conan Doyle Sherlock Holmes De Agra Schat (rule-based):')
_, silver_target2 = create_data(silver_data2)
evaluate_err(gold_data2, silver_target2)

Conan Doyle Sherlock Holmes De Agra Schat (rule-based):
Baseline Accuracy: 94.83%
Accuracy: 98.35%
Error Reduction Rate: 68.02%
Avg Precision: 98.34%
Avg Recall: 98.38%


In [43]:
# Score the rule-based output for Titaantjes against the gold annotation.
# create_data yields a pair; the first element is discarded and the second
# (presumably the rule-based target text - confirm in evaluate_functions) is
# what evaluate_err compares with the gold data.
# TODO(review): the recorded execution count for this cell (In [43]) is out of
# sequence with its neighbours; re-run the notebook top to bottom to confirm.
print('Nescio Titaantjes (rule-based):') 
_, silver_target3 = create_data(silver_data3)
evaluate_err(gold_data3, silver_target3)

Nescio Titaantjes (rule-based):
Baseline Accuracy: 96.28%
Accuracy: 98.52%
Error Reduction Rate: 60.27%
Avg Precision: 98.80%
Avg Recall: 98.82%


In [None]:
'''
Chrf++ (ERR) scores:
1. FlanT5 
2. ByT5 original
3. ByT5 pretrained Books 
4. ByT5 pretrained Sonar 
5. Rule-based

Best results for each novel:

1. Max Havelaar Multatuli: 5. Rule-Based (chrf++: 98.68%) 
2. Conan Doyle Sherlock Holmes De Agra Schat: 2. ByT5 original (chrf++: 98.54%)
3. Nescio Titaantjes: 3. ByT5 pretrained books (chrf++: 99.15%)
'''

## 1. Flan-T5 (not pretrained)

In [28]:
# chrF++ comparison for FlanT5 on Max Havelaar.
# Baseline: original1 scored against gold1, both taken from create_data(gold_data1).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Max Havelaar Multatuli (FlanT5):')
original1, gold1 = create_data(gold_data1)
baseline_chrf = cal_chrf(gold1, original1, 2, 'Baseline')
chrf = cal_chrf(gold1, flan_pred1, 2, 'FlanT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Max Havelaar Multatuli (FlanT5):
Chrf++: Baseline
   score: 96.34
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: FlanT5
   score: 97.16
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 22.37


In [29]:
# chrF++ comparison for FlanT5 on De Agra Schat.
# Baseline: original2 scored against gold2, both taken from create_data(gold_data2).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):')
original2, gold2 = create_data(gold_data2)
baseline_chrf = cal_chrf(gold2, original2, 2, 'Baseline')
chrf = cal_chrf(gold2, flan_pred2, 2, 'FlanT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Conan Doyle Sherlock Holmes De Agra Schat (FlanT5):
Chrf++: Baseline
   score: 95.64
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: FlanT5
   score: 97.86
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 50.86


In [30]:
# chrF++ comparison for FlanT5 on Titaantjes.
# Baseline: original3 scored against gold3, both taken from create_data(gold_data3).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Nescio Titaantjes (FlanT5):')
original3, gold3 = create_data(gold_data3)
baseline_chrf = cal_chrf(gold3, original3, 2, 'Baseline')
chrf = cal_chrf(gold3, flan_pred3, 2, 'FlanT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Nescio Titaantjes (FlanT5):
Chrf++: Baseline
   score: 96.83
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: FlanT5
   score: 98.64
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 57.06


## 2. ByT5 original (not pretrained)

In [31]:
# chrF++ comparison for the original ByT5 on Max Havelaar.
# Baseline: original1 scored against gold1, both taken from create_data(gold_data1).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Max Havelaar Multatuli (ByT5 orig):')
original1, gold1 = create_data(gold_data1)
baseline_chrf = cal_chrf(gold1, original1, 2, 'Baseline')
chrf = cal_chrf(gold1, byt5_pred1, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Max Havelaar Multatuli (ByT5 orig):
Chrf++: Baseline
   score: 96.34
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 98.16
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 49.76


In [32]:
# chrF++ comparison for the original ByT5 on De Agra Schat.
# Baseline: original2 scored against gold2, both taken from create_data(gold_data2).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):')
original2, gold2 = create_data(gold_data2)
baseline_chrf = cal_chrf(gold2, original2, 2, 'Baseline')
chrf = cal_chrf(gold2, byt5_pred2, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 orig):
Chrf++: Baseline
   score: 95.64
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 98.54
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 66.59


In [33]:
# chrF++ comparison for the original ByT5 on Titaantjes.
# Baseline: original3 scored against gold3, both taken from create_data(gold_data3).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Nescio Titaantjes (ByT5 orig):')
original3, gold3 = create_data(gold_data3)
baseline_chrf = cal_chrf(gold3, original3, 2, 'Baseline')
chrf = cal_chrf(gold3, byt5_pred3, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Nescio Titaantjes (ByT5 orig):
Chrf++: Baseline
   score: 96.83
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 99.07
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 70.79


## 3. ByT5 pretrained with 2 million sentences from BERTje Books

In [34]:
# chrF++ comparison for the Books-pretrained ByT5 on Max Havelaar.
# Baseline: original1 scored against gold1, both taken from create_data(gold_data1).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Max Havelaar Multatuli (ByT5 pretrained books):')
original1, gold1 = create_data(gold_data1)
baseline_chrf = cal_chrf(gold1, original1, 2, 'Baseline')
chrf = cal_chrf(gold1, byt5_books_pred1, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Max Havelaar Multatuli (ByT5 pretrained books):
Chrf++: Baseline
   score: 96.34
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 98.22
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 51.46


In [35]:
# chrF++ comparison for the Books-pretrained ByT5 on De Agra Schat.
# Baseline: original2 scored against gold2, both taken from create_data(gold_data2).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):')
original2, gold2 = create_data(gold_data2)
baseline_chrf = cal_chrf(gold2, original2, 2, 'Baseline')
chrf = cal_chrf(gold2, byt5_books_pred2, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained books):
Chrf++: Baseline
   score: 95.64
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 98.5
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 65.55


In [36]:
# chrF++ comparison for the Books-pretrained ByT5 on Titaantjes.
# Baseline: original3 scored against gold3, both taken from create_data(gold_data3).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Nescio Titaantjes (ByT5 pretrained books):')
original3, gold3 = create_data(gold_data3)
baseline_chrf = cal_chrf(gold3, original3, 2, 'Baseline')
chrf = cal_chrf(gold3, byt5_books_pred3, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Nescio Titaantjes (ByT5 pretrained books):
Chrf++: Baseline
   score: 96.83
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 99.15
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 73.04


## 4. ByT5 pretrained with 2 million sentences from BERTje Sonar

In [37]:
# chrF++ comparison for the Sonar-pretrained ByT5 on Max Havelaar.
# Baseline: original1 scored against gold1, both taken from create_data(gold_data1).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Max Havelaar Multatuli (ByT5 pretrained sonar):')
original1, gold1 = create_data(gold_data1)
baseline_chrf = cal_chrf(gold1, original1, 2, 'Baseline')
chrf = cal_chrf(gold1, byt5_sonar_pred1, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Max Havelaar Multatuli (ByT5 pretrained sonar):
Chrf++: Baseline
   score: 96.34
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 98.19
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 50.56


In [38]:
# chrF++ comparison for the Sonar-pretrained ByT5 on De Agra Schat.
# Baseline: original2 scored against gold2, both taken from create_data(gold_data2).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):')
original2, gold2 = create_data(gold_data2)
baseline_chrf = cal_chrf(gold2, original2, 2, 'Baseline')
chrf = cal_chrf(gold2, byt5_sonar_pred2, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Conan Doyle Sherlock Holmes De Agra Schat (ByT5 pretrained sonar):
Chrf++: Baseline
   score: 95.64
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 98.53
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 66.37


In [39]:
# chrF++ comparison for the Sonar-pretrained ByT5 on Titaantjes.
# Baseline: original3 scored against gold3, both taken from create_data(gold_data3).
# ERR: share of the baseline's distance to a score of 100 that the model recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Nescio Titaantjes (ByT5 pretrained sonar):')
original3, gold3 = create_data(gold_data3)
baseline_chrf = cal_chrf(gold3, original3, 2, 'Baseline')
chrf = cal_chrf(gold3, byt5_sonar_pred3, 2, 'ByT5')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Nescio Titaantjes (ByT5 pretrained sonar):
Chrf++: Baseline
   score: 96.83
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: ByT5
   score: 98.97
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 67.42


## 5. Rule-based 

In [40]:
# chrF++ comparison for the rule-based system on Max Havelaar.
# Baseline: gold_source1 scored against gold_target1 (both from the gold file);
# the system score uses silver_target1 from the silver file instead.
# ERR: share of the baseline's distance to a score of 100 that the system recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Max Havelaar Multatuli (rule-based):')
silver_source1, silver_target1 = create_data(silver_data1)
gold_source1, gold_target1 = create_data(gold_data1)
baseline_chrf = cal_chrf(gold_target1, gold_source1, 2, 'Baseline')
chrf = cal_chrf(gold_target1, silver_target1, 2, 'Rule-Based')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Max Havelaar Multatuli (rule-based):
Chrf++: Baseline
   score: 96.34
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: Rule-Based
   score: 98.68
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 63.97


In [41]:
# chrF++ comparison for the rule-based system on De Agra Schat.
# Baseline: gold_source2 scored against gold_target2 (both from the gold file);
# the system score uses silver_target2 from the silver file instead.
# ERR: share of the baseline's distance to a score of 100 that the system recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Conan Doyle Sherlock Holmes De Agra Schat (rule-based):')
silver_source2, silver_target2 = create_data(silver_data2)
gold_source2, gold_target2 = create_data(gold_data2)
baseline_chrf = cal_chrf(gold_target2, gold_source2, 2, 'Baseline')
chrf = cal_chrf(gold_target2, silver_target2, 2, 'Rule-Based')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Conan Doyle Sherlock Holmes De Agra Schat (rule-based):
Chrf++: Baseline
   score: 95.64
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: Rule-Based
   score: 98.5
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 65.59


In [42]:
# chrF++ comparison for the rule-based system on Titaantjes.
# Baseline: gold_source3 scored against gold_target3 (both from the gold file);
# the system score uses silver_target3 from the silver file instead.
# ERR: share of the baseline's distance to a score of 100 that the system recovers.
# NOTE(review): the literal 2 passed to cal_chrf is presumably the word order or
# beta (both are reported as 2 in the output) - confirm in evaluate_functions.
print('Nescio Titaantjes (rule-based):')
silver_source3, silver_target3 = create_data(silver_data3)
gold_source3, gold_target3 = create_data(gold_data3)
baseline_chrf = cal_chrf(gold_target3, gold_source3, 2, 'Baseline')
chrf = cal_chrf(gold_target3, silver_target3, 2, 'Rule-Based')
err = round((chrf - baseline_chrf) / (100 - baseline_chrf) * 100, 2)
print(f'Chrf++ ERR: {err}')

Nescio Titaantjes (rule-based):
Chrf++: Baseline
   score: 96.83
   char_order: 6
   word_order: 2
   beta: 2
Chrf++: Rule-Based
   score: 98.11
   char_order: 6
   word_order: 2
   beta: 2
Chrf++ ERR: 40.23
