# Evaluation

In [24]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

import random
from collections import Counter
import json
from tqdm import tqdm
import editdistance
import numpy as np

from transformers import AutoTokenizer, BertForMaskedLM, BertTokenizer, BertConfig
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline

import torch

In [25]:
# Load the shared archaic test split. The file is opened in a `with` block so
# the handle is closed deterministically, and it is decoded explicitly as
# UTF-8 rather than with the platform's locale default.
with open('data/archaic/test_common.json', encoding='utf-8') as f:
    test_set = json.load(f)

# Longest ground-truth gap, in characters; the evaluation cells use it to size
# their per-length metric buckets (keys 1..max_l).
max_l = max(len(x['masked_gt']) for x in test_set)

# Dialect labels used as the top-level keys of every metric table.
dialects = ['atticionic', 'doric', 'northwest', 'aeolic']

### Utils

In [26]:
# Copyright 2021 the Ithaca Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example for running inference. See also colab."""

import functools
import pickle

from absl import app
from absl import flags
from ithaca.eval import inference
from ithaca.models.model import Model
from ithaca.util.alphabet import GreekAlphabet
import jax

In [27]:
def load_checkpoint(path):
  """Restores everything needed for inference from a pickled checkpoint.

  Args:
    path: location of the checkpoint pickle on disk.

  Returns:
    A 5-tuple `(model_config, region_map, alphabet, params, forward)`:
    the dict of constructor arguments stored under "model_config", the value
    stored under "region_map", a GreekAlphabet whose `idx2word` / `word2idx`
    come from the checkpoint, the `params` after `jax.device_put`, and
    `model.apply` partially applied to those params.
  """

  # The checkpoint is a single pickled dict holding params plus config.
  with open(path, 'rb') as handle:
    ckpt = pickle.load(handle)

  # Rebuild the network from the constructor arguments saved at training time
  # and bind the restored parameters, giving the `forward` callable used by
  # attribute() and restore().
  params = jax.device_put(ckpt['params'])
  model_config = ckpt['model_config']
  forward = functools.partial(Model(**model_config).apply, params)

  # Region ID <-> name mapping, passed through untouched.
  region_map = ckpt['region_map']

  # Only the vocabulary tables are taken from the checkpoint; every other
  # GreekAlphabet attribute keeps its constructor default.
  alphabet = GreekAlphabet()
  vocab = ckpt['alphabet']
  alphabet.idx2word = vocab['idx2word']
  alphabet.word2idx = vocab['word2idx']

  return model_config, region_map, alphabet, params, forward

## Base models

**Ithaca base**

In [29]:
!python3 run_ithaca_inference.py --results_path results/archaic/ithaca --cuda 0

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|███████████████████████████████████████████| 95/95 [01:24<00:00,  1.13it/s]


In [30]:
# Score the Ithaca predictions stored as results/archaic/ithaca/<id>.json.
# Every metric is bucketed per dialect and per gap length, then macro-averaged
# over the non-empty length buckets.
os.makedirs('results/archaic/ithaca', exist_ok=True)

editd = { d: { x: [] for x in range(1, max_l+1)} for d in dialects}
acc1 = { d: { x: [] for x in range(1, max_l+1)} for d in dialects}
acc20 = { d: { x: [] for x in range(1, max_l+1)} for d in dialects}

for inscription in test_set:

    ground_truth = inscription['text'].strip()
    text_masked = inscription['masked_ithaca']
    phi_id = inscription['id']

    dialect = inscription['dialect']

    # Read the prediction file inside a `with` block so the handle is closed
    # on every iteration instead of being left to the garbage collector.
    with open(f'results/archaic/ithaca/{phi_id}.json', encoding='utf-8') as f:
        predictions = json.load(f)['predictions']

    text_predictions = [ pred['text'] for pred in predictions]

    # Predictions are full restored texts, so they must align with the
    # (stripped) ground-truth text character for character.
    assert len(text_predictions[0]) == len(ground_truth)

    # A top-1 hit is by construction also a top-20 hit. The candidate list is
    # capped at 20 so that ACC20 means "within the 20 best" even if a result
    # file stores more hypotheses, matching the BERT evaluation cells.
    ctop1 = int(text_predictions[0] == ground_truth)
    ctop20 = int(ground_truth in text_predictions[:20])

    # Gap length = number of '?' placeholders in the masked text
    # (presumably one per missing character - confirm against the data prep).
    l = text_masked.count('?')

    acc1[dialect][l].append(ctop1)
    acc20[dialect][l].append(ctop20)

    # Edit distance over the whole text, normalised by the gap length.
    editd[dialect][l].append(editdistance.eval(text_predictions[0], ground_truth) / l)

for dialect in dialects:
    print(f"======== {dialect} ========")
    print(f'ACC1 : {np.mean([np.mean(v) for v in acc1[dialect].values() if v])*100:.2f}%')
    print(f'ACC20: {np.mean([np.mean(v) for v in acc20[dialect].values() if v])*100:.2f}%')
    print(f'CER  : {np.mean([np.mean(v) for v in editd[dialect].values() if v])*100:.2f}%')

print(f"======== TOTAL ========")
# Pool the per-dialect buckets into one table per metric, keyed by gap length.
cert = { x: [] for x in range(1, max_l+1)}
acc1t = { x: [] for x in range(1, max_l+1)}
acc20t = { x: [] for x in range(1, max_l+1)}
for pooled, per_dialect in ((cert, editd), (acc1t, acc1), (acc20t, acc20)):
    for by_length in per_dialect.values():
        for length, scores in by_length.items():
            pooled[length].extend(scores)

print(f'CER : {np.mean([np.mean(v) for v in cert.values() if v])*100:.2f}%')
print(f'ACC1 : {np.mean([np.mean(v) for v in acc1t.values() if v])*100:.2f}%')
print(f'ACC20 : {np.mean([np.mean(v) for v in acc20t.values() if v])*100:.2f}%')

ACC1 : 28.15%
ACC20: 45.18%
CER  : 65.16%
ACC1 : 25.00%
ACC20: 56.67%
CER  : 60.45%
ACC1 : 40.00%
ACC20: 70.00%
CER  : 50.83%
ACC1 : 33.33%
ACC20: 33.33%
CER  : 66.67%
CER : 67.11%
ACC1 : 24.24%
ACC20 : 42.66%


**AG BERT**

In [31]:
# Baseline: off-the-shelf Ancient Greek BERT, with candidates restricted to
# tokens whose length equals the ground-truth gap length.
# NOTE(review): the load warning in this cell's output lists the
# cls.predictions.* weights as newly initialised, so these baseline numbers
# may come from an untrained prediction head - confirm before reporting them.
model_name = "pranaydeeps/Ancient-Greek-BERT"
fill_mask = pipeline("fill-mask", model=model_name, tokenizer=model_name, top_k=3500)

# Flat per-example hit lists, kept alongside the bucketed tables.
ctop1s, ctop20s = [], []

# metric -> dialect -> gap length -> list of per-example scores
editd = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc1 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc20 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}

for sentence in tqdm(test_set):
    masked = sentence['masked_ag']
    ground_truth = sentence['masked_gt']
    dialect = sentence['dialect']

    inference_result = fill_mask(masked)
    gap_len = len(ground_truth)

    # Keep the 20 best-ranked candidates that have exactly the gap's length.
    constr = list(filter(lambda c: len(c['token_str']) == gap_len, inference_result))[:20]
    ranked = [c['token_str'] for c in constr]

    # A top-1 hit is necessarily a top-20 hit as well.
    ctop1 = 1 if ranked[:1] == [ground_truth] else 0
    ctop20 = 1 if ground_truth in ranked else 0

    ctop1s.append(ctop1)
    ctop20s.append(ctop20)
    acc1[dialect][gap_len].append(ctop1)
    acc20[dialect][gap_len].append(ctop20)

    # No candidate of the right length is scored as a completely wrong guess.
    if ranked:
        cer = editdistance.eval(ranked[0], ground_truth) / gap_len
    else:
        cer = 1.0
    editd[dialect][gap_len].append(cer)

# Per-dialect scores: mean within each gap length, then mean across lengths.
for dialect in dialects:
    print(f"======== {dialect} ========")
    cer = np.mean([np.mean(s) for s in editd[dialect].values() if s])
    print(f'CER  : {cer*100:.2f}%')
    top1 = np.mean([np.mean(s) for s in acc1[dialect].values() if s])
    print(f'ACC1 : {top1*100:.2f}%')
    top20 = np.mean([np.mean(s) for s in acc20[dialect].values() if s])
    print(f'ACC20: {top20*100:.2f}%')

print(f"======== TOTAL ========")
# Merge all dialects into one bucket table per metric.
cert = {n: [] for n in range(1, max_l + 1)}
acc1t = {n: [] for n in range(1, max_l + 1)}
acc20t = {n: [] for n in range(1, max_l + 1)}
for pooled, per_dialect in ((cert, editd), (acc1t, acc1), (acc20t, acc20)):
    for by_length in per_dialect.values():
        for n, scores in by_length.items():
            pooled[n].extend(scores)

cer = np.mean([np.mean(s) for s in cert.values() if s])
print(f'CER : {cer*100:.2f}%')
top1 = np.mean([np.mean(s) for s in acc1t.values() if s])
print(f'ACC1 : {top1*100:.2f}%')
top20 = np.mean([np.mean(s) for s in acc20t.values() if s])
print(f'ACC20 : {top20*100:.2f}%')

Some weights of BertForMaskedLM were not initialized from the model checkpoint at pranaydeeps/Ancient-Greek-BERT and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 95/95 [00:33<00:00,  2.87it/s]

CER  : 93.19%
ACC1 : 0.00%
ACC20: 0.00%
CER  : 96.10%
ACC1 : 0.00%
ACC20: 0.00%
CER  : 95.83%
ACC1 : 0.00%
ACC20: 0.00%
CER  : 79.17%
ACC1 : 0.00%
ACC20: 0.00%
CER : 91.45%
ACC1 : 0.00%
ACC20 : 0.00%





## Fine-Tuning

AG BERT fine-tuning on the whole iPHI dataset.

In [6]:
from transformers import EarlyStoppingCallback

# Fine-tune Ancient Greek BERT with masked-language modelling on the iPHI
# train split, validating on the validation split and reporting the loss on
# the test split before and after training.
model_name = "pranaydeeps/Ancient-Greek-BERT"
output_dir = "models/epi-agBERT"  # checkpoints, final model and tokenizer all go here

tokenizer = BertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name, config=config)

# Stop training once the monitored metric (eval_loss, see below) has not
# improved for 3 evaluations in a row.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

# One line-per-example dataset per split, all tokenised the same way.
train_dataset, val_dataset, test_dataset = (
    LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=f"data/iphi/{split}.txt",
        block_size=128,
    )
    for split in ("train", "validation", "test")
)

# Random masking of 15% of the tokens for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    # Save, log and evaluate on the same 500-step cadence so that the best
    # checkpoint can be reloaded at the end.
    save_steps=500,
    logging_steps=500,
    eval_steps=500,
    save_total_limit=2,
    evaluation_strategy = 'steps',
    metric_for_best_model= 'eval_loss',
    load_best_model_at_end = True,
    greater_is_better=False,
    logging_dir='./logs',
    report_to=["tensorboard"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping]
)

# Baseline: test loss of the pre-trained model before any fine-tuning.
# (An earlier run logged eval_loss = 11.396 at this point.)
test_result = trainer.evaluate(test_dataset)
print("Evaluation result:", test_result)

trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Test loss after fine-tuning.
test_result = trainer.evaluate(test_dataset)
print("Evaluation result:", test_result)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at pranaydeeps/Ancient-Greek-BERT and are newly initialized: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Evaluation result: {'eval_loss': 10.620267868041992, 'eval_runtime': 22.2925, 'eval_samples_per_second': 350.297, 'eval_steps_per_second': 43.826}


Step,Training Loss,Validation Loss
500,3.6959,2.734945
1000,2.6324,2.362947
1500,2.3717,2.157843
2000,2.2435,2.045613
2500,2.133,1.974687
3000,2.0564,1.917562
3500,1.9769,1.859798
4000,1.9452,1.836522
4500,1.8717,1.781387
5000,1.8701,1.747057


KeyboardInterrupt: 

AG BERT fine-tuning on the archaic dataset.

In [11]:
from transformers import EarlyStoppingCallback
import os

# Fine-tune Ancient Greek BERT with masked-language modelling on the archaic
# train split, validating on the validation split and reporting the loss on
# the test split before and after training.
model_name = "pranaydeeps/Ancient-Greek-BERT"
output_dir = "models/agBERT-archaic"  # checkpoints, final model and tokenizer all go here

tokenizer = BertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name, config=config)

# Stop training once the monitored metric (eval_loss, see below) has not
# improved for 3 evaluations in a row.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

# One line-per-example dataset per split, all tokenised the same way.
train_dataset, val_dataset, test_dataset = (
    LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=f"data/archaic/{split}.txt",
        block_size=128,
    )
    for split in ("train", "validation", "test")
)

# Random masking of 15% of the tokens for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    # Very high epoch cap: in practice the early-stopping callback is what
    # ends the run.
    num_train_epochs=10000,
    per_device_train_batch_size=32,
    # Save, log and evaluate on the same 500-step cadence so that the best
    # checkpoint can be reloaded at the end.
    save_steps=500,
    logging_steps=500,
    eval_steps=500,
    save_total_limit=2,
    evaluation_strategy = 'steps',
    metric_for_best_model= 'eval_loss',
    load_best_model_at_end = True,
    greater_is_better=False,
    logging_dir='./logs/archaic',
    # NOTE(review): unlike the iPHI run, `report_to` is not pinned here -
    # confirm which logging integrations are intended.
    run_name="agBERT-archaic",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping]
)

# Baseline: test loss of the pre-trained model before any fine-tuning.
test_result = trainer.evaluate(test_dataset)
print("Evaluation result:", test_result)

trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Test loss after fine-tuning.
test_result = trainer.evaluate(test_dataset)
print("Evaluation result:", test_result)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at pranaydeeps/Ancient-Greek-BERT and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Evaluation result: {'eval_loss': 10.873405456542969, 'eval_runtime': 0.3262, 'eval_samples_per_second': 291.234, 'eval_steps_per_second': 18.394}


Step,Training Loss,Validation Loss
500,2.7025,2.649799
1000,1.2064,2.22262
1500,0.6598,2.104249
2000,0.3985,2.284696
2500,0.2784,2.488966
3000,0.2106,2.338235


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


Evaluation result: {'eval_loss': 2.165879249572754, 'eval_runtime': 0.3262, 'eval_samples_per_second': 291.2, 'eval_steps_per_second': 18.392, 'epoch': 250.0}


## Fine-Tuned Eval

Without knowing the length of the gap.

In [15]:
# iPHI fine-tuned model, scored on its raw top-20 suggestions: no filtering by
# gap length is applied.
model_name = "./models/epi-agBERT"
fill_mask = pipeline("fill-mask", model=model_name, tokenizer=model_name, top_k=20)

# metric -> dialect -> gap length -> list of per-example scores
editd = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc1 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc20 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}

for inscription in tqdm(test_set):
    ground_truth = inscription['masked_gt']
    masked = inscription['masked_ag']
    dialect = inscription['dialect']

    inference_result = fill_mask(masked)
    gap_len = len(ground_truth)
    ranked = [cand['token_str'] for cand in inference_result]

    # A top-1 hit is necessarily a top-20 hit as well.
    ctop1 = 1 if ranked[:1] == [ground_truth] else 0
    ctop20 = 1 if ground_truth in ranked else 0

    acc1[dialect][gap_len].append(ctop1)
    acc20[dialect][gap_len].append(ctop20)

    # Normalised by the ground-truth length only, so a prediction longer than
    # the gap can push an individual score above 1.
    editd[dialect][gap_len].append(editdistance.eval(ranked[0], ground_truth) / gap_len)

# Per-dialect scores: mean within each gap length, then mean across lengths.
for dialect in dialects:
    print(f"======== {dialect} ========")
    cer = np.mean([np.mean(s) for s in editd[dialect].values() if s])
    print(f'CER  : {cer*100:.2f}%')
    top1 = np.mean([np.mean(s) for s in acc1[dialect].values() if s])
    print(f'ACC1 : {top1*100:.2f}%')
    top20 = np.mean([np.mean(s) for s in acc20[dialect].values() if s])
    print(f'ACC20: {top20*100:.2f}%')

print(f"======== TOTAL ========")
# Merge all dialects into one bucket table per metric.
cert = {n: [] for n in range(1, max_l + 1)}
acc1t = {n: [] for n in range(1, max_l + 1)}
acc20t = {n: [] for n in range(1, max_l + 1)}
for pooled, per_dialect in ((cert, editd), (acc1t, acc1), (acc20t, acc20)):
    for by_length in per_dialect.values():
        for n, scores in by_length.items():
            pooled[n].extend(scores)

cer = np.mean([np.mean(s) for s in cert.values() if s])
print(f'CER : {cer*100:.2f}%')
top1 = np.mean([np.mean(s) for s in acc1t.values() if s])
print(f'ACC1 : {top1*100:.2f}%')
top20 = np.mean([np.mean(s) for s in acc20t.values() if s])
print(f'ACC20 : {top20*100:.2f}%')

100%|██████████| 95/95 [00:05<00:00, 16.09it/s]

CER  : 88.27%
ACC1 : 12.25%
ACC20: 28.28%
CER  : 80.49%
ACC1 : 10.00%
ACC20: 27.50%
CER  : 90.83%
ACC1 : 20.00%
ACC20: 30.00%
CER  : 80.83%
ACC1 : 0.00%
ACC20: 0.00%
CER : 87.73%
ACC1 : 9.95%
ACC20 : 23.32%





Knowing the length of the gap.

In [16]:
# iPHI fine-tuned model, with candidates restricted to tokens whose length
# equals the ground-truth gap length. top_k is set high so that enough
# candidates survive the length filter.
model_name = "./models/epi-agBERT"
fill_mask = pipeline("fill-mask", model=model_name, tokenizer=model_name, top_k=3500)

# Flat per-example hit lists, kept alongside the bucketed tables.
ctop1s, ctop20s = [], []

# metric -> dialect -> gap length -> list of per-example scores
editd = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc1 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc20 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}

pbar = tqdm(test_set)
for inscription in pbar:
    ground_truth = inscription['masked_gt']
    masked = inscription['masked_ag']
    dialect = inscription['dialect']

    inference_result = fill_mask(masked)
    gap_len = len(ground_truth)

    # Keep the 20 best-ranked candidates that have exactly the gap's length.
    constr = list(filter(lambda c: len(c['token_str']) == gap_len, inference_result))[:20]
    ranked = [c['token_str'] for c in constr]

    # A top-1 hit is necessarily a top-20 hit as well.
    ctop1 = 1 if ranked[:1] == [ground_truth] else 0
    ctop20 = 1 if ground_truth in ranked else 0

    ctop1s.append(ctop1)
    ctop20s.append(ctop20)
    acc1[dialect][gap_len].append(ctop1)
    acc20[dialect][gap_len].append(ctop20)

    # No candidate of the right length is scored as a completely wrong guess.
    if ranked:
        cer = editdistance.eval(ranked[0], ground_truth) / gap_len
    else:
        cer = 1.0
    editd[dialect][gap_len].append(cer)

# Per-dialect scores: mean within each gap length, then mean across lengths.
for dialect in dialects:
    print(f"======== {dialect} ========")
    cer = np.mean([np.mean(s) for s in editd[dialect].values() if s])
    print(f'CER  : {cer*100:.2f}%')
    top1 = np.mean([np.mean(s) for s in acc1[dialect].values() if s])
    print(f'ACC1 : {top1*100:.2f}%')
    top20 = np.mean([np.mean(s) for s in acc20[dialect].values() if s])
    print(f'ACC20: {top20*100:.2f}%')

print(f"======== TOTAL ========")
# Merge all dialects into one bucket table per metric.
cert = {n: [] for n in range(1, max_l + 1)}
acc1t = {n: [] for n in range(1, max_l + 1)}
acc20t = {n: [] for n in range(1, max_l + 1)}
for pooled, per_dialect in ((cert, editd), (acc1t, acc1), (acc20t, acc20)):
    for by_length in per_dialect.values():
        for n, scores in by_length.items():
            pooled[n].extend(scores)

cer = np.mean([np.mean(s) for s in cert.values() if s])
print(f'CER : {cer*100:.2f}%')
top1 = np.mean([np.mean(s) for s in acc1t.values() if s])
print(f'ACC1 : {top1*100:.2f}%')
top20 = np.mean([np.mean(s) for s in acc20t.values() if s])
print(f'ACC20 : {top20*100:.2f}%')

100%|██████████| 95/95 [00:33<00:00,  2.84it/s]

CER  : 68.71%
ACC1 : 21.23%
ACC20: 28.28%
CER  : 73.82%
ACC1 : 10.00%
ACC20: 27.50%
CER  : 65.00%
ACC1 : 30.00%
ACC20: 50.00%
CER  : 89.17%
ACC1 : 0.00%
ACC20: 0.00%
CER : 73.11%
ACC1 : 15.82%
ACC20 : 25.19%





Archaic fine-tuned model, knowing the length of the gap.

In [34]:
# Archaic fine-tuned model, with candidates restricted to tokens whose length
# equals the ground-truth gap length.
model_name = "./models/agBERT-archaic/" #"./models/agBERT-iphi-archaic/"
fill_mask = pipeline("fill-mask", model=model_name, tokenizer=model_name, top_k=3500)

# Flat per-example hit lists, kept alongside the bucketed tables.
ctop1s, ctop20s = [], []

dialects = ['atticionic', 'doric', 'northwest', 'aeolic']

# metric -> dialect -> gap length -> list of per-example scores
editd = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc1 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}
acc20 = {d: {n: [] for n in range(1, max_l + 1)} for d in dialects}

for inscription in tqdm(test_set):
    ground_truth = inscription['masked_gt']
    masked = inscription['masked_ag']
    dialect = inscription['dialect']

    inference_result = fill_mask(masked)
    gap_len = len(ground_truth)

    # Keep the 20 best-ranked candidates that have exactly the gap's length.
    constr = list(filter(lambda c: len(c['token_str']) == gap_len, inference_result))[:20]
    ranked = [c['token_str'] for c in constr]

    # A top-1 hit is necessarily a top-20 hit as well.
    ctop1 = 1 if ranked[:1] == [ground_truth] else 0
    ctop20 = 1 if ground_truth in ranked else 0

    ctop1s.append(ctop1)
    ctop20s.append(ctop20)
    acc1[dialect][gap_len].append(ctop1)
    acc20[dialect][gap_len].append(ctop20)

    # No candidate of the right length is scored as a completely wrong guess.
    if ranked:
        cer = editdistance.eval(ranked[0], ground_truth) / gap_len
    else:
        cer = 1.0
    editd[dialect][gap_len].append(cer)

# Per-dialect scores: mean within each gap length, then mean across lengths.
for dialect in dialects:
    print(f"======== {dialect} ========")
    cer = np.mean([np.mean(s) for s in editd[dialect].values() if s])
    print(f'CER  : {cer*100:.2f}%')
    top1 = np.mean([np.mean(s) for s in acc1[dialect].values() if s])
    print(f'ACC1 : {top1*100:.2f}%')
    top20 = np.mean([np.mean(s) for s in acc20[dialect].values() if s])
    print(f'ACC20: {top20*100:.2f}%')

print(f"======== TOTAL ========")
# Merge all dialects into one bucket table per metric.
acc1t = {n: [] for n in range(1, max_l + 1)}
acc20t = {n: [] for n in range(1, max_l + 1)}
cert = {n: [] for n in range(1, max_l + 1)}
for pooled, per_dialect in ((acc1t, acc1), (acc20t, acc20), (cert, editd)):
    for by_length in per_dialect.values():
        for n, scores in by_length.items():
            pooled[n].extend(scores)

cer = np.mean([np.mean(s) for s in cert.values() if s])
print(f'CER : {cer*100:.2f}%')
top1 = np.mean([np.mean(s) for s in acc1t.values() if s])
print(f'ACC1 : {top1*100:.2f}%')
top20 = np.mean([np.mean(s) for s in acc20t.values() if s])
print(f'ACC20 : {top20*100:.2f}%')

100%|██████████| 95/95 [00:33<00:00,  2.80it/s]

CER  : 65.88%
ACC1 : 26.36%
ACC20: 28.28%
CER  : 64.25%
ACC1 : 27.50%
ACC20: 27.50%
CER  : 56.67%
ACC1 : 40.00%
ACC20: 50.00%
CER  : 80.83%
ACC1 : 0.00%
ACC20: 0.00%
CER : 69.01%
ACC1 : 22.81%
ACC20 : 25.19%



