# Export Datasets

The evaluation code below is adapted from https://github.com/gsarti/it5/.

In [None]:
from datasets import load_dataset

def _load_it5(config):
    """Load one configuration of the "it5/datasets" collection."""
    return load_dataset("it5/datasets", config)


# One variable per task configuration; the export cell below indexes them by
# split name (e.g. "test", "test_0").
fst = _load_it5("fst")
hg = _load_it5("hg")
ns = _load_it5("ns")
qa = _load_it5("qa")
qg = _load_it5("qg")
st_g2r = _load_it5("st_g2r")
st_r2g = _load_it5("st_r2g")
wits = _load_it5("wits")

In [None]:
import os

# Every reference file is written under "references/"; create the directory
# first so the cell also runs on a checkout where it does not exist yet.
os.makedirs("references", exist_ok=True)


def _write_lowercased(path, sentences):
    """Write each sentence lower-cased on its own line to ``path`` (overwrites)."""
    with open(path, 'w') as f:
        for s in sentences:
            f.write(s.lower() + '\n')


# Formality style transfer: one informal reference file (taken from "test_0")
# and four formal reference files (one per "test_{i}" split).
_write_lowercased("references/informal.txt", fst["test_0"]["informal"])
for i in range(4):
    _write_lowercased(f"references/formal{i}.txt", fst[f"test_{i}"]["formal"])

_write_lowercased("references/hg.txt", hg["test"]["target"])

# News summarization targets have embedded newlines removed so that each
# reference occupies exactly one line of the output file.
# NOTE(review): the other tasks are written without this step; if any of their
# targets contains a newline the file will no longer be one-reference-per-line — confirm.
for name in ["fanpage", "ilpost"]:
    _write_lowercased(
        f"references/{name}.txt",
        (s.replace("\n", "") for s in ns[f"test_{name}"]["target"]),
    )

_write_lowercased("references/qa.txt", qa["test"]["target"])
_write_lowercased("references/qg.txt", qg["test"]["target"])
_write_lowercased("references/st_g2r.txt", st_g2r["test"]["headline"])
_write_lowercased("references/st_r2g.txt", st_r2g["test"]["headline"])
_write_lowercased("references/wits.txt", wits["test"]["summary"])

In [1]:
from datasets import load_metric

# Metric objects shared by the scoring cells below.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm the pinned version.
rouge = load_metric("rouge")
bertscore = load_metric("bertscore")

# ROUGE options for the best-of-four cells, which iterate over the scores
# returned for each ROUGE type and read `.fmeasure` from every item.
rouge_kwargs = dict(
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_aggregator=False,
    use_stemmer=False,
)
# BERTScore options; the model and baseline are read from the local "configs/" folder.
bertscore_kwargs = dict(
    model_type="configs/bert-base-italian-xxl-uncased",
    lang="it",
    num_layers=10,
    batch_size=16,
    rescale_with_baseline=True,
    baseline_path="configs/bertscore_baseline_ita.tsv",
    use_fast_tokenizer=True,
)

# Directory that holds the model prediction files read by the scoring functions.
result_path = "eval/results"

  rouge = load_metric("rouge")


In [None]:
! pip install absl-py nltk rouge_score bert_score

In [2]:
result_path = "eval/results"

def compute_i2f(model, metric, kwargs):
    """Best-of-four-references F-measure for informal-to-formal predictions.

    Reads ``references/formal{0..3}.txt`` and the ``_test_0`` prediction file of
    ``model`` under ``result_path`` (with an ``-ita`` infix when the model name
    contains ``mt5``), scores the predictions against each reference set with
    ``metric.compute(..., **kwargs)``, keeps the highest ``fmeasure`` per
    sentence and averages it.

    Returns:
        dict mapping every key returned by the metric to the mean best score.
    """
    refs = []
    for idx in range(4):
        with open(f"references/formal{idx}.txt", 'r') as ref_file:
            refs.append([line.strip() for line in ref_file])
    suffix = '-ita' if 'mt5' in model else ''
    with open(f"{result_path}/{model}-informal-to-formal{suffix}_test_0.txt", 'r') as hyp_file:
        hyp = [line.strip() for line in hyp_file]

    # One {score_name: [fmeasure per sentence]} mapping per reference set.
    per_reference = [
        {
            score_name: [score.fmeasure for score in scores]
            for score_name, scores in metric.compute(predictions=hyp, references=ref, **kwargs).items()
        }
        for ref in refs
    ]
    first = per_reference[0]
    return {
        score_name: sum(
            max(candidates)
            for candidates in zip(*(run[score_name] for run in per_reference))
        ) / len(first[score_name])
        for score_name in first
    }

# Informal-to-formal style transfer scored with `rouge` and `rouge_kwargs`.
print("fst_i2f")
print("camoscio-7b", compute_i2f("camoscio-7b", rouge, rouge_kwargs))

fst_i2f
camoscio-7b {'rouge1': 0.6220582572775898, 'rouge2': 0.4285147347040223, 'rougeL': 0.6006278042777528}


In [13]:
result_path = "eval/results"

def compute_f2i(model, metric, kwargs):
    """Best-of-four-predictions F-measure for formal-to-informal outputs.

    Reads ``references/informal.txt`` and the four ``_test_{0..3}`` prediction
    files of ``model`` under ``result_path`` (with an ``-ita`` infix when the
    model name contains ``mt5``), scores each prediction file against the
    reference with ``metric.compute(..., **kwargs)``, keeps the highest
    ``fmeasure`` per sentence and averages it.

    Returns:
        dict mapping every key returned by the metric to the mean best score.
    """
    with open("references/informal.txt", 'r') as ref_file:
        ref = [line.strip() for line in ref_file]
    suffix = '-ita' if 'mt5' in model else ''
    hyps = []
    for idx in range(4):
        with open(f"{result_path}/{model}-formal-to-informal{suffix}_test_{idx}.txt", 'r') as hyp_file:
            hyps.append([line.strip() for line in hyp_file])

    # One {score_name: [fmeasure per sentence]} mapping per prediction file.
    per_run = [
        {
            score_name: [score.fmeasure for score in scores]
            for score_name, scores in metric.compute(predictions=hyp, references=ref, **kwargs).items()
        }
        for hyp in hyps
    ]
    first = per_run[0]
    return {
        score_name: sum(
            max(candidates)
            for candidates in zip(*(run[score_name] for run in per_run))
        ) / len(first[score_name])
        for score_name in first
    }

# Formal-to-informal style transfer scored with `rouge` and `rouge_kwargs`.
print("fst_f2i")
print("camoscio-7b", compute_f2i("camoscio-7b", rouge, rouge_kwargs))

fst_f2i
camoscio-7b {'rouge1': 0.6450322843198721, 'rouge2': 0.4365721944806926, 'rougeL': 0.6233499493070205}


In [14]:
result_path = "eval/results"

def compute_f2i(model, metric, kwargs):
    """Best-of-four-predictions score for formal-to-informal outputs.

    This definition replaces the earlier ROUGE-only ``compute_f2i`` in the
    kernel, so it accepts both result layouts to keep the earlier call cell
    working when it is re-run after this one:

    * results with an ``"f1"`` entry (a list of per-sentence scores) yield a
      single float;
    * any other result is treated as a mapping from score name to per-sentence
      objects exposing ``.fmeasure`` and yields a dict of floats.

    Args:
        model: model name used to build the prediction file names; an ``-ita``
            infix is added when it contains ``mt5``.
        metric: object whose ``compute(predictions=..., references=..., **kwargs)``
            returns one of the two layouts above.
        kwargs: extra keyword arguments forwarded to ``metric.compute``.

    Returns:
        The per-sentence maximum over the four prediction files, averaged over
        sentences (float or dict of floats, see above).
    """
    with open("references/informal.txt", 'r') as f:
        ref = [s.strip() for s in f]
    suffix = '-ita' if 'mt5' in model else ''
    hyps = []
    for i in range(4):
        with open(f"{result_path}/{model}-formal-to-informal{suffix}_test_{i}.txt", 'r') as f:
            hyps.append([s.strip() for s in f])

    results = [metric.compute(predictions=hyp, references=ref, **kwargs) for hyp in hyps]

    def best_mean(per_run_scores):
        # Highest score per sentence across the four runs, then the mean.
        best = [max(candidates) for candidates in zip(*per_run_scores)]
        return sum(best) / len(per_run_scores[0])

    if "f1" in results[0]:
        return best_mean([result["f1"] for result in results])
    return {
        name: best_mean([[s.fmeasure for s in result[name]] for result in results])
        for name in results[0]
    }

# Formal-to-informal style transfer scored with `bertscore` and `bertscore_kwargs`.
print("bertscore f2i")
print("camoscio-7b", compute_f2i("camoscio-7b", bertscore, bertscore_kwargs))

bertscore f2i
camoscio-7b 0.6518663733750582


In [None]:
# NOTE: this rebinds the global `rouge_kwargs`; unlike the earlier definition it
# sets neither `use_aggregator` nor `use_stemmer`. compute_rouge() below reads
# `.mid` from each result, so it presumably relies on the metric's aggregated
# output — confirm against the ROUGE metric defaults.
rouge_kwargs = dict(rouge_types=["rouge1", "rouge2", "rougeL"])

def compute_rouge(ref_file, hyp_file, kwargs):
    """Return the ``mid`` F-measure of every ROUGE type for one prediction file.

    Args:
        ref_file: base name (no extension) of the file under ``references/``.
        hyp_file: base name (no extension) of the file under ``result_path``.
        kwargs: keyword arguments forwarded to ``rouge.compute``.

    Returns:
        dict mapping each key returned by ``rouge.compute`` to ``value.mid.fmeasure``.
    """
    with open(f"references/{ref_file}.txt", 'r') as handle:
        ref = [line.strip() for line in handle]
    with open(f"{result_path}/{hyp_file}.txt", 'r') as handle:
        hyp = [line.strip() for line in handle]
    aggregated = rouge.compute(predictions=hyp, references=ref, **kwargs)
    f_scores = {}
    for rouge_type, aggregate in aggregated.items():
        f_scores[rouge_type] = aggregate.mid.fmeasure
    return f_scores


# ROUGE for news summarization against references/ilpost.txt.
# NOTE(review): this prediction file name uses "news-summarization" while the
# fanpage cell uses "ns" — confirm both files exist under result_path.
print("ns_ilpost")
print("camoscio-7b", compute_rouge("ilpost", "llama-7b-hf_news-summarization_test_ilpost", rouge_kwargs))

In [33]:
# ROUGE for news summarization against references/fanpage.txt.
print("ns_fanpage")
print("camoscio-7b", compute_rouge("fanpage", "llama-7b-hf_ns_test_fanpage", rouge_kwargs))

ns_fanpage
camoscio-7b {'rouge1': 0.2504043047929348, 'rouge2': 0.10648184368593602, 'rougeL': 0.1715946228680273}


In [15]:
def compute_i2f(model, metric, kwargs):
    """Best-of-four-references score for informal-to-formal predictions.

    This definition replaces the earlier ROUGE-only ``compute_i2f`` in the
    kernel, so it accepts both result layouts to keep the earlier call cell
    working when it is re-run after this one:

    * results with an ``"f1"`` entry (a list of per-sentence scores) yield a
      single float;
    * any other result is treated as a mapping from score name to per-sentence
      objects exposing ``.fmeasure`` and yields a dict of floats.

    Args:
        model: model name used to build the prediction file name; an ``-ita``
            infix is added when it contains ``mt5``.
        metric: object whose ``compute(predictions=..., references=..., **kwargs)``
            returns one of the two layouts above.
        kwargs: extra keyword arguments forwarded to ``metric.compute``.

    Returns:
        The per-sentence maximum over the four reference sets, averaged over
        sentences (float or dict of floats, see above).
    """
    refs = []
    for i in range(4):
        with open(f"references/formal{i}.txt", 'r') as f:
            refs.append([s.strip() for s in f])
    suffix = '-ita' if 'mt5' in model else ''
    with open(f"{result_path}/{model}-informal-to-formal{suffix}_test_0.txt", 'r') as f:
        hyp = [s.strip() for s in f]

    results = [metric.compute(predictions=hyp, references=ref, **kwargs) for ref in refs]

    def best_mean(per_ref_scores):
        # Highest score per sentence across the four reference sets, then the mean.
        best = [max(candidates) for candidates in zip(*per_ref_scores)]
        return sum(best) / len(per_ref_scores[0])

    if "f1" in results[0]:
        return best_mean([result["f1"] for result in results])
    return {
        name: best_mean([[s.fmeasure for s in result[name]] for result in results])
        for name in results[0]
    }

# Informal-to-formal style transfer scored with `bertscore` and `bertscore_kwargs`.
print("bertscore i2f")
print("camoscio-7b", compute_i2f("camoscio-7b", bertscore, bertscore_kwargs))

bertscore i2f
camoscio-7b 0.6672154705971479


In [5]:
def compute_bertscore(ref_file, hyp_file, kwargs):
    """Return the mean of the per-sentence ``"f1"`` values from ``bertscore``.

    Args:
        ref_file: base name (no extension) of the file under ``references/``.
        hyp_file: base name (no extension) of the file under ``result_path``.
        kwargs: keyword arguments forwarded to ``bertscore.compute``.
    """
    with open(f"references/{ref_file}.txt", 'r') as handle:
        ref = [line.strip() for line in handle]
    with open(f"{result_path}/{hyp_file}.txt", 'r') as handle:
        hyp = [line.strip() for line in handle]
    f1_per_sentence = bertscore.compute(predictions=hyp, references=ref, **kwargs)["f1"]
    total = sum(f1_per_sentence)
    return total / len(f1_per_sentence)

In [None]:
# BERTScore for news summarization against references/ilpost.txt.
# NOTE(review): this prediction file name uses "news-summarization" while the
# fanpage cell uses "ns" — confirm both files exist under result_path.
print("ns_ilpost")
print("camoscio-7b", compute_bertscore("ilpost", "llama-7b-hf_news-summarization_test_ilpost", bertscore_kwargs))

In [None]:
# BERTScore for news summarization against references/fanpage.txt.
print("ns_fanpage")
print("camoscio-7b", compute_bertscore("fanpage", "llama-7b-hf_ns_test_fanpage", bertscore_kwargs))

## SQuAD-it evaluation

In [13]:
import re
import string
import sys
from collections import Counter
from datasets import load_dataset

# Load the "squad_it" dataset; evaluate() below reads the gold answers from its "test" split.
squad_it = load_dataset("squad_it")
# Show the first test example to illustrate the record layout (id, context, question, answers).
squad_it["test"][0]



  0%|          | 0/2 [00:00<?, ?it/s]

{'id': '5725b33f6a3fe71400b8952d',
 'context': 'La crisi petrolifera del 1973 iniziò nell\' ottobre 1973 quando i membri dell\' Organizzazione dei Paesi esportatori di petrolio arabo (OAPEC, composta dai membri arabi dell\' OPEC più Egitto e Siria) proclamarono un embargo petrolifero. Alla fine dell\' embargo, nel marzo 1974, il prezzo del petrolio era salito da 3 dollari al barile a quasi 12 dollari a livello mondiale; i prezzi americani erano notevolmente più elevati. L\' embargo ha causato una crisi petrolifera, o "shock", con molti effetti a breve e lungo termine sulla politica globale e sull\' economia globale. Più tardi fu chiamato il "primo shock petrolifero", seguito dalla crisi petrolifera del 1979, definita il "secondo shock petrolifero".',
 'question': 'Quando è iniziata la crisi petrolifera del 1973?',
 'answers': {'text': ['ottobre 1973',
   'ottobre 1973',
   'ottobre 1973',
   'ottobre',
   '1973'],
  'answer_start': [43, 43, 43, 43, 25]}}

In [14]:
def normalize_answer(txt):
    """Normalize an answer string before token-level comparison.

    Steps, in order: lower-case, replace the listed Italian articles and
    prepositions (whole words only) with a space, delete every character in
    ``string.punctuation``, then collapse runs of whitespace.
    """
    lowered = txt.lower()
    # Function words are removed before punctuation so that an elided article
    # followed by an apostrophe (e.g. "l'") is still matched as a whole word.
    without_articles = re.sub(r'\b(il|lo|la|i|gli|le|l)\b', ' ', lowered)
    without_prepositions = re.sub(r'\b(di|a|da|in|con|su|per|tra|fra)\b', ' ', without_articles)
    without_punctuation = without_prepositions.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(without_punctuation.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and ground truth.

    Tokens are the whitespace-separated words of ``normalize_answer(...)``;
    overlap is counted with multiplicity. Returns 0 when no token is shared.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Return the highest ``metric_fn(prediction, truth)`` over ``ground_truths``."""
    scores = []
    for truth in ground_truths:
        scores.append(metric_fn(prediction, truth))
    return max(scores)


def evaluate(predictions):
    """Average exact-match and F1 of ``predictions`` against ``squad_it["test"]``.

    Each prediction is paired positionally with a test example and scored
    against all of that example's answer texts, keeping the best score.
    Note that ``zip`` stops at the shorter of the two sequences, so surplus
    predictions or examples are ignored without error.

    Returns:
        dict with keys ``"exact_match"`` and ``"f1"``.
    """
    em_hits, f1_values = [], []
    for example, prediction in zip(squad_it["test"], predictions):
        gold_answers = example["answers"]["text"]
        em_hits.append(metric_max_over_ground_truths(exact_match_score, prediction, gold_answers))
        f1_values.append(metric_max_over_ground_truths(f1_score, prediction, gold_answers))

    return {
        "exact_match": sum(em_hits) / len(em_hits),
        "f1": sum(f1_values) / len(f1_values),
    }

In [15]:
# Read the predictions inside a `with` block so the file handle is closed
# (the inline open(...) left it open), one prediction per line.
with open(f'{result_path}/llama-7b-hf_question-answering_test.txt', 'r') as f:
    qa_predictions = f.read().splitlines()
print("camoscio-7b", evaluate(qa_predictions))


camoscio-7b {'exact_match': 0.07701406229465108, 'f1': 0.27015108152913614}


camoscio-7b {'exact_match': 0.07701406229465108, 'f1': 0.27015108152913614}

In [30]:
def compute_bertscore(ref_file, hyp_file, kwargs):
    """Mean per-sentence ``"f1"`` from ``bertscore`` for one prediction file.

    Reads ``references/{ref_file}.txt`` and ``{result_path}/{hyp_file}.txt``
    (one stripped sentence per line) and forwards ``kwargs`` to
    ``bertscore.compute``.
    """
    def read_stripped(path):
        with open(path, 'r') as handle:
            return [line.strip() for line in handle]

    ref = read_stripped(f"references/{ref_file}.txt")
    hyp = read_stripped(f"{result_path}/{hyp_file}.txt")
    f1_values = bertscore.compute(predictions=hyp, references=ref, **kwargs)["f1"]
    return sum(f1_values) / len(f1_values)

In [27]:
# BERTScore of the question-answering predictions against references/qa.txt.
print("SQUAD-it")
print("camoscio-7b", compute_bertscore("qa", "llama-7b-hf_question-answering_test", bertscore_kwargs))

SQUAD-it




camoscio-7b 0.2373131654580585


In [36]:
# ROUGE of the question-answering predictions against references/qa.txt.
# compute_rouge() reads `.mid` from each result, so `rouge_kwargs` must be the
# variant defined alongside compute_rouge, not the `use_aggregator=False` one.
print("camoscio-7b", compute_rouge("qa", "llama-7b-hf_question-answering_test", rouge_kwargs))

camoscio-7b {'rouge1': 0.2429478453736345, 'rouge2': 0.1333382459595141, 'rougeL': 0.2416336943874583}


In [21]:
# Question answering, mt5-small: ROUGE then BERTScore against references/qa.txt.
print("mt5-small", compute_rouge("qa", "mt5-small-question-answering-ita_test", rouge_kwargs))
print("mt5-small", compute_bertscore("qa", "mt5-small-question-answering-ita_test", bertscore_kwargs))

mt5-small {'rouge1': 0.6176649280125064, 'rouge2': 0.34690279865702467, 'rougeL': 0.6173512514812024}
mt5-small 0.712143312844745


In [22]:
# Question answering, mt5-base: ROUGE then BERTScore against references/qa.txt.
print("mt5-base", compute_rouge("qa", "mt5-base-question-answering-ita_test", rouge_kwargs))
print("mt5-base", compute_bertscore("qa", "mt5-base-question-answering-ita_test", bertscore_kwargs))

mt5-base {'rouge1': 0.709509904008709, 'rouge2': 0.396262003262724, 'rougeL': 0.7089794174839387}
mt5-base 0.7704119623410816


In [24]:
# Question answering, it5-small: ROUGE then BERTScore against references/qa.txt.
print("it5-small", compute_rouge("qa", "it5-small-question-answering_test", rouge_kwargs))
print("it5-small", compute_bertscore("qa", "it5-small-question-answering_test", bertscore_kwargs))

it5-small {'rouge1': 0.6712948598730601, 'rouge2': 0.3722742816420268, 'rougeL': 0.6713140331352194}
it5-small 0.7437164323812749


In [25]:
# Question answering, it5-base: ROUGE then BERTScore against references/qa.txt.
print("it5-base", compute_rouge("qa", "it5-base-question-answering_test", rouge_kwargs))
print("it5-base", compute_bertscore("qa", "it5-base-question-answering_test", bertscore_kwargs))

it5-base {'rouge1': 0.7126444413419158, 'rouge2': 0.40612353674329427, 'rougeL': 0.7124650680793831}
it5-base 0.7707937262429813


In [26]:
# Question answering, it5-large: ROUGE then BERTScore against references/qa.txt.
print("it5-large", compute_rouge("qa", "it5-large-question-answering_test", rouge_kwargs))
print("it5-large", compute_bertscore("qa", "it5-large-question-answering_test", bertscore_kwargs))

it5-large {'rouge1': 0.730023661619623, 'rouge2': 0.4120094697587379, 'rougeL': 0.7297545452055703}
it5-large 0.7845929185766455


## Exact match via ChatGPT

In [None]:
import json

# The results file is parsed line by line, one JSON document per line.
with open('eval/em_gpt_results.json') as f:
    lines = f.readlines()

# Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
new_list = [json.loads(line) for line in lines if line.strip()]


# One list of collected values per model name, filled by get_elem().
models = {
    name: []
    for name in ('it5-base', 'mt5-small', 'llama-7b-hf', 'it5-small', 'mt5-base', 'it5-large')
}

def get_elem(element, key):
    """Append the value recorded for ``key`` in ``element`` to ``models[key]``.

    The value is looked up under ``key`` first and under ``'A ' + key`` second;
    a missing or falsy value in both places is recorded as 0.
    """
    value = element.get(key) or element.get(f'A {key}') or 0
    models[key].append(value)

models_list = models.keys()

# Record one value per (record, model) pair.
for record in new_list:
    for name in models_list:
        get_elem(record, name)

# Accuracy is the mean of the values collected for each model.
print("Accuracy")
for name, verdicts in models.items():
    print(name, sum(verdicts)/len(verdicts))