# 0 Preparations
First, install the packages needed in this notebook:

In [1]:
! pip install transformers[torch] datasets evaluate bert_score sacrebleu spacy rouge_score
! pip install git+https://github.com/google-research/bleurt.git

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting spacy
  Downloading spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting portalocker
  Downloading portalocker-2.10.0-py3-none-any.whl (18 kB)
Collecting tabulate>=0.8.9
  Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting lxml
  Down

In [2]:
# Downlaod spacy model
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# 1 Seq2seq evaluation metrics

### 1.1 You are given a candidate and a reference translation and the score of a metric. What type of metrics was used? Can you suggest better metric? Justify your answer!

```
Reference: "My cat loves to watch the birds outside the window."
Candidate: "My cat hates to watch the birds outside the window."
-> score: 0.99
```

### 1.2 You want to train a machine translation system but you only have a few thousand aligned sentences. Are there metrics that are especially suited for this low-resource scenario? Why?


### 1.3 Your friend tells you this: "I cannot use a learned metric for my task because my data is from a very special domain and there will be a domain mismatch." - Is she right? Does she miss something?



## 1.4 Recreate the scores from the lecture slides with Huggingface evaluate

In [3]:
%%capture
from evaluate import load # use the Huggingface evaluate implementations
bertscore = load("bertscore")
bleu = load("sacrebleu")
bleurt = load("bleurt", module_type="metric", checkpoint="Elron/bleurt-base-128")

2024-07-04 11:44:46.762855: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-04 11:44:46.778412: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-04 11:44:46.800730: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-04 11:44:46.800767: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-04 11:44:46.814338: I tensorflow/core/platform/cpu_feature_gua

In [4]:
print(bleu.compute(predictions=["My weekend was bad"], references=["My weekend was superb"])['score'])
print(bleu.compute(predictions=["At the weekend, we ate my grandma's house."], references=["At the weekend, we visited my grandma's house and ate cake."])['score'])
print(bleu.compute(predictions=["At the weekend, we visited my grandma's house. And we ate cake."], references=["At the weekend, we visited my grandma's house and ate cake."])['score'])

59.460355750136046
41.154215810165745
64.75445426291287


In [5]:
# This function makes comparing different scores for a given reference-candidate pair more handy
def evaluate_and_compare_scores(reference: str, candidate: str, language: str='en') -> None:
    print("Reference: ", reference)
    print("Candidate: ", candidate)

    score_bleu = bleu.compute(predictions=[candidate], references=[reference], smooth_method='none')['score']
    print(f"BLEU: {score_bleu}")
    score_bertscore = bertscore.compute(predictions=[candidate], references=[reference], lang=language)['f1']
    print(f"BERTscore: {score_bertscore}")
    score_bleurt = bleurt.compute(predictions=[candidate], references=[reference])['scores']
    print(f"BlEURT: {score_bleurt}")

In [15]:
####################################################################
# TODO come up with own examples and try to fool the scores
# Can you make further observations?
####################################################################
ref = "I feel good."
cands = ["I feel amazing.", "good I feel.", "I feel good, at least today ."]
####################################################################
for cand in cands:
    evaluate_and_compare_scores(ref, cand)
    print('***')

ref_de = "Dieses Haus ist in einer großen Stadt."
cand_de = "Das Haus in einer großen Stadt ist."
evaluate_and_compare_scores(ref_de, cand_de, language='de')

Reference:  I feel good.
Candidate:  I feel amazing.
BLEU: 0.0
BERTscore: [0.9791091084480286]
BlEURT: [0.761749267578125]
***
Reference:  I feel good.
Candidate:  good I feel.
BLEU: 0.0
BERTscore: [0.9251930713653564]
BlEURT: [0.48538005352020264]
***
Reference:  I feel good.
Candidate:  I feel good, at least today .
BLEU: 0.0
BERTscore: [0.9391999840736389]
BlEURT: [0.14559583365917206]
***
Reference:  Dieses Haus ist in einer großen Stadt.
Candidate:  Das Haus in einer großen Stadt ist.
BLEU: 39.76353643835254
BERTscore: [0.9289785027503967]
BlEURT: [0.41286009550094604]


In [None]:
####################################################################
# TODO Look at the Huggingface metrics page (https://huggingface.co/metrics)
# Select two additional metrics and test them on our sample sentences
# Note!: you may have to install additional packages to use these metrics!
####################################################################
metric1 = None
metric2 = None
####################################################################

for cand in cands:
  print("Reference: ", ref)
  print("Candidate: ", cand)
  print(f"{metric1.name}: ", metric1.compute(predictions=[cand], references=[ref]))
  print(f"{metric2.name}: ", metric2.compute(predictions=[cand], references=[ref]))

## 1.5 Explain the predicted scores

Instead of using the Huggingface evaluate library, you can also load the scoring models with the transformers library. With this, you can use any explainability framework that can interact with Huggingface to explain your score.

In [16]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [21]:
#%%capture
import torch
model_name = "Elron/bleurt-base-128"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

def predict_bleurt_score(reference:str, candidate:str) -> None:
    print("Reference: ", reference)
    print("Candidate: ", candidate)
    ####################################################################
    # TODO Tokenize the reference and candidate and feed the tokenizer
    # output into the model. Print the score prediction.
    ####################################################################
    # Tokenize the reference and candidate
    inputs = tokenizer(reference, candidate, return_tensors="pt", truncation=True, padding=True)
    
    # Feed the tokenized inputs into the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # The model outputs a dictionary, we need to extract the score
    score = outputs.logits.squeeze().item()
    
    # Print the score prediction
    print("BLEURT Score: ", score)
    ####################################################################

In [22]:
ref = ("At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. "
  "Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.")
cand = ("At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. It was really delicious! "
  "Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.")
cand2 = ("At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. "
  "Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids. It was really delicious!")
predict_bleurt_score(ref, cand)
print('***')
predict_bleurt_score(ref, cand2)

Reference:  At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.
Candidate:  At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. It was really delicious! Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.
BLEURT Score:  0.07780024409294128
***
Reference:  At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.
Candidate:  At the weekend, we visited my grandma's house an

### Both candidates hallucinate "It was really delicious!". However, the second candidate does not seem to get punished for it. Can you think of an explanation why?


# 2 Faithfulness

In this section, we fine-tune a question generation system to create a question-answering based hallucination detection system.

The steps for such a system are:


1.   Answer span extraction
2.   Question generation
3.   Question answering
4.   Answer comparison



In [25]:
####################################################################
# TODO think of additional candidates that you want to evaluate
####################################################################
source = "John became an older brother because Mary gave birth to a girl."
candidates = [ ""
]
####################################################################

## 2.1 Answer span extraction

For simplicity, we will only focus on noun answers.

Parse the candidates with spacy and extract all nouns.

In [24]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Parse the first candidate and print its annotations.
doc = nlp(candidates[0])
for token in doc:
  print(token.text, token.dep_, token.pos_, token.morph)

IndexError: list index out of range

In [None]:
# Extract all nouns from the candidates

answers = {candidate: [] for candidate in candidates}
for candidate in candidates:
  ####################################################################
  # TODO parse the candidate with spacy and append all noun tokens to
  # the answers of that candidate
  ####################################################################

  ####################################################################
answers

## 2.2.1 Train a question generation system

In [None]:
# Load the SQuAD dataset
from datasets import load_dataset

squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)
squad["train"][0]

In [None]:
# Load the model's tokenizer
from transformers import AutoTokenizer

model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_args = {
    #"padding": "max_length",
    #"return_tensors": "pt",
    "truncation": True
}

In [None]:
def prompt_pattern(answer, context):
  ####################################################################
  # TODO Design a prompt pattern for the question generation
  ####################################################################
  prompt = f"answer: {answer} context: {context}"
  ####################################################################
  return prompt

def preprocess(samples):
  ####################################################################
  # TODO Write a preprocessing function:
  # 1. Combine the answers and the contexts in a prompt
  # 2. Tokenize the inputs
  # 3. Tokenize the questions
  ####################################################################
  inputs = None
  ####################################################################
  return inputs

tokenized_squad = squad.map(preprocess, batched=True, remove_columns=squad["train"].column_names)

In [None]:
# Load the model
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Train the model
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_qg_model",
    ####################################################################
    # Set the hyperparameters for training
    ####################################################################

    ####################################################################
)

data_collator = DataCollatorForSeq2Seq(tokenizer)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
model.save_pretrained("my_awesome_qg_model")

## 2.2.2 Generate questions

In [None]:
from transformers import pipeline

question_generator = pipeline("text2text-generation", model="/content/my_awesome_qg_model", tokenizer=tokenizer)

In [None]:
questions = {candidate: [] for candidate in candidates}
for candidate in candidates:
  ####################################################################
  # TODO Use the trained model to extract questions for our samples
  ####################################################################

  ####################################################################
questions

## 2.3 Question answering

Open the [HuggingFace model hub](https://huggingface.co/models) and search for a suitable question answering model.

In [None]:
from  transformers  import  AutoTokenizer, AutoModelWithLMHead, pipeline

####################################################################
# TODO Load the model and write a function to call the model and
# retrieve the answer based on the context
####################################################################
model_name = ""
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelWithLMHead.from_pretrained(model_name)

def question_answering(question, context):
  pass
####################################################################

In [None]:
for candidate in candidates:
  print("****", candidate)
  for answer, question in zip(answers[candidate], questions[candidate]):
    print("\t", question)
    print("\t\t Original answer:", answer)
    print("\t\t Answer candidate:", question_answering(question, candidate))
    print("\t\t Answer source:", question_answering(question, source))

### **Discussion**
*  Did you find any hallucinations?
*  What kind of hallucinations cannot be detected with such a system?
*  What system could you use for these hallucinations?


