# *Notebook* à utiliser pour faire le travail pratique # 3 sur l'analyse d'incidents.





## Bibliothèque de fonctions utilitaires

In [None]:
!pip install transformers torch
!pip install transformers[torch]
!pip install evaluate==0.4.0
!pip install rouge_score==0.1.2

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting evaluate==0.4.0
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate==0.4.0)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate==0.4.0)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from e

In [None]:
# Importation des bibliothèques nécessaires
import string
import re
import argparse
import json
import sys
import spacy
import pandas as pd
import numpy as np
import json
import torch
import transformers
import matplotlib.pyplot as plt
import evaluate

from google.colab import drive
from __future__ import print_function
from collections import Counter
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

Quelques fonctions qui nous seront utiles pour mener à bien notre tâche.

In [None]:
def normalize_answer(s):
    """Normalize an answer string before comparing it to another one.

    The steps are applied in this order: lowercase the text, drop every
    character found in ``string.punctuation``, replace the standalone English
    articles "a", "an" and "the" with a space, and finally collapse all runs of
    whitespace into single spaces (which also trims both ends).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punctuation = ''.join(char for char in lowered if char not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Return the token-level F1 between a prediction and one reference.

    Both texts are normalized with ``normalize_answer`` and split on
    whitespace. Precision and recall are computed from the multiset overlap of
    the two token lists.

    Returns an ``int`` (1 when both token lists are empty, 0 when only one is
    empty or when nothing overlaps) or a ``float`` F1 value otherwise.
    """
    predicted_tokens = normalize_answer(prediction).split()
    reference_tokens = normalize_answer(ground_truth).split()

    # With an empty side, the score is 1 only if both sides are empty.
    if not reference_tokens or not predicted_tokens:
        return int(reference_tokens == predicted_tokens)

    # Counter intersection keeps the minimum count of each shared token.
    overlap = sum((Counter(predicted_tokens) & Counter(reference_tokens)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(predicted_tokens)
    recall = 1.0 * overlap / len(reference_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both texts are equal once passed through ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_reference = normalize_answer(ground_truth)
    return normalized_prediction == normalized_reference


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score a prediction against every reference and keep the best score.

    ``ground_truths`` is an iterable because several answers may be acceptable;
    ``metric_fn(prediction, ground_truth)`` is evaluated for each of them.
    An empty ``ground_truths`` raises ``ValueError`` (from ``max``).
    """
    return max(metric_fn(prediction, reference) for reference in ground_truths)

In [None]:
def load_dataset(filename, base_url):
    """Read one JSON dataset file and return its decoded content.

    Args:
        filename: Name of the JSON file (e.g. the train, dev or test file).
        base_url: Directory containing the file; joined to ``filename`` with "/".

    Returns:
        Whatever ``json.load`` decodes from the file (the caller in this
        notebook treats it as a list of incident records).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(base_url + '/' + filename, 'r', encoding='utf-8') as fp:
        return json.load(fp)

In [None]:
def dataset_df(data, question_to_sentence):
    """Flatten incident records into one DataFrame row per (incident, argument).

    Args:
        data: Iterable of records; each record must provide ``entry['text']``
            (the context) and ``entry['arguments']`` (a mapping from argument
            key to its answer value).
        question_to_sentence: Mapping from argument key to the natural-language
            question to ask for that argument.

    Returns:
        A ``pandas.DataFrame`` with the columns ``question``, ``answer`` and
        ``context``, in record order then argument order.

    Raises:
        KeyError: If an argument key has no entry in ``question_to_sentence``.
    """
    questions = []
    answers = []
    contexts = []

    # Iterate over the records passed in, not over a notebook-level global.
    for entry in data:
        context = entry['text']
        for argument_key, answer in entry['arguments'].items():
            contexts.append(context)
            questions.append(question_to_sentence[argument_key])
            answers.append(answer)

    return pd.DataFrame({
        'question': questions,
        'answer': answers,
        'context': contexts,
    })


In [None]:
def evaluate_dataset_generative(datasets, pipe):
    """Answer every row with the generative pipeline and score each answer.

    For each row of ``datasets`` the ``prompt`` column is sent to ``pipe``
    (sampling enabled, at most 40 new tokens, prompt not echoed back) and the
    first generated text is compared with the row's ``answer`` value using
    exact match and F1, keeping the best score over the references.

    Returns:
        Three lists aligned with the rows: F1 scores, exact-match scores and
        the generated predictions.
    """
    all_f1, all_em, all_predictions = [], [], []
    for _, example in datasets.iterrows():
        generated = pipe(example["prompt"], max_new_tokens=40, do_sample=True, top_k=10, return_full_text = False)
        answer_text = generated[0]['generated_text']
        # assumes the "answer" cell is an iterable of acceptable answers — TODO confirm
        references = example["answer"]

        em_value = metric_max_over_ground_truths(exact_match_score, answer_text, references)
        f1_value = metric_max_over_ground_truths(f1_score, answer_text, references)

        all_f1.append(f1_value)
        all_em.append(em_value)
        all_predictions.append(answer_text)

    return all_f1, all_em, all_predictions

In [None]:
def get_answer(context, question, prompt, model, tokenizer):
    """Extract the answer span predicted by an extractive QA model.

    Args:
        context: Text in which the answer is searched.
        question: Question asked about ``context``.
        prompt: Unused; kept so existing callers keep working.
        model: Callable QA model whose output exposes ``start_logits`` and
            ``end_logits`` (one row per input sequence).
        tokenizer: Tokenizer providing ``encode_plus``,
            ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.

    Returns:
        The decoded text of the predicted span (special tokens skipped); it may
        be an empty string.
    """
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True)

    # Inference only: avoid building an autograd graph for every example.
    with torch.no_grad():
        outputs = model(**inputs)
    # Read the logits by name instead of relying on the order of the output fields.
    start_logits, end_logits = outputs.start_logits, outputs.end_logits

    start_index = torch.argmax(start_logits, dim=1).item()
    end_index = torch.argmax(end_logits, dim=1).item()

    if end_index < start_index:
        # Inconsistent span: collapse it onto the boundary the model is most
        # confident about (ties go to the end position).
        if start_logits[0].max().item() > end_logits[0].max().item():
            end_index = start_index
        else:
            start_index = end_index

    span_ids = inputs["input_ids"][0][start_index:end_index + 1].tolist()
    span_tokens = tokenizer.convert_ids_to_tokens(span_ids, skip_special_tokens=True)
    return tokenizer.convert_tokens_to_string(span_tokens)

In [None]:
def evaluate_dataset_extractif(datasets, model, tokenizer):
    """Answer every row with the extractive model and score each answer.

    For each row of ``datasets`` the answer is obtained with ``get_answer``
    from the ``context``, ``question`` and ``prompt`` columns, then compared
    with the row's ``answer`` value using exact match and F1, keeping the best
    score over the references.

    Returns:
        Three lists aligned with the rows: F1 scores, exact-match scores and
        the extracted predictions.
    """
    all_f1, all_em, all_predictions = [], [], []
    for _, example in datasets.iterrows():
        extracted = get_answer(example["context"], example["question"], example["prompt"], model, tokenizer)
        # assumes the "answer" cell is an iterable of acceptable answers — TODO confirm
        references = example["answer"]

        em_value = metric_max_over_ground_truths(exact_match_score, extracted, references)
        f1_value = metric_max_over_ground_truths(f1_score, extracted, references)

        all_f1.append(f1_value)
        all_em.append(em_value)
        all_predictions.append(extracted)

    return all_f1, all_em, all_predictions

In [None]:
question_to_sentence = [
    {
      "EVENT": "What event occurred?",
      "ACTIVITY": "What activity was being performed?",
      "WHO": "Who was involved?",
      "WHERE": "Where did the incident occur?",
      "WHEN": "When did the incident take place?",
      "CAUSE": "What caused the incident?",
      "EQUIPMENT": "What equipment was involved?",
      "INJURY": "What injuries were sustained?",
      "INJURED": "Who was injured?",
      "BODY-PARTS": "Which body parts were affected?",
      "DEATH": "Who died in the incident?",
      "SUBSTANCE": "What substance was involved in the incident?"
    },
    {
      "EVENT": "What was the nature of the event?",
      "ACTIVITY": "What activity was being performed at the time of the event?",
      "WHO": "Who were the individuals involved in the event?",
      "WHERE": "Where did the event take place?",
      "WHEN": "When did the event occur?",
      "CAUSE": "What was the cause of the event?",
      "EQUIPMENT": "What equipment or vehicles were involved?",
      "INJURY": "Were there any injuries, and if so, what kind",
      "INJURED": "Who sustained injuries in the event?",
      "BODY-PARTS": "What damage occurred to the vehicles involved?",
      "DEATH": "What impact did the event have on traffic?",
      "SUBSTANCE": "What was the key substance involved or impacted by the situation?"
    }
]

torch.manual_seed(0)

<torch._C.Generator at 0x7ac2829ef270>

## Expérience avec un modèle génératif

### Création du modèle

In [None]:
# Generative model used for the first experiment.
model_name = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad with the end-of-sequence token. The former follow-up assignment
# `tokenizer.pad_token_id = 0` was dropped: it immediately re-pointed the
# padding token at vocabulary id 0, undoing this line.
tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline in bfloat16; device placement is left to `device_map="auto"`.
pipe = pipeline("text-generation", model=model_name, tokenizer=tokenizer, torch_dtype=torch.bfloat16, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

### Chargement des données

In [None]:
# Location of the development-set file (only the dev split is loaded here).
incidents_dev_fn = "dev_examples.json"
base_url = '/content/data'

# Load the raw incident records from the JSON file.
incidents_dev = load_dataset(incidents_dev_fn, base_url)

# One row per (incident, argument), using the first set of question phrasings.
df = dataset_df(incidents_dev, question_to_sentence[0])

In [None]:
# Work on a copy so that the question/answer/context frame stays untouched.
f_datasets = df.copy()

# One instruction-style prompt per row, built from its context and question.
prompts = [
    f"Answer the question using the context below.\nContext: {row['context']}\nQuestion: {row['question']}\nAnswer:"
    for _, row in df.iterrows()
]

f_datasets['prompt'] = prompts

### Evaluation des résultats

In [None]:
# Run the generative pipeline on every prompt and keep the per-row results
# next to the inputs they were computed from.
f1_scores, em_scores, predictions = evaluate_dataset_generative(f_datasets, pipe)
f_datasets['f1_score'] = f1_scores
f_datasets['em_score'] = em_scores
f_datasets['prediction'] = predictions

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting

In [None]:
# Summary of the generative run.
print(f"Nombre d'exemples: {len(em_scores)}")

# Mean F1 over all rows, expressed as a percentage.
print(f"F1 score: {np.mean(f1_scores) * 100}")

# Number of exact matches (a count, not a percentage). Summing the scores
# directly avoids indexing the DataFrame with a list used as a boolean mask.
print(f"EM score: {sum(em_scores)}")

# Display the first predictions for a quick visual check.
predictions[:20]

Nombre d'exemples: 1130
F1 score: 35.96483794250167
EM score: 170


[' An employee at a Wirtan Wirted machine was with a coworker on the road at night, and the of machine a construction company was and was and. The was the. was',
 ' Milling machine.',
 ' Employee #1 Wirtgen, the and milling machine #1 and and the center 2006: the was # and #1 and #1: # #',
 ' In\nThe incident occurred in a median on the of of a 12 ft wide Interstate. The was #1 and the 200 feet of the median on 200 ft (approximately the',
 ' November 10 2013',
 ' The cause of the was due to the of and a 45-degree angle the of the the of the with the and the was the to in the. The #1 the (',
 ' The equipment involved was a Wirtagen milling machine, a construction inc.,, and protective Services, Inc. (PSI)',
 '\nEmployee 1 was #1, who died as a result of severe head trauma.',
 ' The operator of the machine the milling machine and Employee with Villager the with coworker.',
 ' The head of was',
 ' The construction of the milling machine',
 ' On August 28 2012 at 10:00 a.m Employee #1 suff

## Expérience avec un modèle extractif

In [None]:
# Extractive question-answering model used for the second experiment.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# The padding-token overrides copied from the generative setup were removed:
# `pad_token = eos_token` followed by `pad_token_id = 0` only reset the padding
# token and then restored it by id.
# NOTE(review): presumably this tokenizer defines no EOS token and already ships
# its own padding token — confirm `tokenizer.pad_token` after loading.
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Evaluate the extractive model on the same rows as the generative one.
s_datasets = f_datasets.copy()
extract_f1_scores, extract_em_scores, extract_predictions = evaluate_dataset_extractif(s_datasets, model, tokenizer)
# Store the extractive results (not the generative ones) alongside the inputs.
s_datasets['f1_score'] = extract_f1_scores
s_datasets['em_score'] = extract_em_scores
s_datasets['prediction'] = extract_predictions

In [None]:
# Summary of the extractive run.
print(f"Nombre d'exemples: {len(extract_em_scores)}")

# Mean F1 over all rows, expressed as a percentage.
print(f"F1 score: {np.mean(extract_f1_scores) * 100}")

# Number of exact matches (a count, not a percentage). Summing the scores
# directly avoids masking the generative-run DataFrame to count them.
print(f"EM score: {sum(extract_em_scores)}")

# Display the first predictions for a quick visual check.
extract_predictions[:20]

Nombre d'exemples: 1130
F1 score: 58.63892426886718
EM score: 533


['',
 '',
 'employee # 1 with villager construction inc.',
 'railroad bridge overpass',
 'november 10 2013',
 '',
 'asphalt milling machine',
 '',
 'employee # 1',
 '',
 'employee # 1',
 '',
 'demolish the interiors of the building',
 'employee # 1',
 'menlo park california',
 'august 27 2012',
 'serious fracture injury to his left leg',
 'gradall machine',
 'serious fracture injury to his left leg',
 'employee # 1']

## Analyse et Conclusion

En comparant les performances des deux modèles, nous sommes arrivés à l'analyse et à la conclusion suivantes :

A. **Analyse des modèles:**

1. **Tiiuae/falcon-7b-instruct (Modèle Génératif):**

* F1 Score : 35.96%
* EM Score : 170 sur 1130

Ce modèle génère des réponses, ce qui signifie qu'il crée du contenu basé sur les informations qu'il a apprises lors de l'entraînement.
Un score F1 plus faible suggère que les réponses générées ne correspondent pas toujours précisément aux réponses attendues, et il peut y avoir des erreurs de précision ou un manque de pertinence.
Un score EM relativement bas indique que dans la plupart des cas, les réponses générées ne sont pas exactement ce qui est recherché.

<br>

2. **Bert-large-uncased-whole-word-masking-finetuned-squad (Modèle Extractif):**


* F1 Score : 58.64%
* EM Score : 533 sur 1130

Ce modèle extractif trouve des réponses en extrayant directement des segments de texte à partir d'un contexte donné.
Un score F1 plus élevé suggère une meilleure précision et pertinence dans la récupération des réponses.
Un score EM élevé indique que le modèle est capable de trouver la réponse exacte dans un nombre considérable de cas.

<br>

B. **Conclusion:**

On peut retenir de l'analyse qui précède que:

* Le modèle extractif "bert-large-uncased-whole-word-masking-finetuned-squad" surpasse le modèle génératif "tiiuae/falcon-7b-instruct" en termes de précision et de pertinence des réponses dans le cadre de la tâche question-réponse.

* La nature extractive de "bert-large-uncased-whole-word-masking-finetuned-squad" le rend plus adapté pour des tâches où la réponse exacte doit être trouvée dans un contexte donné, comme c'est souvent le cas dans les systèmes de question-réponse.

* Le modèle génératif, malgré sa capacité à créer des réponses nouvelles et uniques, peut manquer de précision et d'exactitude par rapport à un modèle extractif dans ce contexte spécifique.

<br>
En résumé, pour une application de question-réponse où l'exactitude et la pertinence des informations sont cruciales, un modèle extractif comme "bert-large-uncased-whole-word-masking-finetuned-squad" serait probablement plus approprié que le modèle génératif "tiiuae/falcon-7b-instruct".