# <center> Named Entity Recognition

Ce notebook contient le code utilisé pour produire les résultats de la partie 3 du rapport.

## Imports

In [1]:
import evaluate
import json
import yaml
import numpy as np
import regex as re
from datasets import Dataset
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, pipeline

# Chargement des données

In [2]:
tags = yaml.safe_load(open("data/tokens.yml", "r"))
entities = json.load(open("data/entities.json"))

# Preprocessing

In [3]:
all_tags = [v["start"] for v in tags.values()]

prefix_beginning = "B-"
prefix_inside = "I-"
labels_list = [prefix + tag for tag in all_tags for prefix in [prefix_beginning, prefix_inside]]
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {v: k for k,v in label2id.items()}

## Création du dataset

In [4]:
def lines_to_list(lines):
    return lines.split("\n")

def line_to_infos(line):
    all_tags_str = "".join(all_tags)
    pattern = rf"[{all_tags_str}].*?(?=(?:[{all_tags_str}])|$)"
    infos = {}
    infos_in_lines = re.findall(pattern, line)
    for info in infos_in_lines:
        token = info[0]
        text = info[1:].strip()
        infos[token]=text
    return infos

In [5]:
def infos_to_ds(infos):
    tokens = []
    ner_tags = []
    for tag, text in infos.items():
        splitted_text = text.split(" ")
        tags = [label2id[prefix_beginning + tag]] + [label2id[prefix_inside + tag]] * (len(splitted_text)-1)
        tokens.extend(splitted_text)
        ner_tags.extend(tags)
    return tokens, ner_tags

In [6]:
def create_dataset_dict(entities):
    lines_data = []
    for lines in entities.values():
        lines_data.extend(lines_to_list(lines))
        
    data = []
    for line_data in lines_data:
        data.append(line_to_infos(line_data))
    
    ds_dict = {
        "ner_tags":[],
        "tokens": []
        }
    
    for infos in data:
        tokens, ner_tags = infos_to_ds(infos)
        ds_dict["tokens"].append(tokens)
        ds_dict["ner_tags"].append(ner_tags)
        
    return ds_dict

In [7]:
ds = Dataset.from_dict(create_dataset_dict(entities)).train_test_split(0.2)

## Choix du modèle

In [8]:
model_checkpoint = "distilbert/distilbert-base-uncased"
# model_checkpoint = "almanach/camembert-base"

## Tokenisation des séquences

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/20358 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/5090 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Création du processus d'évaluation

In [13]:
seqeval = evaluate.load("seqeval")

In [14]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return results

## Chargement du modèle préentraîné

In [15]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(labels_list), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine tuning du modèle

In [16]:
training_args = TrainingArguments(
    output_dir="nlp_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/2546 [00:00<?, ?it/s]

{'loss': 0.53, 'grad_norm': 1.6168144941329956, 'learning_rate': 1.607227022780833e-05, 'epoch': 0.39}
{'loss': 0.1467, 'grad_norm': 2.387028694152832, 'learning_rate': 1.2144540455616654e-05, 'epoch': 0.79}


  0%|          | 0/319 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_Ⓐ': {'precision': 0.7975579975579976, 'recall': 0.99633923123856, 'f1': 0.8859351688593518, 'number': 3278}, 'eval_Ⓑ': {'precision': 0.9958649207443143, 'recall': 0.9684986595174263, 'f1': 0.9819911654774041, 'number': 1492}, 'eval_Ⓒ': {'precision': 0.9900190114068441, 'recall': 0.9701909641360037, 'f1': 0.980004704775347, 'number': 2147}, 'eval_Ⓔ': {'precision': 0.8379254457050244, 'recall': 0.8792517006802721, 'f1': 0.858091286307054, 'number': 588}, 'eval_Ⓕ': {'precision': 0.9917559769167353, 'recall': 0.9664591283390239, 'f1': 0.9789441562404638, 'number': 4979}, 'eval_Ⓗ': {'precision': 0.9380488962734393, 'recall': 0.9459071325993298, 'f1': 0.9419616255511858, 'number': 4178}, 'eval_Ⓘ': {'precision': 0.9464954521134297, 'recall': 0.9459893048128343, 'f1': 0.946242310778283, 'number': 1870}, 'eval_Ⓚ': {'precision': 0.9783533049864708, 'recall': 0.9641904761904762, 'f1': 0.9712202609363008, 'number': 2625}, 'eval_Ⓛ': {'precision': 0.4411764705882353, 'recall

  0%|          | 0/319 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_Ⓐ': {'precision': 0.7971652003910068, 'recall': 0.9951189749847468, 'f1': 0.8852103120759838, 'number': 3278}, 'eval_Ⓑ': {'precision': 0.9958649207443143, 'recall': 0.9684986595174263, 'f1': 0.9819911654774041, 'number': 1492}, 'eval_Ⓒ': {'precision': 0.9918738049713193, 'recall': 0.9664648346530041, 'f1': 0.9790044821891956, 'number': 2147}, 'eval_Ⓔ': {'precision': 0.9306759098786829, 'recall': 0.9132653061224489, 'f1': 0.9218884120171674, 'number': 588}, 'eval_Ⓕ': {'precision': 0.98828125, 'recall': 0.9654549106246234, 'f1': 0.9767347353449152, 'number': 4979}, 'eval_Ⓗ': {'precision': 0.946875, 'recall': 0.9427955959789373, 'f1': 0.9448308946989685, 'number': 4178}, 'eval_Ⓘ': {'precision': 0.9560737527114967, 'recall': 0.9427807486631016, 'f1': 0.9493807215939687, 'number': 1870}, 'eval_Ⓚ': {'precision': 0.9859649122807017, 'recall': 0.9634285714285714, 'f1': 0.9745664739884392, 'number': 2625}, 'eval_Ⓛ': {'precision': 0.6435643564356436, 'recall': 0.49618320

TrainOutput(global_step=2546, training_loss=0.19089205341833554, metrics={'train_runtime': 475.1057, 'train_samples_per_second': 85.699, 'train_steps_per_second': 5.359, 'train_loss': 0.19089205341833554, 'epoch': 2.0})

## Prédictions du modèle

In [27]:
text = " ".join(tokenized_ds["test"][0]["tokens"])
print(tokenizer.tokenize(text, add_special_tokens=False))
print(tags)

['samuel', 'louis', 'id', '##em', 'fi', '##ls', '1927', 'francais', 'id', '##em']
['B-Ⓞ', 'B-Ⓕ', 'B-Ⓜ', 'B-Ⓗ', 'B-Ⓑ', 'B-Ⓚ', 'B-Ⓘ']


In [28]:
classifier = pipeline("ner", model=model, tokenizer=tokenizer)
classifier(text)

[{'entity': 'B-Ⓞ',
  'score': 0.9934668,
  'index': 1,
  'word': 'samuel',
  'start': 0,
  'end': 6},
 {'entity': 'B-Ⓕ',
  'score': 0.99295926,
  'index': 2,
  'word': 'louis',
  'start': 7,
  'end': 12},
 {'entity': 'B-Ⓜ',
  'score': 0.9969694,
  'index': 3,
  'word': 'id',
  'start': 13,
  'end': 15},
 {'entity': 'I-Ⓜ',
  'score': 0.80385107,
  'index': 4,
  'word': '##em',
  'start': 15,
  'end': 17},
 {'entity': 'B-Ⓗ',
  'score': 0.99697304,
  'index': 5,
  'word': 'fi',
  'start': 18,
  'end': 20},
 {'entity': 'B-Ⓗ',
  'score': 0.99508345,
  'index': 6,
  'word': '##ls',
  'start': 20,
  'end': 22},
 {'entity': 'B-Ⓑ',
  'score': 0.99535537,
  'index': 7,
  'word': '1927',
  'start': 23,
  'end': 27},
 {'entity': 'B-Ⓚ',
  'score': 0.99724364,
  'index': 8,
  'word': 'francais',
  'start': 28,
  'end': 36},
 {'entity': 'B-Ⓘ',
  'score': 0.9956045,
  'index': 9,
  'word': 'id',
  'start': 37,
  'end': 39},
 {'entity': 'B-Ⓘ',
  'score': 0.5931944,
  'index': 10,
  'word': '##em',
  's