In [4]:
import pandas as pd
import evaluate
import torch
import numpy as np
from transformers import TrainingArguments, Trainer, AutoTokenizer, CamembertForTokenClassification, DataCollatorForTokenClassification, AutoModelForTokenClassification
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from load_data import load_df, load_tags, load_and_create_ground_truth_df
from preprocessing import first_preprocessing, second_preprocessing
from experiment_helper import extract_text_tags_with_id_from_df, split_dataset, MyDataset, show_random_elements, tokenize_and_align_labels

  from .autonotebook import tqdm as notebook_tqdm


### Charger les données et preprocessing

In [5]:
# Read the raw entities and the tag definitions, then build the ground-truth frame
data = load_df('../data/entities.json')
input_tokens = load_tags('../data/tokens.yml')

# first_preprocessing prepares the data for the first model;
# second_preprocessing applies the extra steps needed by the second model.
df = second_preprocessing(
    first_preprocessing(
        load_and_create_ground_truth_df(data, input_tokens)
    )
)

In [6]:
# Add external data if needed
# first_names = pd.read_csv('../data/firstname_with_sex.csv', sep=';')
# first_names['firstname'] = first_names['firstname'].str.capitalize()
# df = pd.concat([df, first_names], axis=0).drop(columns=['male', 'female'])
# df = df.where(pd.notnull(df), None)
# first_names.head()

# communes = pd.read_csv('../data/georef-france-commune.csv', sep=';', usecols=['Nom Officiel Commune'])
# communes.rename(columns={'Nom Officiel Commune': 'lob'}, inplace=True)
# df = pd.concat([df, communes], axis=0)
# df = df.where(pd.notnull(df), None)
# communes.head()

In [7]:
# Dimensions (rows, columns) of the preprocessed DataFrame.
df.shape

(25075, 11)

### Expérimentation

In [8]:
# Label vocabulary: start from the configured categories and leave out
# maiden_name, education_level and observation.
categories = [
    name
    for name in input_tokens.keys()
    if name not in ['maiden_name', 'education_level', 'observation']
]

# Each kept category contributes its B- tag immediately followed by its I- tag
tagged_categories = []
for category in categories:
    tagged_categories.extend(('B-' + category, 'I-' + category))

# Two-way mapping between tag strings and integer ids
tag2id = {tag: idx for idx, tag in enumerate(tagged_categories)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

texts_tags = extract_text_tags_with_id_from_df(df, tag2id)

In [9]:
# Peek at the first three extracted examples.
texts_tags[:3]

[{'id': '0',
  'ner_tags': [0, 4, 8, 14, 16, 18],
  'tokens': ['25', 'Garçon', 'Cyrille', 'française', 'menuisier', 'Breton']},
 {'id': '1',
  'ner_tags': [0, 4, 8, 14, 16, 20],
  'tokens': ['30', 'Garçon', 'Auguste', 'Piémontaise', 'vitrier', 'Ferazzi']},
 {'id': '2',
  'ner_tags': [0, 4, 8, 14, 16, 20],
  'tokens': ['24', 'Garçon', 'Pierre', 'Piémontaise', 'vitrier', 'Machol']}]

In [10]:
# Partition the examples into train / validation / test subsets and report their sizes
train_data, validation_data, test_data = split_dataset(texts_tags)
tuple(len(split) for split in (train_data, validation_data, test_data))

(17552, 3761, 3762)

In [11]:
# Wrap the three splits in MyDataset and fetch the dataset dictionary it exposes.
my_dataset = MyDataset(train_data, validation_data, test_data)
dataset_dict = my_dataset.get_dataset()
# Uncomment to inspect a single training example:
# print(dataset_dict['train'][0])
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 17552
    })
    validation: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 3761
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 3762
    })
})


In [12]:
# Load the tokenizer of the "camembert-base" checkpoint; it is used below to
# split the word-level `tokens` field into sub-word pieces.
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

In [13]:
# Display a random sample of training rows together with their tokenization.
show_random_elements(dataset_dict["train"], tokenizer)

Unnamed: 0,id,ner_tags,tokens,tokenized
0,19794,"[0, 4, 8, 12, 16, 18]","[18, Garçon, Julien, Commune, domestique, Jourdain]","[<s>, ▁18, ▁Garçon, ▁Julien, ▁Commune, ▁domestique, ▁Jour, da, in, </s>]"
1,15257,"[0, 8, 10, 18]","[10, Germaine, enft, Francois]","[<s>, ▁10, ▁Germain, e, ▁en, ft, ▁Franco, is, </s>]"
2,1286,"[2, 6, 8, 10, 11, 11, 12, 13, 14, 16, 20]","[1890, Sipeyre, Albert, chef, de, ménage, Toehbach, (Rhin), francaise, Jardinier, Ethlmann]","[<s>, ▁1890, ▁Si, pe, yre, ▁Albert, ▁chef, ▁de, ▁ménage, ▁To, e, h, bach, ▁(, Rhin, ), ▁francaise, ▁Jardin, ier, ▁Et, hl, mann, </s>]"
3,2388,"[2, 8, 10, 12, 14, 16, 18]","[1903, Louise, fille, Empurany, français, ouvrière, Costerousse]","[<s>, ▁19, 03, ▁Louise, ▁fille, ▁Emp, ura, ny, ▁français, ▁ouvrière, ▁Co, ster, ousse, </s>]"
4,21122,"[0, 8, 10, 14, 18]","[45, Rose, domestique, français, Forgerit]","[<s>, ▁45, ▁Rose, ▁domestique, ▁français, ▁Forge, rit, </s>]"
5,7375,"[2, 8, 10, 12, 14, 18]","[1913, Yvonne, fille, Sermentizou, française, Gidon]","[<s>, ▁1913, ▁Yvon, ne, ▁fille, ▁Ser, ment, iz, ou, ▁française, ▁Gi, don, </s>]"
6,13150,"[2, 6, 7, 8, 10, 12, 14, 16, 20]","[1856, Croquine, Vernon, Constance, chef, Champenard, française, laveuse, Droyer]","[<s>, ▁18, 56, ▁Cro, quin, e, ▁Vern, on, ▁Constance, ▁chef, ▁Champ, en, ard, ▁française, ▁lave, use, ▁Dr, oyer, </s>]"
7,21174,"[0, 8, 10, 18]","[18, Armantine, servante, Aulneau]","[<s>, ▁18, ▁Arm, ant, ine, ▁servant, e, ▁Au, l, neau, </s>]"
8,19262,"[4, 8, 10, 11, 18]","[Garçon, Jacques, sa, femme, Bonneteau]","[<s>, ▁Garçon, ▁Jacques, ▁sa, ▁femme, ▁Bonne, t, eau, </s>]"
9,1255,"[2, 8, 10, 12, 13, 13, 14, 16, 18]","[1881, Marthe, ép, Fleury, sur, Andelle, française, s.p, Rottée]","[<s>, ▁1881, ▁Mar, the, ▁ép, ▁Fleur, y, ▁sur, ▁And, elle, ▁française, ▁s, ., p, ▁Rot, tée, </s>]"


In [14]:
# Apply tokenize_and_align_labels to every split, in batches. The lambda binds
# the tokenizer, which the helper receives as its second positional argument.
tokenized_dataset = dataset_dict.map(lambda examples: tokenize_and_align_labels(examples, tokenizer), batched=True)

Map:   0%|          | 0/17552 [00:00<?, ? examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable 

TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 17552/17552 [00:00<00:00, 34693.84 examples/s]
Map: 100%|██████████| 3761/3761 [00:00<00:00, 44348.67 examples/s]
Map: 100%|██████████| 3762/3762 [00:00<00:00, 44954.98 examples/s]


In [15]:
# Sanity check on one training example: sub-word tokenization can split a word
# into several pieces, so the labels have to stay aligned with those pieces.
sample = tokenized_dataset['train'][3]
print(tokenizer.convert_ids_to_tokens(sample['input_ids']))
print(sample)

['<s>', '▁13', '▁Fille', '▁Marie', '▁leur', '▁fille', '▁ber', 'gère', '▁La', 'rig', 'u', 'ou', '</s>']
{'id': '21505', 'ner_tags': [0, 4, 8, 10, 11, 16, 18], 'tokens': ['13', 'Fille', 'Marie', 'leur', 'fille', 'bergère', 'Lariguou'], 'input_ids': [5, 560, 8765, 1521, 97, 536, 8481, 8500, 61, 5911, 518, 308, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 4, 8, 10, 11, 16, -100, 18, -100, -100, -100, -100]}


In [16]:
# Collator for token classification, built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [17]:
# Eval
# Load the "seqeval" metric through the `evaluate` library; compute_metrics uses it for scoring.
seqeval = evaluate.load("seqeval")

In [18]:
# Number of distinct tags (one B- and one I- tag per kept category).
len(tagged_categories)

22

In [19]:
# Train
# Fine-tune CamemBERT for token classification. The size of the classification
# head is derived from the tag vocabulary instead of being hard-coded, so it
# cannot drift out of sync with id2tag / tag2id when the category list changes.
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=len(tagged_categories),
    id2label=id2tag,
    label2id=tag2id,
)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Display the model architecture.
model

CamembertForTokenClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [21]:
def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with an argmax over axis 2; ``labels`` holds
            the reference tag ids, with -100 marking positions to skip.

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop every position whose reference label is the -100 sentinel
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([tagged_categories[pred] for pred, _ in kept])
        true_labels.append([tagged_categories[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        metric: results["overall_" + metric]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

In [18]:
# Training configuration. Checkpoints are written under output_dir; evaluation
# and checkpointing both happen once per epoch, and the best checkpoint is
# reloaded when training ends.
# NOTE(review): the recorded output of this cell stops at epoch 1.0 (1097 steps)
# although num_train_epochs=2 -- the output looks stale; confirm by re-running.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="camembert-base-0704-3",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # The test split is not used here; it is evaluated separately after training.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Finetune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 46%|████▌     | 500/1097 [03:26<02:31,  3.94it/s]

{'loss': 1.5253, 'grad_norm': 1.989731788635254, 'learning_rate': 1.0884229717411122e-05, 'epoch': 0.46}


 91%|█████████ | 1000/1097 [05:52<00:27,  3.50it/s]

{'loss': 0.8621, 'grad_norm': 1.3360854387283325, 'learning_rate': 1.7684594348222425e-06, 'epoch': 0.91}


                                                   
100%|██████████| 1097/1097 [06:37<00:00,  3.49it/s]

{'eval_loss': 0.7026417851448059, 'eval_precision': 0.9534067910951203, 'eval_recall': 0.966093970742378, 'eval_f1': 0.9597084521707637, 'eval_accuracy': 0.9656755655941053, 'eval_runtime': 15.5757, 'eval_samples_per_second': 241.465, 'eval_steps_per_second': 15.152, 'epoch': 1.0}


100%|██████████| 1097/1097 [06:42<00:00,  2.73it/s]

{'train_runtime': 402.0344, 'train_samples_per_second': 43.658, 'train_steps_per_second': 2.729, 'train_loss': 1.1571335075329734, 'epoch': 1.0}





TrainOutput(global_step=1097, training_loss=1.1571335075329734, metrics={'train_runtime': 402.0344, 'train_samples_per_second': 43.658, 'train_steps_per_second': 2.729, 'train_loss': 1.1571335075329734, 'epoch': 1.0})

In [19]:
# Evaluate the trained model on the test split, which was not used during training.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

100%|██████████| 236/236 [00:35<00:00,  6.71it/s]


{'eval_loss': 0.6969587802886963,
 'eval_precision': 0.9564302346408741,
 'eval_recall': 0.9687939402186239,
 'eval_f1': 0.962572387841096,
 'eval_accuracy': 0.967741935483871,
 'eval_runtime': 35.3126,
 'eval_samples_per_second': 106.534,
 'eval_steps_per_second': 6.683,
 'epoch': 1.0}

In [20]:
# Predict on the test split and keep the highest-scoring tag id for each position
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = predictions.argmax(axis=2)

# Convert ids back to tag strings, skipping positions labelled -100
true_predictions = []
true_labels = []
for predicted_row, gold_row in zip(predictions, labels):
    kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
    true_predictions.append([tagged_categories[pred] for pred, _ in kept])
    true_labels.append([tagged_categories[gold] for _, gold in kept])
results = seqeval.compute(predictions=true_predictions, references=true_labels)

# Everything except the four overall_* entries is a per-category result
overall_keys = ('overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy')
per_category = {name: stats for name, stats in results.items() if name not in overall_keys}

f1_scores = {name: stats['f1'] for name, stats in per_category.items()}
recall = {name: stats['recall'] for name, stats in per_category.items()}
precision = {name: stats['precision'] for name, stats in per_category.items()}

# One row per category, best F1 first
df_scores = pd.DataFrame({
    'Category': list(per_category),
    'F1 Score': list(f1_scores.values()),
    'Recall': list(recall.values()),
    'Precision': list(precision.values()),
})
df_scores = df_scores.sort_values(by='F1 Score', ascending=False)
df_scores

100%|██████████| 236/236 [00:25<00:00,  9.12it/s]


Unnamed: 0,Category,F1 Score,Recall,Precision
1,birth_date,1.0,1.0,1.0
0,age,0.999794,1.0,0.999587
2,civil_status,0.999374,1.0,0.998749
4,firstname,0.989872,0.9904,0.989345
7,nationality,0.989194,0.991142,0.987255
5,link,0.979592,0.981783,0.97741
6,lob,0.970914,0.985935,0.956344
8,occupation,0.970729,0.968918,0.972546
9,surname,0.918723,0.912894,0.924627
10,surname_household,0.783305,0.847772,0.727949


#### Inférence

In [22]:
# Inference
# Example input strings; uncomment one to try a different ordering or wording
# of the fields. Only the last, uncommented assignment is used below.
# text = "45 ans Pierre Homme marié  française chef de ménage SNCF Breton"
# text = "82 Louis journalier chef de mén Veuf français Vendée Laidet"
# text = "auber elise s.p épouse 1842 française fleury s andelle"
# text = '66 Antoine chef française cult Dumergue'
# text = '33 Homme marié Jean chef de ménage idem idem domestique en chef Guillotteau'
# text = '20 Fille Emilie idem Vignal'
# text = '1860 Choffre Antoine Veuf Homme Femme française cultivateur'
# text = '1999 Adèle Moreau Fille française étudiante Bretagne'
# text = 'française Fille Moreau Adèle 24 ans Dinan étudiante ingénieur'
text = '24 ans Adèle Fille française Dinan étudiante ingénieur Moreau'

In [23]:
# Check whether the surname/firstname pair used in `text` occurs in the source data.
df[(df['surname']=='Moreau') & (df['firstname']=='Adèle')]

Unnamed: 0,age,birth_date,civil_status,employer,firstname,link,lob,nationality,occupation,surname,surname_household
21240,18,,,,Adèle,,,,servante,Moreau,


In [24]:
# classifier = pipeline("ner", model="camembert-base1/checkpoint-2508/")
# classifier(text)

In [25]:
# Reload the tokenizer from the saved checkpoint directory and encode the
# example text as PyTorch tensors.
# NOTE(review): this rebinds `tokenizer`, and the hard-coded path assumes the
# checkpoint written at step 1097 under the training output_dir exists.
tokenizer = AutoTokenizer.from_pretrained("camembert-base-0704-3/checkpoint-1097/")
inputs = tokenizer(text, return_tensors="pt")

In [26]:
# Load the fine-tuned model from the same checkpoint directory (this rebinds
# `model`) and run one forward pass without gradient tracking to get the logits.
model = AutoModelForTokenClassification.from_pretrained("camembert-base-0704-3/checkpoint-1097/")
with torch.no_grad():
    logits = model(**inputs).logits

In [27]:
# Get probabilities from logits for each token
# `index` is the position in the tokenized sequence to inspect below.
index = 2

def get_probas_from_logits(logits):
    """Turn raw logits into probabilities with a softmax over the last dimension.

    Args:
        logits: tensor of unnormalized scores.

    Returns:
        Tensor of the same shape whose entries along the last dimension sum to 1.
    """
    return torch.softmax(logits, dim=-1)

probas = get_probas_from_logits(logits)

# Probability distribution over the tags for the inspected position, and its three best entries
token_probas = probas[0, index, :]
top3 = torch.topk(token_probas, 3)

print(f"Token associated to the word {index}: {tokenizer.convert_ids_to_tokens(int(inputs['input_ids'][0][index]))}")
print(token_probas)

# print the 3 max probabilities
print(f"Top 3 probabilities: {top3.values}")
print(f"Top 3 categories: {top3.indices}")
print("Categories associated to the top 3 probabilities: ", [id2tag[i] for i in top3.indices.tolist()])


Token associated to the word 2: ▁ans
tensor([0.0373, 0.2349, 0.0518, 0.0426, 0.0369, 0.0508, 0.0454, 0.0406, 0.0265,
        0.0388, 0.0301, 0.0524, 0.0286, 0.0271, 0.0284, 0.0359, 0.0283, 0.0337,
        0.0267, 0.0390, 0.0304, 0.0339])
Top 3 probabilities: tensor([0.2349, 0.0524, 0.0518])
Top 3 categories: tensor([ 1, 11,  2])
Categories associated to the top 3 probabilities:  ['I-age', 'I-link', 'B-birth_date']


In [28]:
# Most likely tag and its probability for every token of the single input
# sequence (batch item 0). One max() call yields both the values and the
# indices, and indexing the batch explicitly avoids a blanket squeeze() that
# would collapse a one-token sequence to a scalar.
best = probas[0].max(dim=-1)
max_probabilities = best.values.tolist()

# Get the tokens
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
tags = [id2tag[tag_id] for tag_id in best.indices.tolist()]

# One bar per token: height is the top probability, bar text is the predicted tag
fig = go.Figure()
fig.add_trace(go.Bar(x=tokens, y=max_probabilities, text=tags,))
fig.update_layout(
    # title='Top category for each token with its maximum probability',
    xaxis_tickangle=-45,
    xaxis_title='Token',
    yaxis_title='Probabilité',
    font=dict(family="Arial, sans-serif", size=17, color="RebeccaPurple"),
    width=800,
    height=600
)
fig.show()

In [29]:
# Per-token breakdown: one horizontal bar chart per token showing its
# highest-probability tags.
# Relies on tokenizer, inputs, probas and id2tag from the cells above.
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
N = len(tokens)
cols_per_row = 4
rows = -(-N // cols_per_row)  # ceiling division
# torch.topk raises when k exceeds the dimension size, so cap k at the number of tags
top_k = min(5, probas.shape[-1])

fig = make_subplots(rows=rows, cols=cols_per_row, subplot_titles=tokens, horizontal_spacing=0.02)

for token_index, token_name in enumerate(tokens):
    # A single topk call provides both the probabilities and the tag ids
    top = torch.topk(probas[0, token_index, :], top_k)
    top_probs = top.values.tolist()
    category_labels = [id2tag[i] for i in top.indices.tolist()]

    # Subplot grid positions are 1-based
    row = token_index // cols_per_row + 1
    col = token_index % cols_per_row + 1

    bar = go.Bar(x=top_probs, y=category_labels, orientation='h', name=token_name)
    fig.add_trace(bar, row=row, col=col)

    # Add annotations for each category label at the end of the bars
    for prob, label in zip(top_probs, category_labels):
        fig.add_annotation(x=prob, y=label, text=label, showarrow=False, xanchor='left', row=row, col=col, font=dict(size=13))

# Hide y-axis labels for all subplots (without a row/col selector the update applies to every y-axis)
fig.update_yaxes(showticklabels=False)

fig.update_layout(height=250*rows, width=1000, showlegend=False, font=dict(size=14), title_font_size=16)
fig.show()
