In [918]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import yaml
import re
import evaluate
import random
import torch
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments, Trainer, pipeline, AutoTokenizer, CamembertForTokenClassification, DataCollatorForTokenClassification, AutoModelForTokenClassification
from datasets import ClassLabel, Sequence
from IPython.display import display, HTML
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
hello

### Build ground truth dataframe

In [919]:
def load_df(file_path):
    df = pd.read_json(file_path, orient='index', )
    df.rename(columns={df.columns[0]: 'Information'}, inplace=True)
    return df

def load_tags(file_path):
    with open(file_path, 'r') as file:
        tags = yaml.safe_load(file)
    return tags

In [920]:
def create_dict_tags(input_dict):
    # Initialize an empty dictionary to hold the tags and their corresponding categories
    tags_categories = {}

    # Iterate over the input dictionary to extract the start markers and their corresponding categories
    for category, markers in input_dict.items():
        # Assign the start marker as the key and the category name as the value in the tags_categories dictionary
        tags_categories[markers['start']] = category

    return tags_categories

In [921]:
def create_list_tags(input_dict):
    tags_symbols = []
    for category in input_dict:
        tags_symbols.append(input_dict[category]['start'])
    return tags_symbols

In [922]:
def classify_entry(entry, tags_symbols, tags_categories):
    """Take an entry and classify each part according to the tags"""
    # Split the entry by the tags
    parts = re.split('('+ '|'.join(re.escape(tag) for tag in tags_symbols) +')', entry)

    # Dictionary to hold the classified parts with categories as keys
    classified_parts = {category: None for category in tags_categories.values()}

    # Process the split parts and classify them according to the tags
    for i in range(1, len(parts), 2):  # iterate over every second element (tags) starting from index 1
        tag = parts[i]
        value = parts[i+1].strip()  # get the value after the tag
        category = tags_categories.get(tag, None)  # get the category for the tag
        if category:
            classified_parts[category] = value  # assign the value to the correct category

    return classified_parts

In [923]:
def load_and_create_ground_truth_df(input_df, tags):

    tags_dict = create_dict_tags(tags)
    tags_list = create_list_tags(tags)

    classified_data = []
    # Concatenate all the dataframes from the entities.json file
    for i in range(len(input_df)):
        for entry in input_df['Information'].iloc[i].split('\n'):
            classified_data.append(classify_entry(entry, tags_list, tags_dict))

    # Convert the classified data into a DataFrame
    classified_df = pd.DataFrame(classified_data)
    return classified_df

In [924]:
data = load_df('../data/entities.json')
input_tokens = load_tags('../data/tokens.yml')
df = load_and_create_ground_truth_df(data, input_tokens)

### Statistiques descriptives des données disponibles

In [925]:
df.head()

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton,
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,,Ferazzi
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,,Machol
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,,Desbois
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat,


In [926]:
# Drop rows with only NaN values
df = df.dropna(how='all')

# 25075 rows (=persons), 14 columns (=categories)
df.shape

(25075, 14)

In [927]:
df.describe()

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
count,16436,7344,10705,0.0,2911,24931,20736,9236,0.0,13314,602,16179,19159,5640
unique,253,158,6,0.0,1087,2456,937,2923,0.0,73,310,2056,8120,4126
top,2,1901,Garçon,,patron,Marie,chef,idem,,française,idem,idem,idem,Martin
freq,337,138,2824,,644,2758,3022,3378,,7916,88,3695,669,29


In [928]:
# Missing values
missing_percentage = df.isnull().sum() / len(df) * 100
missing_percentage = missing_percentage.reset_index()
missing_percentage.columns = ['Column', 'MissingPercentage']
missing_percentage = missing_percentage.sort_values(by='MissingPercentage', ascending=False)

# Plot
fig = px.bar(missing_percentage, x='Column', y='MissingPercentage', text='MissingPercentage',
            #  title="Pourcentage de valeurs manquantes par catégorie",
             width=700,
             height=500)
fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside', textfont_size=15)
fig.update_layout(yaxis=dict(title='Pourcentage de valeurs manquantes'),
                  xaxis=dict(title='Catégorie'),
                  font = dict(size=14))
fig.show()


In [929]:
# Duplicates
df[df.duplicated()]
# df[(df.duplicated()) & (df['firstname']=='Marie')].shape
df.duplicated().sum()

41

In [930]:
# Unique values
unique_values_count = df.nunique()
unique_values_count = unique_values_count.reset_index()
unique_values_count.columns = ['Column', 'UniqueValuesCount']

# Sort the DataFrame by the UniqueValuesCount column
unique_values_count_sorted = unique_values_count.sort_values(by='UniqueValuesCount', ascending=False)

# Plot with the sorted data
fig = px.bar(unique_values_count_sorted, x='Column', y='UniqueValuesCount', text='UniqueValuesCount',
            #  title="Nombre de valeurs uniques par catégorie",
             width=700, height=500)
fig.update_traces(texttemplate='%{text}', textposition='outside', textfont_size=13)
fig.update_layout(yaxis=dict(title='Nombre de valeurs uniques'),
                  xaxis=dict(title='Catégorie'),
                  font = dict(size=14))
fig.show()

In [931]:
# Count number of different surnames, including surname_household
unique_surnames = set(df['surname_household']).union(set(df['surname']))
num_unique_surnames = len(unique_surnames)
print("Number of different surnames:", num_unique_surnames)

Number of different surnames: 9589


In [932]:
print(f"Civil status: {df['civil_status'].unique()}")
print(f"Civil status when surname_household is not null:")
df[df['surname_household'].notnull()]['civil_status'].value_counts()

# plot histogram
fig = px.histogram(df, x='civil_status', 
                #    title='Histogramme de la variable civil_status', 
                   width=700, height=500)
fig.update_layout(yaxis=dict(title='Count'),
                  xaxis=dict(title='Statut civil'),
                  font = dict(size=14))
fig.show()

Civil status: ['Garçon' 'Homme marié' None 'Fille' 'Femme mariée' 'Veuve' 'Veuf']
Civil status when surname_household is not null:


In [933]:
# Prénoms les plus observés
top_names = df['firstname'].value_counts().head(10)
top_names_df = top_names.reset_index()
top_names_df.columns = ['Prénom', 'Nombre de fois donné']

fig = px.bar(top_names_df, x='Prénom', y='Nombre de fois donné',
            #  title='Les 10 prénoms les plus courants',
             labels={'Prénom': 'Prénom', 'Nombre de fois donné': 'Nombre de fois donné'},
             width=700,
             height=500)

fig.update_layout(xaxis_tickangle=-45,
                #   plot_bgcolor='white',
                  xaxis=dict(title='Prénom'),
                  yaxis=dict(title='Nombre de fois donné'),
                  title=dict(x=0.5),
                  font = dict(size=14))

fig.show()

In [934]:
# Statistiques sur l'âge et l'année de naissance

# Âge
df_age = df.copy()
df_age = df_age[df_age['age'].apply(lambda x: len(str(x)) != 4)]
df_age = df_age[df_age['age'].apply(lambda x: len(str(x)) != 3)]
df_age = df_age[~df_age['age'].str.contains('\⁇')]
df_age = df_age[~df_age['age'].str.contains('\?')]
df_age = df_age.replace(to_replace='mois', value=1, regex=True)
df_age = df_age.replace(to_replace='mpis', value=1, regex=True)
df_age = df_age.replace(to_replace='semaines', value=1, regex=True)
df_age = df_age.replace(to_replace='jours', value=0, regex=True)
df_age = df_age.replace(to_replace='jour', value=0, regex=True)
df_age['age'] = df_age['age'].str.replace(' ans', '')
df_age['age'] = df_age['age'].str.replace('ans', '')
df_age['age'] = df_age['age'].str.replace('an', '')
df_age['age'] = df_age['age'].str.replace("1'", '1')

df_age = df_age[df_age['age'].apply(lambda x: len(str(x)) < 3)]
df_age['age'] = df_age['age'].astype(int)
print(f"Age moyen : {df_age['age'].mean()}")
print(f"Age médian : {df_age['age'].median()}")
# print(set(df_age['age'].unique()))

Age moyen : 31.55021181161226
Age médian : 29.0


In [935]:
# Année de naissance
df_birth_year = df.copy()
df_birth_year = df_birth_year[df_birth_year['birth_date'].apply(lambda x: len(str(x)) == 4)]
df_birth_year = df_birth_year[df_birth_year['birth_date']!='idem']
df_birth_year = df_birth_year.dropna(subset=['birth_date'])

df_birth_year['birth_date'] = df_birth_year['birth_date'].astype(int)
px.histogram(df_birth_year, x='birth_date', title='Histogramme des années de naissance', nbins=50, width=800, height=500)

#### Pre-processing

In [936]:
# Drop les colonnes vides (maiden_name, education_level) + observation car biaise
df = df.drop(columns=['maiden_name', 'education_level', 'observation'])

In [937]:
# Age
# If the date is composed of 4 digits, it is not an age, thus replace by NA
mask = df['age'].notna() & df['age'].astype(str).str.match(r'^\d{4}$')
df.loc[mask, 'age'] = None

# Birthdate
# If the date is not composed of 4 characters, it is not a birth date, thus replace by NA
mask = df['birth_date'].notna() & ~df['birth_date'].astype(str).str.match(r'^\d{4}$')
df.loc[mask, 'birth_date'] = None

In [938]:
# Replace "idem" by the previous cell value
def replace_with_previous(df):
    for col in df.columns:
        previous_value = None
        for index, row in df.iterrows():
            value = row[col]
            if value == 'idem' and previous_value is not None:
                df.at[index, col] = previous_value
            else:
                previous_value = value
    return df

df = replace_with_previous(df)

In [726]:
# Lowercase
# df = df.apply(lambda x: x.str.lower() if x.dtype == "object" else x)

In [939]:
# Preprocessing 2
# Si age ne contient pas semaines, mois, jours, an, ans ou jour et que la longueur est supérieure à 2, alors on remplace par NA
df['age'] = df['age'].apply(lambda x: None if isinstance(x, float) or (x is not None and not any(word in x for word in ['semaine', 'semaines', 'mois', 'jours', 'an', 'ans', 'jour']) and len(x) > 2) else x)

In [940]:
# impossible de tout modifier
replace_chef = ['ch d m', 'ch d m.', 'ch d mé', 'ch de m', 'ch de m ge', 'ch de m.', 'ch de ménage', 'ch. d . m', 'ch. d m', 'ch. d m.', 'ch. d. m', 'ch. d. m.', 'ch. de M.', 'ch. de m', 'ch. de m ge', 
                'ch. de m.', 'ch. de mge', 'ch. de ménage', 'ch. m', 'ch.de ménage', 'ch.m', 'ch.m.', 'chef d m', 'chef de f', 'chef de f le', 'chef de fam', 'chef de famille', 'chef de flle', 
                'chef de m e', 'chef de m ge', 'chef de m.', 'chef de maison', 'chef de mange', 'chef de mge', 'chef de mé age', 'chef de méange', 'chef de mén.', 
                'chef de ménage et veuve', 'chef de ménage, son fils', 'chef de m⁇nage', 'chef domestique', 'chef du ménage', 'chef m ge', 'chef ménage', 'CHEF DE MENAGE', 
                'Ch. de M.', 'Chef de menage', 'Chef de mén', 'Chef de ménage', 'chef de ménage d m', 'chef de ménage de f', 'chef de ménage de f le', 'chef de ménage de fam', 'chef de ménage de famille', 
                'chef de ménage de flle', 'chef de ménage de menage', 'chef de ménage de mén', 'chef de ménage de ménage', 'chef de ménage de poste', 'chef de ménage domestique', 'chef de ménage du ménage',
                'chef de ménage ge', 'chef de ménage m ge', 'chef de ménage ménage', 'chef de ménage religieuse', 'chef de ménage veuf', 'chef de ménage.', 'cheg','chf']
pattern = r'\b(?:' + '|'.join(map(re.escape, replace_chef)) + r')\b'
def replace_first_occurrence_chef(match):
    return 'chef de ménage'
df['link'] = df['link'].str.replace(pattern, replace_first_occurrence_chef, 1, regex=True)

replace_link = ['s f e', 'sa f', 'sa f e', 'sa f me', 'sa fe', 'sa felle', 'sa fem', 'sa femlme']
pattern = r'\b(?:' + '|'.join(map(re.escape, replace_link)) + r')\b'
def replace_first_occurrence_link(match):
    return 'sa femme'
df['link'] = df['link'].str.replace(pattern, replace_first_occurrence_link, 1, regex=True)

In [941]:
# set(df['link'].unique())

#### Ajouter données externes

In [942]:
# Run once
first_names = pd.read_csv('../data/firstname_with_sex.csv', sep=';')
first_names['firstname'] = first_names['firstname'].str.capitalize()
df = pd.concat([df, first_names], axis=0).drop(columns=['male', 'female'])
df = df.where(pd.notnull(df), None)
first_names.head()

Unnamed: 0,firstname,male,female
0,Marie,10145,2390322
1,Jean,1869615,6476
2,Pierre,1475841,5047
3,Jeanne,1765,1097397
4,François,1089009,5951


In [943]:
# Run once
communes = pd.read_csv('../data/georef-france-commune.csv', sep=';', usecols=['Nom Officiel Commune'])
communes.rename(columns={'Nom Officiel Commune': 'lob'}, inplace=True)
df = pd.concat([df, communes], axis=0)
df = df.where(pd.notnull(df), None)
communes.head()

Unnamed: 0,lob
0,Mirabeau
1,Revest-Saint-Martin
2,Sainte-Croix-à-Lauze
3,Remollon
4,Villar-d'Arêne


In [944]:
df.shape

(66979, 11)

#### Expérimentation

- https://nlpprogress.com/english/named_entity_recognition.html
- https://huggingface.co/transformers/v3.2.0/custom_datasets.html#token-classification-with-w-nut-emerging-entities
- https://reybahl.medium.com/token-classification-in-python-with-huggingface-3fab73a6a20e
- https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer
- https://huggingface.co/docs/transformers/tasks/token_classification#evaluate

In [945]:
def extract_text_tags_with_id_from_df(df, category_to_index):
    """Extracts the text and tags from the DataFrame and returns them as lists.

    Args:
        df (pd.DataFrame): The DataFrame containing the text and tags.
        category_to_index (dict): A dictionary mapping the categories to their corresponding indices.

    Returns:
        output (list): A list of dictionaries containing the text, tags, and ID for each row.
    """
    output = []

    for index, row in df.iterrows():
        words_list = []
        categories_list = []

        for column in df.columns:
            value = row[column]
            if pd.notnull(value) and isinstance(value, str):
                split_values = value.split()
                for i, item in enumerate(split_values):
                    # Determine the tag (B- or I-)
                    tag = f'B-{column}' if i == 0 else f'I-{column}'

                    # Append the word and its category index
                    words_list.append(item)
                    # If the tag is not found in the predefined mapping, append 0 (for 'O')
                    categories_list.append(category_to_index.get(tag, 0))
            elif pd.notnull(value):
                words_list.append(str(value))
                categories_list.append(0)  # Non-categorical or non-string values

        # For any remaining space, mark as 'O'
        categories_list += [0] * (len(words_list) - len(categories_list))

        # Append the dictionary for this row to the output list
        output_dict = {'id': str(index), 'ner_tags': categories_list, 'tokens': words_list}
        output.append(output_dict)

    return output

The letter that prefixes each ner_tag indicates the token position of the entity:
- B- indicates the beginning of an entity.
- I- indicates a token is contained inside the same entity (for example, the State token is a part of an entity like Empire State Building).

In [946]:
# Create tags and tag_ids
categories = input_tokens.keys()

# Remove maiden_name, education_level, observation from the categories
categories = [category for category in categories if category not in ['maiden_name', 'education_level', 'observation']]

# Create B- and I- tags for each category
tagged_categories = []
for category in categories:
    tagged_categories.append('B-' + category)
    tagged_categories.append('I-' + category)

# create encodings for our tags
tag2id = {tag: id for id, tag in enumerate(tagged_categories)}
id2tag = {id: tag for tag, id in tag2id.items()}

texts_tags = extract_text_tags_with_id_from_df(df, tag2id)

In [947]:
texts_tags[0:3]

[{'id': '0',
  'ner_tags': [0, 4, 8, 14, 16, 18],
  'tokens': ['25', 'Garçon', 'Cyrille', 'française', 'menuisier', 'Breton']},
 {'id': '1',
  'ner_tags': [0, 4, 8, 14, 16, 20],
  'tokens': ['30', 'Garçon', 'Auguste', 'Piémontaise', 'vitrier', 'Ferazzi']},
 {'id': '2',
  'ner_tags': [0, 4, 8, 14, 16, 20],
  'tokens': ['24', 'Garçon', 'Pierre', 'Piémontaise', 'vitrier', 'Machol']}]

In [948]:
# Shuffle the dataset to ensure random distribution
random.shuffle(texts_tags)

# Define the split ratio
train_ratio = 0.7
test_ratio = 0.15
validation_ratio = 0.15

# Calculate the split index
split_index_1 = int(len(texts_tags) * train_ratio)
split_index_2 = int(len(texts_tags) * (train_ratio + test_ratio))

# Split the data into training and test sets
train_data = texts_tags[:split_index_1]
validation_data = texts_tags[split_index_1:split_index_2]
test_data = texts_tags[split_index_2:]

len(train_data), len(validation_data), len(test_data)

(46885, 10047, 10047)

In [949]:
class MyDataset:
    # def __init__(self, train_data, test_data):
    def __init__(self, train_data, validation_data, test_data):
        # Convert list of dictionaries to Dataset directly without additional formatting
        self.dataset_dict = DatasetDict({
            'train': Dataset.from_pandas(pd.DataFrame(train_data)),
            'validation': Dataset.from_pandas(pd.DataFrame(validation_data)),
            'test': Dataset.from_pandas(pd.DataFrame(test_data)),
        })

    def get_dataset(self):
        return self.dataset_dict

In [950]:
my_dataset = MyDataset(train_data, validation_data, test_data)
# my_dataset = MyDataset(train_data, test_data)
dataset_dict = my_dataset.get_dataset()
# print(dataset_dict['train'][0])
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 46885
    })
    validation: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 10047
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 10047
    })
})


In [951]:
# Load a tokenizer to preprocess the tokens field
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

In [952]:
def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` of random elements from the dataset with their features."""
    assert num_examples <= len(dataset)
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
            
    # add tokenization column
    df['tokenized'] = df['tokens']
    df['tokenized'] = df['tokenized'].apply(lambda x: tokenizer(x, is_split_into_words=True))
    df['tokenized'] = df['tokenized'].apply(lambda x: tokenizer.convert_ids_to_tokens(x["input_ids"]))    
    display(HTML(df.to_html()))

show_random_elements(dataset_dict["train"])

Unnamed: 0,id,ner_tags,tokens,tokenized
0,12024,"[2, 6, 8, 10, 12, 14, 16, 17, 18]","[1899, Eymar, Marie, épouse, Beaumontel, française, ouv., tisseuse, Vallée]","[<s>, ▁18, 99, ▁Ey, mar, ▁Marie, ▁épouse, ▁Beaumont, el, ▁française, ▁ou, v, ., ▁t, isse, use, ▁Vallée, </s>]"
1,26340,[12],[Cier-de-Luchon],"[<s>, ▁C, ier, -, de, -, Luc, hon, </s>]"
2,30352,[12],[Altagène],"[<s>, ▁Alt, a, gène, </s>]"
3,8501,"[12, 13]","[La, Capelle-et-Masmolène]","[<s>, ▁La, ▁Cap, elle, -, et, -, Ma, s, mo, lène, </s>]"
4,1492,[12],[Davron],"[<s>, ▁D, av, ron, </s>]"
5,2618,"[12, 13]","[Les, Gets]","[<s>, ▁Les, ▁Get, s, </s>]"
6,33316,[12],[Montmorency],"[<s>, ▁Mont, mor, en, cy, </s>]"
7,13134,"[2, 8, 10, 12, 13, 14, 16, 18]","[1873, Jeanne, femme, Sorcy, Bauthémont, française, ménagère, Louis]","[<s>, ▁18, 73, ▁Jeanne, ▁femme, ▁Sor, cy, ▁Bau, thé, mont, ▁française, ▁, ménagère, ▁Louis, </s>]"
8,5811,[12],[Bréhéville],"[<s>, ▁Bré, hé, ville, </s>]"
9,12834,"[0, 8, 10, 14, 16, 18]","[18, Augustine, domestique, française, bonne, Travers]","[<s>, ▁18, ▁Augustin, e, ▁domestique, ▁française, ▁bonne, ▁Travers, </s>]"


In [953]:
def tokenize_and_align_labels(examples):
    """Réaligne les tokens et les étiquettes, et tronquer les séquences pour qu'elles ne soient pas plus longues que la longueur d'entrée maximale

    Args:
        examples (dict): Une entrée de l'ensemble de données.

    Returns:
        tokenized_inputs (dict): Les tokens encodés et alignés avec les étiquettes.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=False, is_split_into_words=True)
    # tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx: 
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [954]:
tokenized_dataset = dataset_dict.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 46885/46885 [00:01<00:00, 32987.98 examples/s]
Map: 100%|██████████| 10047/10047 [00:00<00:00, 57261.89 examples/s]
Map: 100%|██████████| 10047/10047 [00:00<00:00, 60480.01 examples/s]


In [956]:
# Verify that the labels are correctly aligned with the tokenized input, especially after tokenization,
# since wordpiece tokenization can split tokens into subwords.
print([tokenizer.convert_ids_to_tokens(input_ids) for input_ids in tokenized_dataset['train'][3]['input_ids']])
print(tokenized_dataset['train'][3])

['<s>', '▁Clermont', '-', 'd', "'", 'Ex', 'cid', 'euil', '</s>']
{'id': '18956', 'ner_tags': [12], 'tokens': ["Clermont-d'Excideuil"], 'input_ids': [5, 11073, 26, 204, 11, 4549, 15830, 7047, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 12, -100, -100, -100, -100, -100, -100, -100]}


In [957]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [958]:
# Eval
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [tagged_categories[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tagged_categories[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [959]:
len(tagged_categories)

22

In [960]:
# Train
# model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=28, id2label=id2tag, label2id=tag2id)
model = CamembertForTokenClassification.from_pretrained("camembert-base", num_labels=22, id2label=id2tag, label2id=tag2id)
# model = CamembertForTokenClassification.from_pretrained("camembert-base", num_labels=28, id2label=id2tag, label2id=tag2id)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [961]:
model

CamembertForTokenClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [962]:
training_args = TrainingArguments(
    output_dir="camembert-base-0704-1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Finetune the model
trainer.train()


Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an `accelerate.DataLoaderConfiguration` instead: 
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)

  1%|          | 36/5862 [48:21:57<7827:13:34, 4836.60s/it]
  2%|▏         | 96/5862 [09:24<23:19:34, 14.56s/it]

KeyboardInterrupt: 

In [None]:
# evaluate again on the evaluation dataset or on another dataset
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

100%|██████████| 237/237 [00:17<00:00, 13.51it/s]


{'eval_loss': 0.35700175166130066,
 'eval_precision': 0.9642298331015299,
 'eval_recall': 0.9677630430989356,
 'eval_f1': 0.9659932073499956,
 'eval_accuracy': 0.9705206463195691,
 'eval_runtime': 18.1206,
 'eval_samples_per_second': 208.768,
 'eval_steps_per_second': 13.079,
 'epoch': 2.0}

In [76]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [tagged_categories[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [tagged_categories[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
results = seqeval.compute(predictions=true_predictions, references=true_labels)

# Get scores and categories
f1_scores = {category: results[category]['f1'] for category in results if category not in ['overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']}
recall = {category: results[category]['recall'] for category in results if category not in ['overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']}
precision = {category: results[category]['precision'] for category in results if category not in ['overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']}
df_scores = pd.DataFrame(f1_scores.items(), columns=['Category', 'F1 Score'])
df_scores['Recall'] = [recall[category] for category in df_scores['Category']]
df_scores['Precision'] = [precision[category] for category in df_scores['Category']]
df_scores = df_scores.sort_values(by='F1 Score', ascending=False)
df_scores

100%|██████████| 237/237 [00:15<00:00, 15.08it/s]


Unnamed: 0,Category,F1 Score,Recall,Precision
1,birth_date,1.0,1.0,1.0
2,civil_status,0.999385,1.0,0.998771
0,age,0.997943,0.998354,0.997533
4,firstname,0.991044,0.992503,0.989589
5,link,0.980032,0.983773,0.976318
7,nationality,0.976274,0.96441,0.988434
6,lob,0.967574,0.98121,0.954312
8,occupation,0.965201,0.963727,0.966679
3,employer,0.936759,0.934911,0.938614
9,surname,0.921271,0.921271,0.921271


#### Inférence

In [849]:
# Inférence
# text = "45 ans Pierre Homme marié  française chef de ménage SNCF Breton"
# text = "82 Louis journalier chef de mén Veuf français Vendée Laidet"
# text = "auber elise s.p épouse 1842 française fleury s andelle"
# text = '66 Antoine chef française cult Dumergue'
# text = '33 Homme marié Jean chef de ménage idem idem domestique en chef Guillotteau'
# text = '20 Fille Emilie idem Vignal'
# text = '1860 Choffre Antoine Veuf Homme Femme française cultivateur'
# text = '1999 Adèle Moreau Fille française étudiante Bretagne'
# text = 'française Fille Moreau Adèle 24 ans Dinan étudiante ingénieur'
text = '24 ans Adèle Fille française Dinan étudiante ingénieur Moreau'

In [850]:
df[(df['surname']=='Moreau') & (df['firstname']=='Adèle')]

Unnamed: 0,age,birth_date,civil_status,employer,firstname,link,lob,nationality,occupation,surname,surname_household
21240,18,,,,Adèle,,,,servante,Moreau,


In [851]:
# classifier = pipeline("ner", model="camembert-base1/checkpoint-2508/")
# classifier(text)

In [852]:
tokenizer = AutoTokenizer.from_pretrained("camembert-base-0404-1/checkpoint-2208/")
inputs = tokenizer(text, return_tensors="pt")

In [853]:
model = AutoModelForTokenClassification.from_pretrained("camembert-base-0404-1/checkpoint-2208/")
with torch.no_grad():
    logits = model(**inputs).logits

In [854]:
# Get probabilities from logits for each token
index = 2

def get_probas_from_logits(logits):
    return logits.softmax(-1)

probas = get_probas_from_logits(logits)
print(f"Token associated to the word {index}: {tokenizer.convert_ids_to_tokens(int(inputs['input_ids'][0][index]))}")
print(probas[0, index, :])

# print the 3 max probabilities
print(f"Top 3 probabilities: {torch.topk(probas[0, index, :], 3).values}")
print(f"Top 3 categories: {torch.topk(probas[0, index, :], 3).indices}")
print("Categories associated to the top 3 probabilities: ", [id2tag[i] for i in torch.topk(probas[0, index, :], 3).indices.tolist()])


Token associated to the word 2: ▁ans
tensor([0.0247, 0.4029, 0.0307, 0.0333, 0.0336, 0.0270, 0.0300, 0.0311, 0.0247,
        0.0376, 0.0290, 0.0222, 0.0230, 0.0269, 0.0218, 0.0333, 0.0257, 0.0268,
        0.0217, 0.0312, 0.0235, 0.0393])
Top 3 probabilities: tensor([0.4029, 0.0393, 0.0376])
Top 3 categories: tensor([ 1, 21,  9])
Categories associated to the top 3 probabilities:  ['I-age', 'I-surname_household', 'I-firstname']


In [905]:
# Get the maximum probability for each word
max_probabilities = torch.max(probas, dim=2).values.squeeze().tolist()

# Get the tokens
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
tags = [id2tag[torch.argmax(probas[0, i, :]).item()] for i, _ in enumerate(tokens)]

fig = go.Figure()
fig.add_trace(go.Bar(x=tokens, y=max_probabilities, text=tags,))
fig.update_layout(
    # title='Top category for each token with its maximum probability',
    xaxis_tickangle=-45,
    xaxis_title='Token',
    yaxis_title='Probabilité',
    font=dict(family="Arial, sans-serif", size=17, color="RebeccaPurple"),
    width=800,
    height=600
)
fig.show()

In [917]:
# Assuming tokenizer, inputs, torch, probas, and id2tag are defined
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
N = len(tokens) - 1
cols_per_row = 4
rows = -(-N // cols_per_row)

fig = make_subplots(rows=rows, cols=cols_per_row, subplot_titles=tokens, horizontal_spacing=0.02)

for token_index in range(1,N):
    top_categories = torch.topk(probas[0, token_index, :], 5).indices.tolist()
    top_probs = torch.topk(probas[0, token_index, :], 5).values.tolist()
    category_labels = [id2tag[i] for i in top_categories]
    token_name = tokens[token_index]

    row = token_index // cols_per_row + 1
    col = token_index % cols_per_row + 1

    bar = go.Bar(x=top_probs, y=category_labels, orientation='h', name=token_name)
    fig.add_trace(bar, row=row, col=col)

    # Add annotations for each category label at the end of the bars
    for i, (prob, label) in enumerate(zip(top_probs, category_labels)):
        fig.add_annotation(x=prob, y=label, text=label, showarrow=False, xanchor='left', row=row, col=col, font=dict(size=13))

# Hide y-axis labels for all subplots
for r in range(1, rows+1):
    for c in range(1, cols_per_row+1):
        fig.update_yaxes(showticklabels=False, row=r, col=c)
        
fig.update_layout(height=250*rows, width=1000, showlegend=False, font=dict(size=14), title_font_size=16)
fig.show()
