In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import yaml
import re
import evaluate
import random
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline, DistilBertTokenizerFast, DistilBertForTokenClassification, AutoTokenizer, CamembertModel, CamembertForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


### Build ground truth dataframe

In [2]:
def load_df(file_path):
    df = pd.read_json(file_path, orient='index', )
    df.rename(columns={df.columns[0]: 'Information'}, inplace=True)
    return df

def load_tags(file_path):
    with open(file_path, 'r') as file:
        tags = yaml.safe_load(file)
    return tags

In [3]:
def create_dict_tags(input_dict):
    # Initialize an empty dictionary to hold the tags and their corresponding categories
    tags_categories = {}
    
    # Iterate over the input dictionary to extract the start markers and their corresponding categories
    for category, markers in input_dict.items():
        # Assign the start marker as the key and the category name as the value in the tags_categories dictionary
        tags_categories[markers['start']] = category
    
    return tags_categories

In [4]:
def create_list_tags(input_dict):
    tags_symbols = []
    for category in input_dict:
        tags_symbols.append(input_dict[category]['start'])
    return tags_symbols

In [5]:
def classify_entry(entry, tags_symbols, tags_categories):
    """Take an entry and classify each part according to the tags"""
    # Split the entry by the tags
    parts = re.split('('+ '|'.join(re.escape(tag) for tag in tags_symbols) +')', entry)
    
    # Dictionary to hold the classified parts with categories as keys
    classified_parts = {category: None for category in tags_categories.values()}
    
    # Process the split parts and classify them according to the tags
    for i in range(1, len(parts), 2):  # iterate over every second element (tags) starting from index 1
        tag = parts[i]
        value = parts[i+1].strip()  # get the value after the tag
        category = tags_categories.get(tag, None)  # get the category for the tag
        if category:
            classified_parts[category] = value  # assign the value to the correct category
    
    return classified_parts

In [6]:
def load_and_create_ground_truth_df(input_df, tags):
    
    tags_dict = create_dict_tags(tags)
    tags_list = create_list_tags(tags)

    classified_data = []
    # Concatenate all the dataframes from the entities.json file
    for i in range(len(input_df)):
        for entry in input_df['Information'].iloc[i].split('\n'):
            classified_data.append(classify_entry(entry, tags_list, tags_dict))

    # Convert the classified data into a DataFrame
    classified_df = pd.DataFrame(classified_data)
    return classified_df

In [86]:
data = load_df('../data/entities.json')
input_tokens = load_tags('../data/tokens.yml')
df = load_and_create_ground_truth_df(data, input_tokens)

### Statistiques descriptives des données disponibles

- Get average age

In [87]:
df.head()

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton,
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,,Ferazzi
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,,Machol
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,,Desbois
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat,


In [88]:
# Drop rows with only NaN values
df = df.dropna(how='all')

# 25075 rows (=persons), 14 columns (=categories)
df.shape

(25075, 14)

In [89]:
df.describe()

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
count,16436,7344,10705,0.0,2911,24931,20736,9236,0.0,13314,602,16179,19159,5640
unique,253,158,6,0.0,1087,2456,937,2923,0.0,73,310,2056,8120,4126
top,2,1901,Garçon,,patron,Marie,chef,idem,,française,idem,idem,idem,Martin
freq,337,138,2824,,644,2758,3022,3378,,7916,88,3695,669,29


In [90]:
# Missing values
missing_percentage = df.isnull().sum() / len(df) * 100
missing_percentage = missing_percentage.reset_index()
missing_percentage.columns = ['Column', 'MissingPercentage']
missing_percentage = missing_percentage.sort_values(by='MissingPercentage', ascending=False)

# Plot
fig = px.bar(missing_percentage, x='Column', y='MissingPercentage', text='MissingPercentage',
             title="Pourcentage de valeurs manquantes par catégorie",
             width=800,
             height=500)
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.update_layout(yaxis=dict(title='Pourcentage de valeurs manquantes'),
                  xaxis=dict(title='Catégories'))
fig.show()


In [91]:
# Duplicates
df[df.duplicated()]
# df[(df.duplicated()) & (df['firstname']=='Marie')].shape
df.duplicated().sum()

41

In [92]:
# Unique values
unique_values_count = df.nunique()
unique_values_count = unique_values_count.reset_index()
unique_values_count.columns = ['Column', 'UniqueValuesCount']

# Sort the DataFrame by the UniqueValuesCount column
unique_values_count_sorted = unique_values_count.sort_values(by='UniqueValuesCount', ascending=False)

# Plot with the sorted data
fig = px.bar(unique_values_count_sorted, x='Column', y='UniqueValuesCount', text='UniqueValuesCount',
             title="Nombre de valeurs uniques par catégorie", 
             width=800, height=500)
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(yaxis=dict(title='Nombre de valeurs uniques'),
                  xaxis=dict(title='Catégorie'))
fig.show()

In [93]:
# Count number of different surnames, including surname_household
unique_surnames = set(df['surname_household']).union(set(df['surname']))
num_unique_surnames = len(unique_surnames)
print("Number of different surnames:", num_unique_surnames)

Number of different surnames: 9589


In [94]:
print(f"Civil status: {df['civil_status'].unique()}")
print(f"Civil status when surname_household is not null:")
df[df['surname_household'].notnull()]['civil_status'].value_counts()

Civil status: ['Garçon' 'Homme marié' None 'Fille' 'Femme mariée' 'Veuve' 'Veuf']
Civil status when surname_household is not null:


civil_status
Homme marié     1494
Veuve            263
Veuf             191
Garçon            97
Fille             93
Femme mariée      25
Name: count, dtype: int64

In [95]:
# Prénoms les plus observés
top_names = df['firstname'].value_counts().head(10)
top_names_df = top_names.reset_index()
top_names_df.columns = ['Prénom', 'Nombre de fois donné']

fig = px.bar(top_names_df, x='Prénom', y='Nombre de fois donné',
             title='Les 10 prénoms les plus courants',
             labels={'Prénom': 'Prénom', 'Nombre de fois donné': 'Nombre de fois donné'},
             width=800,
             height=500)

fig.update_layout(xaxis_tickangle=-45,
                #   plot_bgcolor='white', 
                  xaxis=dict(title='Prénom'), 
                  yaxis=dict(title='Nombre de fois donné'), 
                  title=dict(x=0.5)) 

fig.show()

In [141]:
# Statistiques sur l'âge et l'année de naissance

# Âge
df_age = df.copy()
df_age = df_age[df_age['age'].apply(lambda x: len(str(x)) != 4)]
df_age = df_age[df_age['age'].apply(lambda x: len(str(x)) != 3)]
df_age = df_age[~df_age['age'].str.contains('\⁇')]
df_age = df_age[~df_age['age'].str.contains('\?')]
df_age = df_age.replace(to_replace='mois', value=1, regex=True)
df_age = df_age.replace(to_replace='mpis', value=1, regex=True)
df_age = df_age.replace(to_replace='semaines', value=1, regex=True)
df_age = df_age.replace(to_replace='jours', value=0, regex=True)
df_age = df_age.replace(to_replace='jour', value=0, regex=True)
df_age['age'] = df_age['age'].str.replace(' ans', '')
df_age['age'] = df_age['age'].str.replace('ans', '')
df_age['age'] = df_age['age'].str.replace('an', '')
df_age['age'] = df_age['age'].str.replace("1'", '1')

df_age = df_age[df_age['age'].apply(lambda x: len(str(x)) < 3)]
df_age['age'] = df_age['age'].astype(int)
print(f"Age moyen : {df_age['age'].mean()}")
print(f"Age médian : {df_age['age'].median()}")
# print(set(df_age['age'].unique()))

Age moyen : 31.55021181161226
Age médian : 29.0


In [163]:
# Année de naissance
df_birth_year = df.copy()
df_birth_year = df_birth_year[df_birth_year['birth_date'].apply(lambda x: len(str(x)) == 4)]
df_birth_year = df_birth_year[df_birth_year['birth_date']!='idem']
df_birth_year = df_birth_year.dropna(subset=['birth_date'])

df_birth_year['birth_date'] = df_birth_year['birth_date'].astype(int)
px.histogram(df_birth_year, x='birth_date', title='Histogramme des années de naissance', nbins=50, width=800, height=500)

#### Pre-processing

Questions:
- if idem replace by NA or leave?
- same questions for inconsistent values?

In [164]:
# Age
# If the date is composed of 4 digits, it is not an age, thus replace by NA
mask = df['age'].notna() & df['age'].astype(str).str.match(r'^\d{4}$')
df.loc[mask, 'age'] = pd.NA

# Birthdate
# If the date is not composed of 4 characters, it is not a birth date, thus replace by NA
mask = df['birth_date'].notna() & ~df['birth_date'].astype(str).str.match(r'^\d{4}$')
df.loc[mask, 'birth_date'] = pd.NA

In [18]:
# Link
# df['link'] = df['link'].replace('sa fe', 'sa femme')
# df['link'] = df['link'].replace('(sa femme)', 'sa femme')
# df['link'] = df['link'].replace('CHEF DE MENAGE', 'Chef de ménage')
# df['link'] = df['link'].replace('Chef', 'Chef de ménage')
# df['link'] = df['link'].replace('Chef de menage', 'Chef de ménage')
# df['link'] = df['link'].replace('Chef de mén', 'Chef de ménage')

# Beaucoup de valeurs similaires sont écrites de manière différente, utiles de les modifier?

In [19]:
# Nationality
# df['nationality'] = df['nationality'].replace('idem', pd.NA).ffill()

#### LLM

- https://nlpprogress.com/english/named_entity_recognition.html
- https://huggingface.co/transformers/v3.2.0/custom_datasets.html#token-classification-with-w-nut-emerging-entities
- https://reybahl.medium.com/token-classification-in-python-with-huggingface-3fab73a6a20e

In [21]:
# Preprocessing for distilbert-base-uncased

# Since distilbert-base-uncased is an uncased model, convert your text input to lowercase to match the case of the training data the model was originally trained on.
df = df.apply(lambda x: x.str.lower() if x.dtype == "object" else x)

In [23]:
df

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
0,25,,garçon,,,cyrille,,,,française,,menuisier,breton,
1,30,,garçon,,,auguste,,,,piémontaise,,vitrier,,ferazzi
2,24,,garçon,,,pierre,,,,piémontaise,,vitrier,,machol
3,48,,homme marié,,,alexandre,,,,française,,prop re,,desbois
4,30,,,,,zélie,sa fe,,,française,,prop re,vignat,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25443,,1887,,,,annunziata,épouse,idem,,idem,,,berni-laureti,
25444,,1914,,,,primo,fils,idem,,idem,,,berni,
25445,,,,,,,,,,,,,,
25446,,,,,,,,,,,,,,


In [22]:
def extract_text_tags_with_id_from_df(df, category_to_index):
    # Prepare the output list of dictionaries
    output = []
    
    # Iterate over each row and column
    for index, row in df.iterrows():
        words_list = []
        categories_list = []
        
        for column in df.columns:
            value = row[column]
            if pd.notnull(value) and isinstance(value, str):
                split_values = value.split()
                for i, item in enumerate(split_values):
                    # Determine the tag (B- or I-)
                    tag = f'B-{column}' if i == 0 else f'I-{column}'
                    
                    # Append the word and its category index
                    words_list.append(item)
                    # If the tag is not found in the predefined mapping, append 0 (for 'O')
                    categories_list.append(category_to_index.get(tag, 0))
            elif pd.notnull(value):
                words_list.append(str(value))
                categories_list.append(0)  # Non-categorical or non-string values
        
        # For any remaining space, mark as 'O'
        categories_list += [0] * (len(words_list) - len(categories_list))
        
        # Append the dictionary for this row to the output list
        output_dict = {'id': str(index), 'ner_tags': categories_list, 'tokens': words_list}
        output.append(output_dict)
    
    return output

The letter that prefixes each ner_tag indicates the token position of the entity:
- B- indicates the beginning of an entity.
- I- indicates a token is contained inside the same entity (for example, the State token is a part of an entity like Empire State Building).

In [25]:
# Create tags and tag_ids
categories = input_tokens.keys()
tagged_categories = []
for category in categories:
    tagged_categories.append('B-' + category)
    tagged_categories.append('I-' + category)

# create encodings for our tags
tag2id = {tag: id for id, tag in enumerate(tagged_categories)}
id2tag = {id: tag for tag, id in tag2id.items()}

texts_tags = extract_text_tags_with_id_from_df(df, tag2id)

In [26]:
texts_tags[0:3]

[{'id': '0',
  'ner_tags': [0, 4, 10, 18, 22, 24],
  'tokens': ['25', 'garçon', 'cyrille', 'française', 'menuisier', 'breton']},
 {'id': '1',
  'ner_tags': [0, 4, 10, 18, 22, 26],
  'tokens': ['30', 'garçon', 'auguste', 'piémontaise', 'vitrier', 'ferazzi']},
 {'id': '2',
  'ner_tags': [0, 4, 10, 18, 22, 26],
  'tokens': ['24', 'garçon', 'pierre', 'piémontaise', 'vitrier', 'machol']}]

In [27]:
# texts = [text['tokens'] for text in texts_tags]
# tags = [text['ner_tags'] for text in texts_tags]

In [28]:
# train_texts, test_texts, train_tags, test_tags = train_test_split(texts, tags, test_size=.2)
# train_texts, val_texts, train_tags, val_tags = train_test_split(train_texts, train_tags, test_size=.2)

In [29]:
# Shuffle the dataset to ensure random distribution
random.shuffle(texts_tags)

# Define the split ratio
train_ratio = 0.8

# Calculate the split index
split_index = int(len(texts_tags) * train_ratio)

# Split the data into training and test sets
train_data = texts_tags[:split_index]
test_data = texts_tags[split_index:]

len(train_data), len(test_data)

(20358, 5090)

In [30]:
train_data[0]

{'id': '22513',
 'ner_tags': [2, 8, 9, 10, 12, 14, 18, 22, 24],
 'tokens': ['1897',
  'eynard',
  'françois',
  'denise',
  'fille',
  'idem',
  'idem',
  'comptable',
  'gaude']}

In [31]:
class MyDataset:
    def __init__(self, train_data, test_data):
    # def __init__(self, train_data, validation_data, test_data):
        # Convert list of dictionaries to Dataset directly without additional formatting
        self.dataset_dict = DatasetDict({
            'train': Dataset.from_pandas(pd.DataFrame(train_data)),
            # 'validation': Dataset.from_pandas(pd.DataFrame(validation_data)),
            'test': Dataset.from_pandas(pd.DataFrame(test_data)),
        })

    def get_dataset(self):
        return self.dataset_dict

In [32]:
# my_dataset = MyDataset(train_data, validation_data, test_data)
my_dataset = MyDataset(train_data, test_data)
dataset_dict = my_dataset.get_dataset()
print(dataset_dict['train'][0])
print(dataset_dict)

{'id': '22513', 'ner_tags': [2, 8, 9, 10, 12, 14, 18, 22, 24], 'tokens': ['1897', 'eynard', 'françois', 'denise', 'fille', 'idem', 'idem', 'comptable', 'gaude']}
DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 20358
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'tokens'],
        num_rows: 5090
    })
})


In [34]:
# Load a DistilBERT tokenizer to preprocess the tokens field
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")

tokenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<00:00, 5.44kB/s]
config.json: 100%|██████████| 508/508 [00:00<00:00, 309kB/s]
sentencepiece.bpe.model: 100%|██████████| 811k/811k [00:00<00:00, 8.81MB/s]
tokenizer.json: 100%|██████████| 1.40M/1.40M [00:00<00:00, 6.35MB/s]


In [35]:
# Example
example = dataset_dict['train'][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['<s>',
 '▁18',
 '97',
 '▁',
 'ey',
 'nard',
 '▁fran',
 'çois',
 '▁de',
 'nis',
 'e',
 '▁fille',
 '▁idem',
 '▁idem',
 '▁comptable',
 '▁gau',
 'de',
 '</s>']

In [36]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=False, is_split_into_words=True)
    # tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [37]:
tokenized_dataset = dataset_dict.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/20358 [00:00<?, ? examples/s]

Map: 100%|██████████| 20358/20358 [00:00<00:00, 30047.20 examples/s]
Map: 100%|██████████| 5090/5090 [00:00<00:00, 40274.69 examples/s]


In [38]:
# Verify that the labels are correctly aligned with the tokenized input, especially after tokenization, 
# since wordpiece tokenization can split tokens into subwords.
tokenized_dataset['train'][0]

{'id': '22513',
 'ner_tags': [2, 8, 9, 10, 12, 14, 18, 22, 24],
 'tokens': ['1897',
  'eynard',
  'françois',
  'denise',
  'fille',
  'idem',
  'idem',
  'comptable',
  'gaude'],
 'input_ids': [5,
  301,
  4842,
  21,
  2842,
  7592,
  9871,
  14292,
  8,
  4628,
  35,
  536,
  23784,
  23784,
  9615,
  20223,
  234,
  6],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100,
  2,
  -100,
  8,
  -100,
  -100,
  9,
  -100,
  10,
  -100,
  -100,
  12,
  14,
  18,
  22,
  24,
  -100,
  -100]}

In [39]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [40]:
# Eval
seqeval = evaluate.load("seqeval")

In [41]:
labels = [tagged_categories[i] for i in example[f"ner_tags"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [tagged_categories[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tagged_categories[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [42]:
len(tagged_categories)

28

- https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer
- https://huggingface.co/docs/transformers/tasks/token_classification#evaluate

In [43]:
# Train
# model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=28, id2label=id2tag, label2id=tag2id)
model = CamembertForTokenClassification.from_pretrained("almanach/camembert-base", num_labels=28, id2label=id2tag, label2id=tag2id)

model.safetensors: 100%|██████████| 445M/445M [00:46<00:00, 9.59MB/s] 
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
training_args = TrainingArguments(
    output_dir="camembert-base1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 20%|█▉        | 500/2546 [02:37<08:53,  3.84it/s]

{'loss': 0.1035, 'grad_norm': 3.6722424030303955, 'learning_rate': 1.607227022780833e-05, 'epoch': 0.39}


 39%|███▉      | 1000/2546 [05:05<07:19,  3.52it/s]

{'loss': 0.0893, 'grad_norm': 3.6347031593322754, 'learning_rate': 1.2144540455616654e-05, 'epoch': 0.79}


                                                   
 50%|█████     | 1273/2546 [06:55<06:56,  3.05it/s]

{'eval_loss': 0.10221856087446213, 'eval_precision': 0.9688495695207104, 'eval_recall': 0.9712858502106837, 'eval_f1': 0.970066180213813, 'eval_accuracy': 0.9722440944881889, 'eval_runtime': 26.6279, 'eval_samples_per_second': 191.153, 'eval_steps_per_second': 11.98, 'epoch': 1.0}


 59%|█████▉    | 1500/2546 [08:19<06:13,  2.80it/s]  

{'loss': 0.0775, 'grad_norm': 1.1349321603775024, 'learning_rate': 8.21681068342498e-06, 'epoch': 1.18}


 79%|███████▊  | 2000/2546 [11:49<03:35,  2.53it/s]

{'loss': 0.0647, 'grad_norm': 0.9503529667854309, 'learning_rate': 4.289080911233308e-06, 'epoch': 1.57}


 98%|█████████▊| 2500/2546 [15:01<00:15,  2.92it/s]

{'loss': 0.067, 'grad_norm': 1.6609553098678589, 'learning_rate': 3.6135113904163394e-07, 'epoch': 1.96}


                                                   
100%|██████████| 2546/2546 [15:36<00:00,  2.37it/s]

{'eval_loss': 0.09742994606494904, 'eval_precision': 0.9693293116887518, 'eval_recall': 0.9719314938154139, 'eval_f1': 0.9706286586917791, 'eval_accuracy': 0.972806524184477, 'eval_runtime': 18.6907, 'eval_samples_per_second': 272.328, 'eval_steps_per_second': 17.067, 'epoch': 2.0}


100%|██████████| 2546/2546 [15:39<00:00,  2.71it/s]

{'train_runtime': 939.5555, 'train_samples_per_second': 43.335, 'train_steps_per_second': 2.71, 'train_loss': 0.07996936445618275, 'epoch': 2.0}





TrainOutput(global_step=2546, training_loss=0.07996936445618275, metrics={'train_runtime': 939.5555, 'train_samples_per_second': 43.335, 'train_steps_per_second': 2.71, 'train_loss': 0.07996936445618275, 'epoch': 2.0})

In [51]:
# model

In [62]:
# Inférence
text = "Breton Pierre 45 ans Homme marié 1976 française chef de ménage SNCF"

In [63]:
classifier = pipeline("ner", model="camembert-base1/checkpoint-2546/")
classifier(text)

[{'entity': 'B-employer',
  'score': 0.9394388,
  'index': 1,
  'word': '▁Breton',
  'start': 0,
  'end': 6},
 {'entity': 'B-firstname',
  'score': 0.9957612,
  'index': 2,
  'word': '▁Pierre',
  'start': 7,
  'end': 13},
 {'entity': 'I-age',
  'score': 0.45467874,
  'index': 3,
  'word': '▁45',
  'start': 14,
  'end': 16},
 {'entity': 'I-age',
  'score': 0.8696587,
  'index': 4,
  'word': '▁ans',
  'start': 17,
  'end': 20},
 {'entity': 'B-civil_status',
  'score': 0.5580595,
  'index': 5,
  'word': '▁Homme',
  'start': 21,
  'end': 26},
 {'entity': 'I-civil_status',
  'score': 0.99544525,
  'index': 6,
  'word': '▁marié',
  'start': 27,
  'end': 32},
 {'entity': 'B-lob',
  'score': 0.9933514,
  'index': 7,
  'word': '▁1976',
  'start': 33,
  'end': 37},
 {'entity': 'B-nationality',
  'score': 0.9942942,
  'index': 8,
  'word': '▁française',
  'start': 38,
  'end': 47},
 {'entity': 'B-occupation',
  'score': 0.9792651,
  'index': 9,
  'word': '▁chef',
  'start': 48,
  'end': 52},
 {'e

In [54]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("camembert-base1/checkpoint-2546/")
inputs = tokenizer(text, return_tensors="pt")

In [55]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("camembert-base1/checkpoint-2546/")
with torch.no_grad():
    logits = model(**inputs).logits

In [72]:
# model

In [56]:
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class

['B-age',
 'B-employer',
 'B-firstname',
 'B-link',
 'I-age',
 'I-civil_status',
 'B-lob',
 'B-nationality',
 'B-occupation',
 'I-occupation',
 'I-occupation',
 'B-surname',
 'B-age']

In [61]:
inputs

{'input_ids': tensor([[    5, 15419,  1140,  2040,   134,  9667, 12956,   781,   918,     8,
          4602, 10752,     6]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [66]:
# # Encode tokens
# train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
# val_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [67]:
# tokenized_input = tokenizer(train_texts[:20], is_split_into_words=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# print(tokens)

In [68]:
# for i in train_encodings:
#     print(i)
    
# len(train_encodings['input_ids']), len(val_encodings['input_ids'])

Todo:
- créer dataframe avec chaque info séparées
- utiliser les tokens.yml pour évaluer le système
- une fonction qui renvoie le df ground truth et une fonction qui renvoie un dataframe avec tous les mots à classifier dans une liste
- séparer train et test, quel format?

Pre-processing:
- replace 'sa fe' par 'sa femme' par exemple

Questions:
- Sous quel format arrive les données? pas déjà taggées
- Drop colonne observation, et colonnes vides ?

Notes: 
- surname household = nom homme marié
- Attention: différencier pre-processing sur le dataset de train et dataset de test
- text and tags do not take into account the order, should we?

Issues:
- most of the words are not known by the model
- lowercasing removes the uppercase info in names