In [272]:
import pandas as pd
import yaml
import re
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification

### Build ground truth dataframe

In [177]:
def load_df(file_path):
    df = pd.read_json(file_path, orient='index', )
    df.rename(columns={df.columns[0]: 'Information'}, inplace=True)
    return df

def load_tags(file_path):
    with open(file_path, 'r') as file:
        tags = yaml.safe_load(file)
    return tags

In [178]:
def create_dict_tags(input_dict):
    # Initialize an empty dictionary to hold the tags and their corresponding categories
    tags_categories = {}
    
    # Iterate over the input dictionary to extract the start markers and their corresponding categories
    for category, markers in input_dict.items():
        # Assign the start marker as the key and the category name as the value in the tags_categories dictionary
        tags_categories[markers['start']] = category
    
    return tags_categories

In [179]:
def create_list_tags(input_dict):
    tags_symbols = []
    for category in input_dict:
        tags_symbols.append(input_dict[category]['start'])
    return tags_symbols

In [180]:
def classify_entry(entry, tags_symbols, tags_categories):
    """Take an entry and classify each part according to the tags"""
    # Split the entry by the tags
    parts = re.split('('+ '|'.join(re.escape(tag) for tag in tags_symbols) +')', entry)
    
    # Dictionary to hold the classified parts with categories as keys
    classified_parts = {category: None for category in tags_categories.values()}
    
    # Process the split parts and classify them according to the tags
    for i in range(1, len(parts), 2):  # iterate over every second element (tags) starting from index 1
        tag = parts[i]
        value = parts[i+1].strip()  # get the value after the tag
        category = tags_categories.get(tag, None)  # get the category for the tag
        if category:
            classified_parts[category] = value  # assign the value to the correct category
    
    return classified_parts

In [181]:
def load_and_create_ground_truth_df(input_df, tags):
    
    tags_dict = create_dict_tags(tags)
    tags_list = create_list_tags(tags)

    classified_data = []
    # Concatenate all the dataframes from the entities.json file
    for i in range(len(input_df)):
        for entry in input_df['Information'].iloc[i].split('\n'):
            classified_data.append(classify_entry(entry, tags_list, tags_dict))

    # Convert the classified data into a DataFrame
    classified_df = pd.DataFrame(classified_data)
    return classified_df

In [220]:
data = load_df('../data/entities.json')
tokens = load_tags('../data/tokens.yml')
df = load_and_create_ground_truth_df(data, tokens)

### Statistiques descriptives des données disponibles

In [221]:
# 25446 rows (=persons), 14 columns (=categories)
df

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton,
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,,Ferazzi
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,,Machol
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,,Desbois
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25443,,1887,,,,Annunziata,épouse,idem,,idem,,,Berni-Laureti,
25444,,1914,,,,Primo,fils,idem,,idem,,,Berni,
25445,,,,,,,,,,,,,,
25446,,,,,,,,,,,,,,


In [222]:
df.nunique()

age                   253
birth_date            158
civil_status            6
education_level         0
employer             1087
firstname            2456
link                  937
lob                  2923
maiden_name             0
nationality            73
observation           310
occupation           2056
surname              8120
surname_household    4126
dtype: int64

In [223]:
print(f"Civil status: {df['civil_status'].unique()}")
# df['employer'].unique()

Civil status: ['Garçon' 'Homme marié' None 'Fille' 'Femme mariée' 'Veuve' 'Veuf']


In [224]:
df.describe()

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
count,16436,7344,10705,0.0,2911,24931,20736,9236,0.0,13314,602,16179,19159,5640
unique,253,158,6,0.0,1087,2456,937,2923,0.0,73,310,2056,8120,4126
top,2,1901,Garçon,,patron,Marie,chef,idem,,française,idem,idem,idem,Martin
freq,337,138,2824,,644,2758,3022,3378,,7916,88,3695,669,29


In [225]:
# Percentage of missing values
df.isnull().sum() / len(df) * 100

age                   35.413392
birth_date            71.141151
civil_status          57.933826
education_level      100.000000
employer              88.560987
firstname              2.031594
link                  18.516190
lob                   63.706382
maiden_name          100.000000
nationality           47.681547
observation           97.634392
occupation            36.423295
surname               24.713141
surname_household     77.837158
dtype: float64

In [226]:
df.dtypes

age                  object
birth_date           object
civil_status         object
education_level      object
employer             object
firstname            object
link                 object
lob                  object
maiden_name          object
nationality          object
observation          object
occupation           object
surname              object
surname_household    object
dtype: object

In [227]:
# transform age in int to get mean (attention: missing values, or non-integer values like 'unknown', '5 mois')
# df['age'].astype(int).mean()

In [228]:
# civil status when surname_household is not null
df[df['surname_household'].notnull()]['civil_status'].value_counts()

civil_status
Homme marié     1494
Veuve            263
Veuf             191
Garçon            97
Fille             93
Femme mariée      25
Name: count, dtype: int64

#### Pre-processing for each category

In [229]:
# Age

# when idem is written, replace by the value of the previous row
# df['age'] = df['age'].replace('idem', pd.NA).ffill()

# when age is composed of a number followed by 'mois' and nothing else, replace by the number divided by 12
# if df['age'].str.contains('mois').any():
#         df['age'] = df['age'].str.replace('mois', '').astype(float)
#         df['age'] = df['age'].apply(lambda x: x / 12 if x > 12 else x)

# Some dates are composed of 4 digits, but looking at them, they are not necessarily birth dates
# Replace them by NA to avoid misclassification
mask = df['age'].notna() & df['age'].astype(str).str.match(r'^\d{4}$')
df.loc[mask, 'age'] = pd.NA

In [230]:
# Birthdate
# If the date is not composed of 4 characters, it is not a birth date, thus replace by NA
mask = df['birth_date'].notna() & ~df['birth_date'].astype(str).str.match(r'^\d{4}$')
df.loc[mask, 'birth_date'] = pd.NA

In [231]:
# Link
# df['link'] = df['link'].replace('sa fe', 'sa femme')
# df['link'] = df['link'].replace('(sa femme)', 'sa femme')
# df['link'] = df['link'].replace('CHEF DE MENAGE', 'Chef de ménage')
# df['link'] = df['link'].replace('Chef', 'Chef de ménage')
# df['link'] = df['link'].replace('Chef de menage', 'Chef de ménage')
# df['link'] = df['link'].replace('Chef de mén', 'Chef de ménage')

# Beaucoup de valeurs similaires sont écrites de manière différente, utiles de les modifier?

In [232]:
# Nationality
# df['nationality'] = df['nationality'].replace('idem', pd.NA).ffill()

In [233]:
df

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton,
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,,Ferazzi
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,,Machol
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,,Desbois
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25443,,1887,,,,Annunziata,épouse,idem,,idem,,,Berni-Laureti,
25444,,1914,,,,Primo,fils,idem,,idem,,,Berni,
25445,,,,,,,,,,,,,,
25446,,,,,,,,,,,,,,


#### Build test data

#### LLM

- https://nlpprogress.com/english/named_entity_recognition.html
- https://huggingface.co/transformers/v3.2.0/custom_datasets.html#token-classification-with-w-nut-emerging-entities
  

In [289]:
def extract_text_tags_from_df(df):
    words_list = []
    categories_list = []

    # Iterate over each row and column, collecting non-None values and their column names
    for index, row in df.iterrows():
        for column in df.columns:
            value = row[column]
            if pd.notnull(value):
                words_list.append(value)
                categories_list.append(column)

    # print(len(words_list), len(categories_list))
    return words_list, categories_list

In [290]:
# Create text and tags
categories = tokens.keys()
texts, tags = extract_text_tags_from_df(df)

In [291]:
texts[:3], tags[:3]

(['25', 'Garçon', 'Cyrille'], ['age', 'civil_status', 'firstname'])

In [296]:
train_texts, test_texts, train_tags, test_tags = train_test_split(texts, tags, test_size=.2)
# train_texts, val_texts, train_tags, val_tags = train_test_split(train_texts, train_tags, test_size=.2)

In [297]:
train_texts[:3], train_tags[:3]

(['Marie', 'fille', 'son fils'], ['firstname', 'link', 'link'])

In [298]:
# create encodings for our tags
tag2id = {tag: id for id, tag in enumerate(categories)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [317]:
# Encode tokens
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [323]:
def encode_tags(tags, encodings):
    # labels = [[tag2id[tag] for tag in doc] for doc in tags]
    labels = [tag2id[tag] for tag in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)
        
        print(arr_offset.shape)  # To check the shape of the array.
        print(arr_offset[:10])   # To print a sample of the offsets.

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(test_tags, val_encodings)

(2,)
[0 0]


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
# Create a dataset class
class ArchiveDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = ArchiveDataset(train_encodings, train_labels)
val_dataset = ArchiveDataset(val_encodings, val_labels)

In [None]:
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(categories))

In [None]:
model

Todo:
- créer dataframe avec chaque info séparées
- utiliser les tokens.yml pour évaluer le système
- une fonction qui renvoie le df ground truth et une fonction qui renvoie un dataframe avec tous les mots à classifier dans une liste
- séparer train et test, quel format?

Pre-processing:
- replace 'sa fe' par 'sa femme' par exemple

Questions:
- Sous quel format arrive les données? pas déjà taggées

Notes: 
- surname household = nom homme marié
- Attention: différencier pre-processing sur le dataset de train et dataset de test
- text and tags do not take into account the order, should we?