# BERT

## Imports

In [1]:

import json
import yaml
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import time
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Database

In [2]:
df = pd.read_csv("database.csv")

In [3]:
df

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household,household,sexe
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton,,0,1.0
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,,Ferazzi,1,1.0
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,,Machol,1,1.0
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,,Desbois,1,1.0
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25069,,1869,,,,Marie,chef,Pailharès,,française,,,,Chameton-Dideron,1,
25070,,1863,,,Cara,Marie,chef,St Naz en Royans,,française,,ouv chaus res,,Ode,1,
25071,,1886,,,Baretto,Nello,chef,Castel,,italienne,,manoeuvre,,Berni,1,
25072,,1887,,,,Annunziata,épouse,Castel,,italienne,,,Berni-Laureti,,0,


In [4]:
nan_counts = df.isna().sum()
nan_counts

age                   8639
birth_date           17730
civil_status         14370
education_level      25074
employer             22220
firstname              144
link                  4383
lob                  16004
maiden_name          25074
nationality          12186
observation          24482
occupation            9089
surname               6120
surname_household    19438
household                0
sexe                 14370
dtype: int64

In [5]:
df['inputs'] = ''
for col in df.columns:
    df['inputs'] += col + ': ' + df[col].astype(str) + ', '

In [6]:
df

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname,surname_household,household,sexe,inputs
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton,,0,1.0,"age: 25 , birth_date: nan, civil_status: Garço..."
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,,Ferazzi,1,1.0,"age: 30 , birth_date: nan, civil_status: Garço..."
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,,Machol,1,1.0,"age: 24 , birth_date: nan, civil_status: Garço..."
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,,Desbois,1,1.0,"age: 48 , birth_date: nan, civil_status: Homme..."
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat,,0,,"age: 30 , birth_date: nan, civil_status: nan, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25069,,1869,,,,Marie,chef,Pailharès,,française,,,,Chameton-Dideron,1,,"age: nan, birth_date: 1869 , civil_status: nan..."
25070,,1863,,,Cara,Marie,chef,St Naz en Royans,,française,,ouv chaus res,,Ode,1,,"age: nan, birth_date: 1863 , civil_status: nan..."
25071,,1886,,,Baretto,Nello,chef,Castel,,italienne,,manoeuvre,,Berni,1,,"age: nan, birth_date: 1886 , civil_status: nan..."
25072,,1887,,,,Annunziata,épouse,Castel,,italienne,,,Berni-Laureti,,0,,"age: nan, birth_date: 1887 , civil_status: nan..."


## Model

In [7]:
X = df['inputs'].tolist()
y = df['household'].tolist()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=12,shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=12,shuffle=True)

In [9]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_val, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("Device:", device)
model.to(device)

Device: cuda:0


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [14]:
train_dataset = IMDbDataset(train_encodings, y_train)
val_dataset = IMDbDataset(val_encodings, y_val)
test_dataset = IMDbDataset(test_encodings, y_test)

In [None]:
import transformers

# Define the training arguments
training_args = transformers.TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
)


# Define the Trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

In [17]:
predictions = trainer.predict(test_dataset)

y_pred = np.argmax(predictions.predictions, axis=1)

In [None]:
print(classification_report(y_test, y_pred))