# BERT

Dans ce notebook on utilise les classes de PyTorch `TransformerEncoderLayer`, `TransformerEncoder` pour implémenter un modèle BERT pour la reconnaissance d'entités nommées.

## Préparer les données

In [2]:
#!pip install torchtext==0.6.0

import time
import torch
from torch import nn
from torch.optim import Adam
from torchtext.data import Field, BucketIterator
from torchtext.datasets import SequenceTaggingDataset
from torchtext.vocab import Vocab
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets

import spacy
import numpy as np
import pandas as pd
import math
import time
import random
import string
from itertools import chain

import numpy as np
import copy
from copy import deepcopy
import torch.optim as optim
from torch.nn import functional as F

In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(lower = True) 
TAG = data.Field(unk_token = None) 

my_path = "../libex/data/" 
my_path = "./data_ner/" 

train_data, valid_data, test_data = data.TabularDataset.splits(
        path=my_path,
        train="train.csv",
        validation="valid.csv",
        test="test.csv", format='csv', skip_header=True,
        fields=(("text", TEXT), ("tag", TAG))
         
    )

MIN_FREQ = 2

TEXT.build_vocab(train_data, 
                 min_freq = MIN_FREQ, # les mots qui apparaissent moins que MIN_FREQ fois seront ignorés du vocabulaire
                 vectors = "glove.6B.300d",
                 unk_init = torch.Tensor.normal_)


TAG.build_vocab(train_data)

BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device, sort=False)

# padding index
TEXT_PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] 
TAG_PAD_IDX = TAG.vocab.stoi[TAG.pad_token]

## Construire le modèle

Le modèle BERT est décrit dans [cet article](https://arxiv.org/abs/1810.04805) et est composé de N = 12 transformers composés de 6 encoders et d'une couche linéaire.

Nous utilisons les paramètres du modèle BERT de l'article $BERT_{BASE}$ avec : 
 
 - N = 12 transformers 
 - Dimension des états cachés : fc_hidden = 768 
 - Têtes d'attention : attn_heads = 12
 
La fonction `clones` suivante permet de produire N couches identiques et sera utile pour assembler N=12 modèles transformers.

In [15]:
def clones(module, N):
    "Produit N couches identiques"
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

On implémente le même modèle transformer que dans un précédent notebook dans une classe à part.

In [16]:
class Transformer(nn.Module):
    def __init__(self, embedding_dim, attn_heads, dropout, trf_layers,fc_hidden, output_dim):
        super(Transformer, self).__init__()
          
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=all_emb_size,
            nhead=attn_heads,
            activation="relu",
            dropout=dropout
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layers,
            num_layers=trf_layers
        ) 
                
        self.fc1 = nn.Linear(
        in_features=all_emb_size,
        out_features=fc_hidden
        )
        self.fc1_gelu = nn.GELU()
        self.fc1_norm = nn.LayerNorm(fc_hidden)
        self.fc2_dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(
            in_features=fc_hidden,
            out_features=output_dim
        ) 
    def forward(self, words, pos_out, word_pad_idx):
        
        key_padding_mask = torch.as_tensor(words == word_pad_idx).permute(1, 0)
        # pos_out = [sentence length, batch size, embedding dim ]
        
        # enc_out = [sentence length, batch size, embedding dim ]
        enc_out = self.encoder(pos_out, src_key_padding_mask=key_padding_mask)
        # Fully-connected
        # fc1_out = [sentence length, batch size, fc hidden]
        fc1_out = self.fc1_norm(self.fc1_gelu(self.fc1(enc_out)))
        # fc2_out = [sentence length, batch size, output dim]
        fc2_out = self.fc2(self.fc2_dropout(fc1_out))
        return fc2_out


In [86]:
def pad_masking(x):
    # x: (batch_size, seq_len)
    padded_positions = x == TEXT_PAD_IDX
    return padded_positions.unsqueeze(1)

class PositionalEmbedding(nn.Module):

    def __init__(self, max_len, hidden_size, ):
        super(PositionalEmbedding, self).__init__()
        self.positional_embedding = nn.Embedding(max_len, hidden_size)
        positions = torch.arange(0, max_len)
        self.register_buffer('positions', positions)

    def forward(self, sequence):
        batch_size, seq_len = sequence.size()
        positions = self.positions[:seq_len].unsqueeze(0).repeat(batch_size, 1)
        return self.positional_embedding(positions)
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

def build_model(layers_count, hidden_size, heads_count, d_ff, dropout, max_len, vocabulary_size):
    
    token_embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=hidden_size)
    positional_embedding = PositionalEncoding(max_len=max_len, hidden_size=hidden_size )

    encoder_layers = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=heads_count,
            activation="relu",
            dropout=dropout)    
    
    
    encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layers,
            num_layers=layers_count
        ) 

    bert = BERT(
        encoder=encoder,
        token_embedding=token_embedding,
        positional_embedding=positional_embedding,
        hidden_size=hidden_size,
        vocabulary_size=vocabulary_size)

    return bert

class BERT(nn.Module):

    def __init__(self, encoder, token_embedding, positional_embedding, hidden_size, vocabulary_size):
        super(BERT, self).__init__()

        self.encoder = encoder
        self.token_embedding = token_embedding
        self.positional_embedding = positional_embedding
        self.token_prediction_layer = nn.Linear(hidden_size, vocabulary_size)
        self.classification_layer = nn.Linear(hidden_size, len(TAG.vocab))

    def forward(self, inputs):
        sequence = inputs
        token_embedded = self.token_embedding(sequence)
        positional_embedded = self.positional_embedding(token_embedded)
        #embedded_sources = token_embedded + positional_embedded 
      
        key_padding_mask = torch.as_tensor(sequence == TEXT_PAD_IDX).permute(1, 0)
        encoded_sources = self.encoder(embedded_sources, key_padding_mask)
        token_predictions = self.token_prediction_layer(encoded_sources)
        classification_embedding = encoded_sources[:, 0, :]
        classification_output = self.classification_layer(classification_embedding)
        return token_predictions, classification_output
    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

La classe BERT contient alors l'embedding des mots et des caractères et l'assemblage des N transformers.

In [87]:
model = build_model(hidden_size=300, layers_count=6, heads_count=4, d_ff=1024, dropout=0.1, max_len=512,
                        vocabulary_size=len(TEXT.vocab))

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)
        
model.apply(init_weights)

pretrained_embeddings = TEXT.vocab.vectors
model.token_embedding.weight.data.copy_(pretrained_embeddings)
model.token_embedding.weight.data[TAG_PAD_IDX] = torch.zeros( 300)

print(f"Le modèle a {model.count_parameters():,} paramètres à entraîner.")

Le modèle a 16,300,050 paramètres à entraîner.


In [17]:


class BERT(nn.Module):

    def __init__(self,
                 input_dim,
                 embedding_dim,
                 attn_heads,
                 fc_hidden,
                 trf_layers,
                 output_dim,
                 dropout,
                 word_pad_idx,
                 tag_pad_idx,
                 N):  
        super().__init__()
        self.N = N
        self.word_pad_idx = word_pad_idx
        self.tag_pad_idx = tag_pad_idx
        # LAYER 1A: Word Embedding
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
            padding_idx=word_pad_idx
        )
        self.emb_dropout = nn.Dropout(dropout)
        all_emb_size = embedding_dim 
        self.position_encoder = PositionalEncoding(
            d_model=all_emb_size
        )
        
        self.layers = clones(Transformer(embedding_dim, attn_heads, dropout, trf_layers,fc_hidden, output_dim), N)
        #self.trans = Transformer(embedding_dim, char_emb_dim, char_cnn_filter_num, attn_heads, trf_dropout, trf_layers,fc_hidden, output_dim)

        self.softmax = nn.Softmax(dim=1)
   
        # init weights from normal distribution
        for name, param in self.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.1)

    def forward(self, words, tags=None):
        # words = [sentence length, batch size]
        # tags = [sentence length, batch size]
        # embedding_out = [sentence length, batch size, embedding dim]
        embedding_out = self.emb_dropout(self.embedding(words))
        pos_out = self.position_encoder(embedding_out)
        for i in range(self.N):
            x = self.layers[i](words, pos_out, self.word_pad_idx)    
        return self.softmax(x)

    def init_embeddings(self, word_pad_idx=TEXT_PAD_IDX,pretrained=None, freeze=True):
        # initialize embedding for padding as zero
        self.embedding.weight.data[self.word_pad_idx] = torch.zeros(self.embedding_dim)
        if pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(
                embeddings=torch.as_tensor(pretrained),
                padding_idx=self.word_pad_idx,
                freeze=freeze
            )
    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [18]:
model = BERT(
    input_dim=len(TEXT.vocab),
    embedding_dim=300,
    attn_heads=12,  
    fc_hidden=768,  
    trf_layers=1,
    output_dim=len(TAG.vocab),
    dropout=0.7,
    word_pad_idx=TEXT_PAD_IDX,
    tag_pad_idx=TAG_PAD_IDX,
    N = 12
)
model.init_embeddings(
    word_pad_idx=TEXT_PAD_IDX,
    pretrained= TEXT.vocab.vectors,
    freeze=True
)

print(f"Le modèle a {model.count_parameters():,} paramètres à entraîner.")

Le modèle a 22,099,584 paramètres à entraîner.


In [88]:
optimizer = optim.Adam(model.parameters(), lr = 5e-5)

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

model = model.to(device)
criterion = criterion.to(device)

In [89]:
def accuracy_per_tag(predictions, tags):
    n_tags = len(TAG.vocab)
    class_correct = list(0 for i in range(n_tags))
    class_total = list(0 for i in range(n_tags))
    acc = list(0 for i in range(n_tags))
    _, pred = torch.max(predictions, 1)
    # # compare predictions to true label
    correct = np.squeeze(pred.eq(tags.data.view_as(pred)))
    # # calculate test accuracy for each object class
    for i in range(len(tags.data)):
        label = tags.data[i]
        class_correct[label] += correct[i].item()
        class_total[label] += 1
    for i in range(n_tags):
        if np.sum(class_total[i]) == 0 and np.sum(class_correct[i]) ==0:
            res = 100
        else:
            res = 100 * class_correct[i] / class_total[i]
        acc[i] = res, np.sum(class_correct[i]), np.sum(class_total[i])
        
    return acc  

from sklearn.metrics import f1_score
def f1_loss(preds, y, tag_pad_idx):
    index_o = TAG.vocab.stoi["O"]
    positive_labels = [i for i in range(len(TAG.vocab.itos))
                           if i not in (tag_pad_idx, index_o)]
    _, pred = torch.max(preds, 1)
    pred = pred.data.cpu().numpy() 
    tags = y.data.cpu().numpy()
    f1 = f1_score(
            y_true=tags,
            y_pred=pred,
            labels=positive_labels,
            average="micro"
        ) 
       
    return f1

In [90]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_f1 = 0
    
    model.train()
    
    for batch in iterator:
        
        text = batch.text
        tags = batch.tag
      
        optimizer.zero_grad()

        predictions = model(text)
        
        #predictions = [sent len, batch size, output dim]
        #tags = [sent len, batch size]
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        
        #predictions = [sent len * batch size, output dim]
        #tags = [sent len * batch size]
        
        loss = criterion(predictions, tags)
                
        f1 = f1_loss(predictions, tags, tag_pad_idx)
        acc = accuracy_per_tag(predictions, tags)   
        
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
        epoch_f1 += f1
        
    return epoch_loss / len(iterator), acc, epoch_f1 / len(iterator)

def evaluate(model, iterator, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_f1 = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch.text
            tags = batch.tag
         
            
            predictions = model(text)
            
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            acc = accuracy_per_tag(predictions, tags)
            f1 = f1_loss(predictions, tags, tag_pad_idx) 
            
            epoch_loss += loss.item()
            epoch_f1 += f1
        
    return epoch_loss / len(iterator), acc, epoch_f1 / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc, train_f1 = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc, valid_f1 = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut7-model.pt')
    if epoch%2 == 0:
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train F1 score: {train_f1*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. F1 score: {valid_f1*100:.2f}%')

ValueError: too many values to unpack (expected 2)

In [91]:
n_tags = len(TAG.vocab)
for i in range(n_tags):   
    print('Train Accuracy of %5s: %2d%% (%2d/%2d)' % (
           TAG.vocab.itos[i], train_acc[i][0],
           train_acc[i][1], train_acc[i][2]))  

NameError: name 'train_acc' is not defined

In [23]:
for i in range(n_tags):
    print('Valid Accuracy of %5s: %2d%% (%2d/%2d)' % (
           TAG.vocab.itos[i], valid_acc[i][0],
           valid_acc[i][1], valid_acc[i][2]))
  

Valid Accuracy of <pad>:  0% ( 0/77)
Valid Accuracy of     O: 80% (28/35)
Valid Accuracy of I-Locution: 100% ( 0/ 0)
Valid Accuracy of I-Catalyst: 100% ( 0/ 0)
Valid Accuracy of I-Support: 100% ( 0/ 0)
Valid Accuracy of B-Locution: 100% ( 0/ 0)
Valid Accuracy of I-Pore_volume: 100% ( 0/ 0)
Valid Accuracy of B-Pore_volume: 100% ( 0/ 0)
Valid Accuracy of I-PV_unit: 100% ( 0/ 0)
Valid Accuracy of I-Surface_area: 100% ( 0/ 0)
Valid Accuracy of I-SA_unit: 100% ( 0/ 0)
Valid Accuracy of B-Support: 100% ( 0/ 0)
Valid Accuracy of B-Surface_area: 100% ( 0/ 0)
Valid Accuracy of I-SA_val_min: 100% ( 0/ 0)
Valid Accuracy of I-PV_val_min: 100% ( 0/ 0)
Valid Accuracy of B-Catalyst: 100% ( 0/ 0)
Valid Accuracy of I-SA_val_max: 100% ( 0/ 0)
Valid Accuracy of I-PV_val_max: 100% ( 0/ 0)
Valid Accuracy of B-PV_unit: 100% ( 0/ 0)
Valid Accuracy of B-SA_unit: 100% ( 0/ 0)


In [24]:
model.load_state_dict(torch.load('tut7-model.pt'))

test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)
n_tags = len(TAG.vocab)
for i in range(n_tags):   
    print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
           TAG.vocab.itos[i], test_acc[i][0],
           test_acc[i][1], test_acc[i][2]))
print(f'Test Loss: {test_loss:.3f} |  Test F1 score: {test_f1*100:.2f}%')

Test Accuracy of <pad>:  0% ( 0/187)
Test Accuracy of     O: 98% (106/108)
Test Accuracy of I-Locution:  0% ( 0/ 2)
Test Accuracy of I-Catalyst: 100% ( 0/ 0)
Test Accuracy of I-Support:  0% ( 0/ 1)
Test Accuracy of B-Locution: 100% ( 0/ 0)
Test Accuracy of I-Pore_volume:  0% ( 0/ 1)
Test Accuracy of B-Pore_volume:  0% ( 0/ 1)
Test Accuracy of I-PV_unit:  0% ( 0/ 2)
Test Accuracy of I-Surface_area: 100% ( 0/ 0)
Test Accuracy of I-SA_unit: 100% ( 0/ 0)
Test Accuracy of B-Support:  0% ( 0/ 1)
Test Accuracy of B-Surface_area: 100% ( 0/ 0)
Test Accuracy of I-SA_val_min: 100% ( 0/ 0)
Test Accuracy of I-PV_val_min:  0% ( 0/ 1)
Test Accuracy of B-Catalyst: 100% ( 0/ 0)
Test Accuracy of I-SA_val_max: 100% ( 0/ 0)
Test Accuracy of I-PV_val_max:  0% ( 0/ 1)
Test Accuracy of B-PV_unit: 100% ( 0/ 0)
Test Accuracy of B-SA_unit: 100% ( 0/ 0)
Test Loss: 2.845 |  Test F1 score: 0.00%


In [26]:
def tag_sentence(model, device, sentence, text_field, tag_field):
    
    model.eval()
    
    if isinstance(sentence, str):
        nlp = spacy.load('en')
        tokens = [token.text for token in nlp(sentence)]
    else:
        tokens = [token for token in sentence]

    if text_field.lower:
        tokens = [t.lower() for t in tokens]
        
    max_word_len = max([len(token) for token in tokens])

    numericalized_tokens = [text_field.vocab.stoi[t] for t in tokens]

    unk_idx = text_field.vocab.stoi[text_field.unk_token]
    
    unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
    
    token_tensor = torch.LongTensor(numericalized_tokens)
    
    token_tensor = token_tensor.unsqueeze(-1).to(device)
    
    predictions = model(token_tensor)
    
    top_predictions = predictions.argmax(-1)
    
    predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]
    
    return tokens, predicted_tags, unks

In [27]:
example_index = 39

sentence = vars(test_data.examples[example_index])['text']
actual_tags = vars(test_data.examples[example_index])['tag']

print(sentence)
print(actual_tags)

tokens, pred_tags, unks = tag_sentence(model, 
                                       device, 
                                       sentence, 
                                       TEXT, 
                                       TAG
                                      )
print(pred_tags)

['53', '.', 'the', 'method', 'of', 'claim', '3', 'wherein', 'the', 'silica', 'support', 'has', 'a', 'pore', 'volume', 'between', 'about', '1.0', 'cm3/g', 'and', 'about', '1.5', 'cm3/g', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Support', 'I-Support', 'O', 'O', 'B-Pore_volume', 'I-Pore_volume', 'I-Locution', 'O', 'I-PV_val_min', 'I-PV_unit', 'I-Locution', 'O', 'I-PV_val_max', 'I-PV_unit', 'O']
['B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit', 'B-SA_unit']


In [28]:
print("Pred. Tag\tActual Tag\tCorrect?\tToken\n")

for token, pred_tag, actual_tag in zip(tokens, pred_tags, actual_tags):
    correct = '✔' if pred_tag == actual_tag else '✘'
    print(f"{pred_tag}\t\t{actual_tag}\t\t{correct}\t\t{token}")

Pred. Tag	Actual Tag	Correct?	Token

B-SA_unit		O		✘		53
B-SA_unit		O		✘		.
B-SA_unit		O		✘		the
B-SA_unit		O		✘		method
B-SA_unit		O		✘		of
B-SA_unit		O		✘		claim
B-SA_unit		O		✘		3
B-SA_unit		O		✘		wherein
B-SA_unit		O		✘		the
B-SA_unit		B-Support		✘		silica
B-SA_unit		I-Support		✘		support
B-SA_unit		O		✘		has
B-SA_unit		O		✘		a
B-SA_unit		B-Pore_volume		✘		pore
B-SA_unit		I-Pore_volume		✘		volume
B-SA_unit		I-Locution		✘		between
B-SA_unit		O		✘		about
B-SA_unit		I-PV_val_min		✘		1.0
B-SA_unit		I-PV_unit		✘		cm3/g
B-SA_unit		I-Locution		✘		and
B-SA_unit		O		✘		about
B-SA_unit		I-PV_val_max		✘		1.5
B-SA_unit		I-PV_unit		✘		cm3/g
B-SA_unit		O		✘		.


In [None]:
sentence = 'The will deliver a speech about the conflict in Sao Paulo at tomorrow in Anne Mary with Jack.'

tokens, tags, unks = tag_sentence(model, 
                                  device, 
                                  sentence, 
                                  TEXT, 
                                  TAG,
                                  CHAR)

print(unks)
print("Pred. Tag\tToken\n")


for token, tag in zip(tokens, tags):
    print(f"{tag}\t\t{token}")