# BERT - Fine-tuning NER


## Préparation des données CoNLL-2003

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets
from torchtext.datasets import SequenceTaggingDataset

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
data = pd.read_csv("data_ner/data_train.csv", encoding="latin1").fillna(method="ffill")
test = pd.read_csv("data_ner/data_test.csv", encoding="latin1").fillna(method="ffill")
data.tail(10)

Unnamed: 0,Sentence #,text,tag
204557,14985,Plymouth,B-ORG
204558,14985,2,O
204559,14985,Preston,B-ORG
204560,14985,1,O
204561,14986,Division,O
204562,14986,three,O
204563,14987,Swansea,B-ORG
204564,14987,1,O
204565,14987,Lincoln,B-ORG
204566,14987,2,O


On récupère les phrases et les tags avec la fonction suivante.

In [3]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,  t) for w, t in zip(s["text"].values.tolist(),
                                                          
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
      
getter = SentenceGetter(data)
get_test = SentenceGetter(test)

In [4]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
sentences_test = [[word[0] for word in sentence] for sentence in get_test.sentences]
print(sentences[1])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']


In [5]:
labels = [[s[1] for s in sentence] for sentence in getter.sentences]
labels_test = [[s[1] for s in sentence] for sentence in get_test.sentences]
print(labels[1])

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


On attribut un indice pour chaque tag et on ajoute le tag PAD.

In [6]:
tag_values = list(set(data["tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [7]:
tag2idx

{'I-MISC': 0,
 'B-ORG': 1,
 'I-LOC': 2,
 'B-PER': 3,
 'B-LOC': 4,
 'I-ORG': 5,
 'O': 6,
 'B-MISC': 7,
 'I-PER': 8,
 'PAD': 9}

In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

Using TensorFlow backend.


'1.4.0'

Ici, nous corrigeons certaines configurations. 
Nous limiterons la longueur de notre séquence à 512 tokens et nous utiliserons une taille de batch de 32 comme suggéré par l'article de Bert. 

In [9]:
MAX_LEN = 512
bs = 32

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Maintenant, nous tokenisons toutes les phrases. Étant donné que le tokenizer BERT est basé sur un tokenizer Wordpiece, il divisera les tokens en tokens de sous-mots. Par exemple, "gunships" sera divisé en deux tokens "guns" et "##hips". 
La fonction suivante nous permet de résoudre ce problème.

In [11]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the mot et compte le nombre de sous-mots 
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)

        # Ajoute le même tag pour tous les sous-mots d'un mot
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels


In [12]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]
# pour le test
tokenized_texts_and_labels_test = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences_test, labels_test)
]

In [13]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

# pour le test
tokenized_texts_test = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels_test]
labels_test = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels_test]

Ensuite, nous coupons et remplissons les séquences de tokens et de labels à la longueur souhaitée en ajoutant des tokens `PAD`.

In [14]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

#test
input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_test],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [15]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

#test
tags_test = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_test],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")


Le modèle de Bert prend en charge des masques d'attention ou attention_mask. Nous créons donc ici le masque pour ignorer les éléments PAD dans les séquences.

In [16]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

# test
attention_masks_test = [[float(i != 0.0) for i in ii] for ii in input_ids_test]

In [17]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

On convertit nos inputs, tags et masques en tensor torch.

In [18]:
# train
tr_inputs = torch.tensor(tr_inputs)
tr_tags = torch.tensor(tr_tags)
tr_masks = torch.tensor(tr_masks)
# valid
val_inputs = torch.tensor(val_inputs)
val_tags = torch.tensor(val_tags)
val_masks = torch.tensor(val_masks)
# test
test_inputs = torch.tensor(input_ids_test)
test_tags = torch.tensor(tags_test)
test_masks = torch.tensor(attention_masks_test)

La dernière étape consiste à définir les itérateurs. Nous mélangons les données au moment de l'apprentissage avec le RandomSampler et au moment du test, nous les transmettons simplement séquentiellement avec le SequentialSampler.

In [19]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

## Construction du modèle

Le package transformer fournit une classe BertForTokenClassification pour les prédictions au niveau du token. 

BertForTokenClassification est un modèle de réglage fin qui enveloppe BertModel et ajoute un classificateur au niveau du token au-dessus du BertModel. Le classificateur au niveau du token est une couche linéaire qui prend comme entrée le dernier état caché de la séquence. Nous chargeons le modèle pré-formé et fournissons le nombre de tags possibles.

In [20]:
import transformers
from transformers import BertForTokenClassification, AdamW

In [21]:
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
#model.cuda();
device = 'cpu'
model.to(device);

 - Optimiseur et scheduler : 
 
 

In [23]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0}
]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [24]:
from transformers import get_linear_schedule_with_warmup

epochs = 10
max_grad_norm = 1.0


total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


 - Loss

In [25]:
TAG_PAD_IDX = tag2idx.get('PAD')
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)
criterion = criterion.to(device)

### Entraînement

In [26]:
from sklearn.metrics import f1_score, classification_report
def train(model, iterator, optimizer, criterion):
    tag_pad_idx = 0
    
    model.train()
    
    total_loss = 0
    f1 = 0
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    
    for step, batch in enumerate(train_dataloader):
        
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch 
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]    
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
  
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
        pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
        valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    
        f1s = f1_score(pred_tags, valid_tags, average="micro")
        f1 = f1 + f1s
    f1_s = f1/len(train_dataloader)
    
    avg_train_loss = total_loss / len(train_dataloader)
    return avg_train_loss, f1_s

def evaluate(model, iterator, criterion):
    model.eval()
    eval_loss, eval_accuracy, f1 = 0, 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    preds = []
    labels = []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
        
        pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
        valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    
        f1s = f1_score(pred_tags, valid_tags, average="micro")
        f1 = f1 + f1s
        preds.append(pred_tags)
        labels.append(valid_tags)
        
    f1_s = f1 /  len(valid_dataloader)
    eval_loss = eval_loss / len(valid_dataloader)
    
    return eval_loss, f1_s, preds, labels
   

In [None]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 5

t_loss = []
t_f1 = []
v_loss = []
v_f1 = []

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_f1 = train(model, train_dataloader, optimizer, criterion)
    t_loss.append(train_loss)
    t_f1.append(train_f1) 
    
    valid_loss, valid_f1,_,_ = evaluate(model, valid_dataloader, criterion)
    v_loss.append(valid_loss)
    v_f1.append(valid_f1)  
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model_ner.pt')
    if epoch%1 == 0:
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train F1 score: {train_f1*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. F1 score: {valid_f1*100:.2f}%')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np 
sns.set()

x = np.linspace(0, N_EPOCHS,N_EPOCHS)

plt.plot(x,t_loss)
plt.plot(x,v_loss)
plt.title("Loss")
plt.legend(["Train loss", "Valid loss"])

In [None]:
x = np.linspace(0, N_EPOCHS,N_EPOCHS)

plt.plot(x,t_f1)
plt.plot(x,v_f1)
plt.title("F1 score")
plt.legend(["Train F1", "Valid F1"])

### Test

In [None]:
model.load_state_dict(torch.load('tut5-model_ner.pt'))

test_loss, test_f1, preds, labels = evaluate(model, test_dataloader, criterion)
print(f'Test Loss: {test_loss:.3f} |  Test F1 score: {test_f1*100:.2f}%')

In [None]:
predict =  [item for sublist in preds for item in sublist]
true =  [item for sublist in labels for item in sublist]

print(classification_report(
                y_true=true,
                y_pred=predict
            ))

In [None]:
ent_manquantes = []
for i in tag2idx:
    if i not in true:
        ent_manquantes.append(i)
        print(i)

In [None]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(true, predict)
confusion = confusion_matrix(true, predict)
l = [i for i in tag2idx if i not in ent_manquantes]

In [None]:
confusion_df =pd.DataFrame(confusion)

confusion_df.columns=[i for i in TAG.vocab.itos][1:]
s = pd.Series([i for i in TAG.vocab.itos][1:])
confusion_df = confusion_df.set_index([s])

confusion_df['LOC'] = confusion_df['B-LOC'] + confusion_df['I-LOC']
confusion_df['PER'] = confusion_df['B-PER'] + confusion_df['I-PER']
confusion_df['ORG'] = confusion_df['B-ORG'] + confusion_df['I-ORG']
confusion_df['MISC'] = confusion_df['B-MISC'] + confusion_df['I-MISC']


confusion_df = confusion_df.drop(columns=[ i for i in TAG.vocab.itos if i != 'O' and i != '<pad>'])

confusion_df.loc['LOC'] = confusion_df.loc['B-LOC'] + confusion_df.loc['I-LOC']
confusion_df.loc['PER'] = confusion_df.loc['B-PER'] + confusion_df.loc['I-PER']
confusion_df.loc['ORG'] = confusion_df.loc['B-ORG'] + confusion_df.loc['I-ORG']
confusion_df.loc['MISC'] = confusion_df.loc['B-MISC'] + confusion_df.loc['I-MISC']

confusion_df = confusion_df.drop([i for i in TAG.vocab.itos if i != 'O' and i != '<pad>'])

confusion_df

In [None]:
cm = confusion_df.to_numpy()

TP = np.diag(cm)
FP = np.sum(cm, axis=0) - TP
FN = np.sum(cm, axis=1) - TP

num_classes = 4
TN = []
for i in range(num_classes):
    temp = np.delete(cm, i, 0)    
    temp = np.delete(temp, i, 1) 
    TN.append(sum(sum(temp)))

precision = [0]*len(TP)
recall = [0]*len(TP)
f1 = [0]*len(TP)
for i in range(len(TP)):
    if TP[i]+FP[i] != 0:
        precision[i] = TP[i]/(TP[i]+FP[i])
    else:
        precision[i] = 0

    if TP[i]+FN[i] != 0:
        recall[i] = TP[i]/(TP[i]+FN[i])
    else:
        recall[i] = 0

    if precision[i]+recall[i] != 0:
        f1[i] = (2*precision[i]*recall[i])/(precision[i]+recall[i])
    else:
        f1[i] = 0


prf_df =pd.DataFrame()
prf_df['Precision'] = precision
prf_df['Recall'] = recall
prf_df['F1-score'] = f1

s = pd.Series([i for i in confusion_df.index])
prf_df = prf_df.set_index([s])

prf_df

## Inférence

In [None]:
test_sentence = 'The will deliver a speech about the conflict in North Korea tomorrow in New York with my friend Mary Kate.'

In [None]:
tokenized_sentence = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized_sentence])#.cuda()

In [None]:
with torch.no_grad():
    output = model(input_ids)
label_indices = np.argmax(output[0].numpy(), axis=2)

In [None]:
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)

In [None]:
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))