In [10]:
%load_ext autoreload
%autoreload 2


import numpy as np
import pandas as pd

from src.preprocess.text import SentenceGetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from tqdm.notebook import tqdm
from tqdm import trange

from itertools import chain

import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

from src.preprocess.text import sent2features, sent2labels

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


from src.preprocess.bert import tokenize_and_preserve_labels

torch.__version__

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'1.12.1'

In [11]:
ner_dataset = pd.read_csv("/Users/Mikhail_Bulgakov/GitRepo/pos_ner_task/data/ner_dataset.csv", delimiter=',', encoding='unicode_escape')
ner_dataset = ner_dataset.fillna(method="ffill")

In [12]:
sg = SentenceGetter(ner_dataset)

In [13]:
train_data, test_data = train_test_split(sg.get_full_data(), test_size=0.2, random_state=100)

In [14]:
sentences_train = [[word[0] for word in sentence] for sentence in train_data]
sentences_test = [[word[0] for word in sentence] for sentence in test_data]
sentences_train[0][:5]

['North', 'Korean', 'state', 'media', 'reported']

In [15]:
labels_train = [[s[2] for s in sentence] for sentence in train_data]
labels_test = [[s[2] for s in sentence] for sentence in test_data]
print(labels_train[0][:5])

['B-geo', 'B-gpe', 'O', 'O', 'O']


In [16]:
tag_values = list(set([i[2] for i in chain.from_iterable(train_data)]))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [17]:
MAX_LEN = 75
bs = 32

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [19]:
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=False)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [20]:
tokenized_texts_and_labels_train = [
    tokenize_and_preserve_labels(tokenizer, sent, labs)
    for sent, labs in zip(sentences_train, labels_train)
]
tokenized_texts_and_labels_test = [
    tokenize_and_preserve_labels(tokenizer, sent, labs)
    for sent, labs in zip(sentences_test, labels_test)
]


In [21]:
tokenized_texts_train = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels_train]
tokenized_texts_test = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels_test]
labels_train = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels_train]
labels_test = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels_test]

In [22]:
input_ids_train = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_train],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")
input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_test],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [23]:
tags_train = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_train],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")
tags_test = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_test],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [24]:
attention_masks_train = [[float(i != 0.0) for i in ii] for ii in input_ids_train]
attention_masks_test = [[float(i != 0.0) for i in ii] for ii in input_ids_test]

In [25]:
tr_inputs = torch.tensor(input_ids_train)
val_inputs = torch.tensor(input_ids_test)
tr_tags = torch.tensor(tags_train)
val_tags = torch.tensor(tags_test)
tr_masks = torch.tensor(attention_masks_train)
val_masks = torch.tensor(attention_masks_test)

In [26]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [27]:
import transformers
from transformers import BertForTokenClassification
from torch.optim import AdamW

transformers.__version__

'4.24.0'

In [28]:
model = BertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForTokenClassification: ['distilbert.transformer.layer.3.sa_layer_norm.bias', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.2.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.2.sa_layer_norm.bias', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.transformer.layer.5.output_layer_norm.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.bias', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.5.output_layer_norm.bias', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distilber

In [29]:
FULL_FINETUNING = False
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [30]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [31]:
from seqeval.metrics import f1_score, accuracy_score

In [32]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print()

Epoch:   0%|          | 0/3 [1:59:01<?, ?it/s]


KeyboardInterrupt: 