In [None]:
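# Dependencies: uncomment and run once on a fresh environment (e.g. Colab).
# `%pip` installs into the environment of the running kernel.
# %pip install seqeval
# %pip install transformers
# %pip install torch==1.5.0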

In [None]:
import json
import os
import re
from ast import literal_eval
from collections import Counter, OrderedDict, defaultdict
from datetime import date, datetime

import boto
import boto3
import numpy as np
import pandas as pd
import s3fs
import torch
from nltk import sent_tokenize, word_tokenize
from seqeval.metrics import accuracy_score, performance_measure
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from tqdm import notebook, tqdm
from transformers import (
    AdamW,
    BertConfig,
    BertForTokenClassification,
    BertModel,
    BertTokenizerFast,
    get_linear_schedule_with_warmup,
)

# Register tqdm with pandas so that Series gain `progress_map`, which is used
# further down when the stringified list columns are parsed.
tqdm.pandas()

What's your GPU?

In [None]:
# Shell escape: print the NVIDIA driver / GPU status of this machine.
!nvidia-smi

Set up transformer classes

In [None]:
# Bind the concrete transformer classes to neutral aliases; the cells below
# only refer to these aliases when building the config, model and tokenizer.
config_cls, token_class_model_cls, tokenizer_cls = (
    BertConfig,
    BertForTokenClassification,
    BertTokenizerFast,
)

Define seeds for reproducibility

In [None]:
# Seed PyTorch and NumPy with one shared value, then select the compute
# device: the first CUDA GPU when one is available, otherwise the CPU.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
device = (
    torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
)
device

Load GOV.UK NER data

Paths and filenames...

In [None]:
# Execution environment selector; the cells below branch on the values
# "AWS", "COLAB" and "LOCAL".
system = "AWS"

In [None]:
# Resolve DATA_DIR for the selected execution environment and fail early,
# with an explicit message, when the environment is unknown or unconfigured.
if system == "AWS":
    # Data lives in S3; `fs` is reused later to read the label map and to
    # write model checkpoints.
    fs = s3fs.S3FileSystem()
    bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{bucket}/model-data/govner-data"
    print(fs.ls(DATA_DIR))
elif system == "COLAB":
    # Imported lazily: presumably only importable inside a Colab runtime.
    from google.colab import drive

    drive.mount("/content/gdrive")
    DATA_DIR = os.path.join(
        "/content/gdrive/My Drive", "transformer_fun/govner/roberta"
    )
elif system == "LOCAL":
    DATA_DIR = os.getenv("DATA_DIR")
    if not DATA_DIR:
        # Without this check DATA_DIR would be None and the failure would
        # only surface later as an obscure os.path.join TypeError.
        raise EnvironmentError(
            "system is 'LOCAL' but the DATA_DIR environment variable is not set"
        )
else:
    # Previously an unknown value fell through and the next line raised a
    # NameError because DATA_DIR was never assigned.
    raise ValueError(
        f"Unknown system {system!r}; expected 'AWS', 'COLAB' or 'LOCAL'"
    )
DATA_DIR

Read the data...

In [None]:
# Locations of the label map and of the sampled line-by-line NER data, both
# directly under DATA_DIR.
label_map_file = os.path.join(DATA_DIR, "label_map_12062020_more_ents.json")
ner_data_file = os.path.join(
    DATA_DIR,
    "line_by_line_NER_data_sampled_12062020_more_ents.csv",
)
ner_data_file

In [None]:
# The file carries a .csv extension but is read as tab-separated.
df = pd.read_csv(ner_data_file, sep="\t", low_memory=False)

In [None]:
# Quick look at the loaded frame: its dimensions, then the first rows.
# (A stray bare `f` expression used to sit between these two lines and raised
# a NameError on a fresh kernel.)
print(df.shape)
df.head()

In [None]:
# Load the label-name -> label-id mapping from JSON. On AWS the file is
# opened in binary mode through the S3 filesystem handle, otherwise it is
# read as text from the local filesystem.
with (
    fs.open(label_map_file, "rb")
    if system == "AWS"
    else open(label_map_file, "r")
) as f:
    label_name_map = json.load(f)
print(label_name_map)

literal_eval for list values

In [None]:
# The token and label columns hold Python list literals stored as strings;
# parse each cell back into a list, showing a progress bar per column.
for col in ("text_token", "label_list"):
    print(col)
    df[col] = df[col].progress_map(literal_eval)

# Inverse mapping (value -> key) of label_name_map, used to turn numeric ids
# back into tag names.
label_map = dict(zip(label_name_map.values(), label_name_map.keys()))
label_map

In [None]:
# Sanity checks: rows whose token and label lists differ in length, rows
# whose text duplicates an earlier row, and the number of distinct base paths.
print(df.loc[df["text_token"].str.len() != df["label_list"].str.len()].shape)
print(df.loc[df["text"].duplicated()].shape)
print(df["base_path"].nunique())

Subsample, if necessary

In [None]:
# Fraction of rows to keep, and the truncated product of row count and fraction.
frac = 0.9
int(len(df) * frac)

In [None]:
# Seeded random subsample of the rows (this also shuffles their order).
# NOTE(review): `df` is overwritten from itself, so re-running this cell
# without reloading the data subsamples the already reduced frame again.
df = df.sample(frac=frac, random_state=RANDOM_SEED)
df.shape

#### Create torch dataset

In [None]:
class NERDataset(Dataset):
    """Token-classification dataset over sentences that are already split into words.

    Each item tokenizes one sentence word by word, repeats the word's label
    for every sub-token the tokenizer produces, wraps the result in the
    tokenizer's CLS/SEP ids (labelled "O"), truncates to ``max_len`` and
    right-pads ids, attention mask and labels with zeros.

    Args:
        sentences: indexable collection of word lists.
        labels: indexable collection of label-name lists, aligned with
            ``sentences`` word for word.
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``,
            ``cls_token_id`` and ``sep_token_id``.
        label_map: mapping from label name to integer id; must contain "O".
        max_len: fixed length of every returned tensor.
    """

    def __init__(self, sentences, labels, tokenizer, label_map, max_len=256):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_len = max_len

    def _tokenize_and_realign_labels(self, sent, labels):
        """Return (token ids, label names) of equal length, including CLS/SEP."""
        # Two positions are reserved for the CLS and SEP markers.
        budget = self.max_len - 2
        new_labels = []
        tokenized = []
        for tok, label in zip(sent, labels):
            if len(tokenized) >= budget:
                # Anything further would be cut off by the truncation below.
                break
            # Encode the word itself rather than a one-element list: how a
            # list input is interpreted differs between tokenizer
            # implementations, whereas a plain string is always split into
            # sub-tokens.
            piece_ids = self.tokenizer.encode(tok, add_special_tokens=False)
            tokenized.extend(piece_ids)
            # Every sub-token inherits the label of the word it came from.
            new_labels.extend([label] * len(piece_ids))
        new_labels = ["O"] + new_labels[:budget] + ["O"]
        tokenized = (
            [self.tokenizer.cls_token_id]
            + tokenized[:budget]
            + [self.tokenizer.sep_token_id]
        )
        assert len(tokenized) == len(new_labels)
        return tokenized, new_labels

    def _pad(self, seq):
        """Right-pad ``seq`` with zeros up to ``max_len``."""
        # NOTE(review): the same 0 is used for input ids, attention mask and
        # label ids, so label id 0 presumably denotes padding - confirm
        # against the label map.
        return seq + [0] * (self.max_len - len(seq))

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, i):
        """Build the padded tensors (plus the raw text) for sentence ``i``."""
        sentence_i = self.sentences[i]
        label_i = self.labels[i]

        assert len(sentence_i) == len(label_i)

        input_ids, labels = self._tokenize_and_realign_labels(sentence_i, label_i)
        attention_mask = [1] * len(input_ids)
        label_num_i = [self.label_map[l] for l in labels]
        assert len(label_num_i) == len(input_ids) == len(attention_mask)

        return {
            # NOTE(review): a variable-length word list; check how the
            # DataLoader's collate function batches this field before
            # relying on it downstream.
            "sentence": sentence_i,
            "input_ids": torch.tensor(self._pad(input_ids)),
            "attention_mask": torch.tensor(self._pad(attention_mask)),
            "text": " ".join(self.sentences[i]),
            "labels": torch.tensor(self._pad(label_num_i), dtype=torch.long),
        }


def create_data_loader(
    df,
    text_column,
    target_column,
    tokenizer,
    label_map,
    max_len,
    batch_size,
    shuffle=False,
    num_workers=4,
):
    """Wrap two columns of ``df`` in an ``NERDataset`` and return a DataLoader.

    Args:
        df: frame holding one sentence per row.
        text_column: name of the column with the word lists.
        target_column: name of the column with the aligned label-name lists.
        tokenizer: tokenizer handed to ``NERDataset``.
        label_map: label-name -> id mapping handed to ``NERDataset``.
        max_len: fixed sequence length of every item.
        batch_size: number of items per batch.
        shuffle: reshuffle the data at every epoch. Defaults to ``False``,
            which keeps the previous fixed-order behaviour; pass ``True``
            for a training loader.
        num_workers: number of loader worker processes (previously fixed at 4).

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    ds = NERDataset(
        # to_numpy() gives positional indexing regardless of the frame's
        # (possibly shuffled) index.
        sentences=df[text_column].to_numpy(),
        labels=df[target_column].to_numpy(),
        tokenizer=tokenizer,
        label_map=label_map,
        max_len=max_len,
    )

    return DataLoader(
        ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers
    )

#### Define metrics

In [None]:
def get_entities(llist):
    """Collapse a flat tag sequence into ``(tag, start, end)`` spans.

    Consecutive positions carrying the same tag form one span; ``start`` and
    ``end`` are inclusive indices into ``llist``. Spans tagged "O" are
    dropped. No B-/I- prefix handling is applied, so two adjacent entities of
    the same type are reported as a single span.

    Args:
        llist: sequence of tag names.

    Returns:
        List of ``(tag, start, end)`` tuples in order of appearance.
    """
    spans = []
    for position, tag in enumerate(llist):
        # Compare against the last open span instead of a "" sentinel: the
        # sentinel collided with an empty-string first tag and raised an
        # IndexError on the still-empty span list.
        if spans and spans[-1][0] == tag:
            spans[-1][2] = position
        else:
            spans.append([tag, position, position])
    return [tuple(span) for span in spans if span[0] != "O"]


def f1_score(y_true, y_pred):
    """Entity-level F1 between two flat tag sequences.

    A predicted span counts as correct only when its tag, start and end all
    match a true span. Returns 0 when precision and recall are both zero.
    """
    gold = set(get_entities(y_true))
    predicted = set(get_entities(y_pred))

    # Spans present in both sets are the exact matches.
    hits = len(gold.intersection(predicted))

    if predicted:
        precision = hits / len(predicted)
    else:
        precision = 0
    if gold:
        recall = hits / len(gold)
    else:
        recall = 0

    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0


def precision_score(y_true, y_pred):
    """Entity-level precision: exactly matched spans / predicted spans.

    Returns 0 when nothing was predicted.
    """
    gold = set(get_entities(y_true))
    predicted = set(get_entities(y_pred))

    if not predicted:
        return 0
    return len(gold.intersection(predicted)) / len(predicted)


def recall_score(y_true, y_pred):
    """Entity-level recall: exactly matched spans / true spans.

    Returns 0 when there are no true spans.
    """
    gold = set(get_entities(y_true))
    predicted = set(get_entities(y_pred))

    if not gold:
        return 0
    return len(gold.intersection(predicted)) / len(gold)


def classification_report(y_true, y_pred, digits=2):
    """Build a text report of entity-level precision, recall and F1 per type.

    One row is emitted per entity type that occurs in ``y_true`` (types that
    are only predicted get no row), followed by three summary rows:

    * ``micro avg``    - computed over all spans pooled together;
    * ``macro avg``    - unweighted mean of the per-type scores;
    * ``weighted avg`` - mean of the per-type scores weighted by support.

    Earlier versions printed the support-weighted mean under the
    ``macro avg`` heading; that figure now appears as ``weighted avg``.
    When ``y_true`` contains no entities the summary rows report zeros
    instead of raising.

    Args:
        y_true: flat sequence of true tag names.
        y_pred: flat sequence of predicted tag names.
        digits: number of decimals for the scores.

    Returns:
        The formatted report as a single string.
    """
    true_entities = set(get_entities(y_true))
    pred_entities = set(get_entities(y_pred))

    # Group the (start, end) spans by entity type.
    true_by_type = defaultdict(set)
    pred_by_type = defaultdict(set)
    name_width = 0
    for tag, start, end in true_entities:
        true_by_type[tag].add((start, end))
        name_width = max(name_width, len(tag))
    for tag, start, end in pred_entities:
        pred_by_type[tag].add((start, end))

    avg_headings = ("micro avg", "macro avg", "weighted avg")
    width = max(name_width, max(len(h) for h in avg_headings), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = "{:>{width}s} " + " {:>9}" * len(headers)
    report = head_fmt.format("", *headers, width=width)
    report += "\n\n"

    row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n"

    ps, rs, f1s, s = [], [], [], []
    for type_name, true_spans in true_by_type.items():
        pred_spans = pred_by_type.get(type_name, set())
        nb_correct = len(true_spans & pred_spans)
        nb_pred = len(pred_spans)
        nb_true = len(true_spans)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(
            type_name, p, r, f1, nb_true, width=width, digits=digits
        )

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    report += "\n"

    total_support = int(np.sum(s))

    # Micro average from the span sets already in hand, so the tag sequences
    # are not scanned again for each of the three scores.
    total_correct = len(true_entities & pred_entities)
    micro_p = total_correct / len(pred_entities) if pred_entities else 0
    micro_r = total_correct / len(true_entities) if true_entities else 0
    micro_f1 = (
        2 * micro_p * micro_r / (micro_p + micro_r) if micro_p + micro_r > 0 else 0
    )

    if ps:
        macro = [float(np.mean(scores)) for scores in (ps, rs, f1s)]
        weighted = [float(np.average(scores, weights=s)) for scores in (ps, rs, f1s)]
    else:
        # No true entities: np.average would fail on weights summing to zero.
        macro = [0.0, 0.0, 0.0]
        weighted = [0.0, 0.0, 0.0]

    for heading, scores in zip(
        avg_headings, ((micro_p, micro_r, micro_f1), macro, weighted)
    ):
        report += row_fmt.format(
            heading, *scores, total_support, width=width, digits=digits
        )

    return report


# Smoke-test the metric helpers on a toy flat tag sequence.
# The predicted sequence used to be three items shorter than the true one;
# it is now padded with "O" (which get_entities ignores) so both sequences
# cover the same positions while the predicted spans stay the same.
y_true = ["a", "a", "b", "o", "o", "i", "a"]
y_pred = ["a", "a", "O", "o", "O", "O", "O"]
assert len(y_true) == len(y_pred)
print(classification_report(y_true, y_pred, digits=4))
print(accuracy_score(y_true, y_pred))

#### Define a model

In [None]:
## model name
# Pretrained checkpoint to fine-tune. Candidates noted so far:
# 'bert-base-cased' "roberta-base"
# NOTE(review): the classes bound earlier are BERT-specific - confirm they
# are swapped as well before switching to a RoBERTa checkpoint.
MODEL_NAME = "bert-base-cased"
MODEL_NAME

#### Define model config

hidden_size (int, optional, defaults to 768) – Dimensionality of the encoder layers and the pooler layer.

num_hidden_layers (int, optional, defaults to 12) – Number of hidden layers in the Transformer encoder.

num_attention_heads (int, optional, defaults to 12) – Number of attention heads for each attention layer in the Transformer encoder.

intermediate_size (int, optional, defaults to 3072) – Dimensionality of the “intermediate” (i.e., feed-forward) layer in the Transformer encoder.

hidden_act (str or function, optional, defaults to “gelu”) – The non-linear activation function (function or string) in the encoder and pooler. If string, “gelu”, “relu”, “swish” and “gelu_new” are supported.

hidden_dropout_prob (float, optional, defaults to 0.1) – The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.

attention_probs_dropout_prob (float, optional, defaults to 0.1) – The dropout ratio for the attention probabilities.

max_position_embeddings (int, optional, defaults to 512) – The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).

initializer_range (float, optional, defaults to 0.02) – The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

layer_norm_eps (float, optional, defaults to 1e-12) – The epsilon used by the layer normalization layers.

In [None]:
# Load the pretrained configuration with one output class per entry of
# label_map, then build the token-classification model from it.
config = config_cls.from_pretrained(MODEL_NAME, num_labels=len(label_map))
model = token_class_model_cls.from_pretrained(MODEL_NAME, config=config)
# Inspect the classification head, the parameter count and the dropout layer.
print(list(model.classifier.named_parameters()))
print(model.num_parameters())
print(model.dropout)

In [None]:
# Move the model to the device selected earlier (GPU when available).
model = model.to(device)

#### Define a tokenizer

In [None]:
# Load the tokenizer for the same checkpoint name as the model.
tokenizer = tokenizer_cls.from_pretrained(MODEL_NAME)

#### Define an optimizer and learning parameters

In [None]:
# Optimizer hyper-parameters.
lr = 2e-5
eps = 1e-8

# True: update every weight of the network. False: train only the
# classification head on top of the encoder.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters exempt from weight decay: biases and LayerNorm weights.
    # The earlier "gamma"/"beta" patterns do not occur in the parameter names
    # of the Hugging Face BERT implementation, so LayerNorm weights were
    # never exempted.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            # The option key is "weight_decay"; the former "weight_decay_rate"
            # key was not recognised by the optimizer, so the default decay
            # of 0.0 was applied to every group.
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]


optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)

#### Data splitting

In [None]:
# 80/10/10 split: hold out 20% of the rows, then halve that hold-out into a
# validation set and a test set. Both splits are seeded with RANDOM_SEED.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

# `df_test` is intentionally overwritten with its second half here.
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

df_train.shape, df_val.shape, df_test.shape

#### Define parameters for data input

In [None]:
# Batch geometry and the columns that feed the dataset.
BATCH_SIZE = 24
MAX_LEN = 256
text_column = "text_token"
target_column = "label_list"

# One loader per split, all built with identical settings. The name -> id
# mapping (label_name_map) is what the dataset uses to encode labels.
train_data_loader, val_data_loader, test_data_loader = [
    create_data_loader(
        split_df,
        text_column,
        target_column,
        tokenizer,
        label_name_map,
        MAX_LEN,
        BATCH_SIZE,
    )
    for split_df in (df_train, df_val, df_test)
]

len(train_data_loader), len(train_data_loader) * BATCH_SIZE, df_train.shape

#### Scheduling

In [None]:
# Number of passes over the training data.
epochs = 10
# Intended gradient-clipping norm.
# NOTE(review): confirm the training function actually receives this value.
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_data_loader) * epochs

# Create the learning rate scheduler.
# No warm-up steps are requested; the schedule spans all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

#### Define functions for training and evaluation

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler, max_grad_norm=1.0):
    """Run one training pass over ``data_loader``.

    Args:
        model: token-classification model; called with ``input_ids``,
            ``attention_mask`` and ``labels`` and expected to return the loss
            at index 0 and the logits at index 1.
        data_loader: iterable of batches with "input_ids", "attention_mask"
            and "labels" tensors.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        max_grad_norm: gradients are clipped to this total norm before every
            optimizer step. Defaults to 1.0, the value that used to be
            hard-coded in the body.

    Returns:
        ``(true_labels, predictions, mean_loss)`` where the first two hold
        one row of label ids per sentence seen during the epoch.
    """
    model = model.train()
    train_loss = []
    predictions = []
    true_labels = []

    for d in notebook.tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )

        loss = outputs[0]
        train_loss.append(loss.item())

        # Detach before leaving the graph so that bookkeeping keeps no
        # gradient history alive.
        logits = outputs[1].detach().cpu().numpy()
        label_ids = labels.to("cpu").numpy()

        # The highest-scoring class along the last axis is the prediction.
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

        optimizer.step()
        scheduler.step()
        # Clear the gradients so the next batch starts from zero.
        optimizer.zero_grad()

    return true_labels, predictions, np.mean(train_loss)


def eval_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Returns ``(true_labels, predictions, mean_loss)``: one row of label ids
    per sentence for the first two, and the mean of the per-batch losses.
    """
    model = model.eval()
    batch_losses, predictions, true_labels = [], [], []

    for batch in notebook.tqdm(data_loader):
        gold = batch["labels"].to(device)

        # Forward pass only; no gradient bookkeeping is needed here.
        with torch.no_grad():
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=gold,
            )

        # Index 0 holds the loss, index 1 the logits.
        batch_losses.append(outputs[0].item())

        predicted_ids = np.argmax(outputs[1].detach().cpu().numpy(), axis=2)
        predictions.extend(list(row) for row in predicted_ids)
        true_labels.extend(gold.to("cpu").numpy())

    return true_labels, predictions, np.mean(batch_losses)


def get_stats(predictions_local, true_labels_local, loss, loss_name="Train"):
    """Print the loss, token accuracy and entity report; return the accuracy.

    Label ids are translated to tag names through the notebook-level
    ``label_map``. Positions whose true tag is "PAD" are excluded from both
    the predicted and the true sequence. The accuracy is returned as a
    percentage.
    """
    # Predicted tags, kept only where the aligned true tag is not padding.
    pred_tags = []
    for pred_row, true_row in zip(predictions_local, true_labels_local):
        for pred_id, true_id in zip(pred_row, true_row):
            if label_map[true_id] != "PAD":
                pred_tags.append(label_map[pred_id])

    # True tags with the padding positions removed.
    valid_tags = []
    for true_row in true_labels_local:
        for true_id in true_row:
            tag = label_map[true_id]
            if tag != "PAD":
                valid_tags.append(tag)

    acc = accuracy_score(valid_tags, pred_tags) * 100
    print(f"{loss_name} loss {loss} accuracy {acc}\n")
    print(classification_report(valid_tags, pred_tags, digits=4))

    return acc

In [None]:
def model_saver(model, filename):
    """Write the model's state dict to ``filename``.

    On AWS the target is opened through the notebook-level S3 filesystem
    handle ``fs``; otherwise ``filename`` is passed to ``torch.save`` directly.
    """
    if system != "AWS":
        torch.save(model.state_dict(), filename)
        return

    with fs.open(filename, "wb") as remote_file:
        torch.save(model.state_dict(), remote_file)

In [None]:
# Release cached GPU memory that PyTorch is holding but not using, before
# the training run starts.
torch.cuda.empty_cache()

In [None]:
# Checkpoint file stem encoding the run settings, joined by underscores:
# model, batch size, max length, train rows, test rows, lr, eps, epochs.
name_for_saving = "_".join(
    str(part)
    for part in (
        MODEL_NAME,
        BATCH_SIZE,
        MAX_LEN,
        df_train.shape[0],
        df_test.shape[0],
        lr,
        eps,
        epochs,
    )
)
name_for_saving

In [None]:
# Per-epoch metric history and the best validation accuracy seen so far.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    print("-" * 10)

    # One optimisation pass over the training data, then report its metrics.
    true_labels, predictions, train_loss = train_epoch(
        model, train_data_loader, optimizer, device, scheduler
    )
    train_acc = get_stats(predictions, true_labels, train_loss, "Train")

    # Evaluate on the validation split; the same names are reused for its
    # labels and predictions.
    true_labels, predictions, val_loss = eval_model(model, val_data_loader, device)
    val_acc = get_stats(predictions, true_labels, val_loss, "Val")

    history["train_acc"].append(train_acc)
    history["train_loss"].append(train_loss)
    history["val_acc"].append(val_acc)
    history["val_loss"].append(val_loss)

    # Checkpoint only when validation accuracy improves; the file name is
    # fixed, so each improvement overwrites the previous checkpoint.
    if val_acc > best_accuracy:
        filename = os.path.join(DATA_DIR, f"{name_for_saving}_model_state.bin")
        model_saver(model, filename)
        best_accuracy = val_acc