# SLU Models Slot Filling

In [1]:
__author__ = "Adrian Sarno, Jennifer Arnold"
__version__ = "CS224u, Stanford, Spring 2020"

In [2]:
# Set all the random seeds for reproducibility. Only the
# system and torch seeds are relevant for this notebook.
import utils
utils.fix_random_seeds()

In [3]:
# Python imports
import os
import numpy as np
from sklearn.metrics import classification_report
import logging
# Only errors reach the root logger, which keeps library warnings out of the
# notebook output. setLevel() is the supported API: besides storing the level
# it invalidates the logging module's cached "is this level enabled" lookups,
# which a direct write to `logger.level` does not do.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

In [4]:
# torch imports
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

In [5]:
# HuggingFace import
from transformers import BertTokenizer, BertModel, BertPreTrainedModel

In [6]:
# local imports
import atis
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNClassifierModel

In [7]:
# CUDA test: print the Python version, the torch version and whether CUDA is
# available; when it is, also print the name of GPU 0.
import sys; print(sys.version)
import torch; print(torch.__version__, torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]
1.4.0 True
GeForce GTX 1080


In [8]:
# Name of the pretrained checkpoint loaded by both the tokenizer and the encoder below.
hf_weights_name = 'bert-base-cased'
# Alternative checkpoint: hf_weights_name = 'bert-base-uncased'
# (in this case the tokenizer does not split into subwords so often)

# Module-level tokenizer and encoder; batch_encoder_vectorizer and
# word_start_label_aligner read these globals directly.
hf_tokenizer = BertTokenizer.from_pretrained(hf_weights_name)
hf_model = BertModel.from_pretrained(hf_weights_name)

In [9]:
# Relative path of the ATIS data directory; passed to the atis readers and to atis.experiment.
ATIS_HOME = os.path.join("data", "atis")

* * *

### Featurization

#### Batching (normalizing sentence lengths)

In [10]:
def batch_encoder_vectorizer(input_sentences, max_length=None):
    """
    Tokenize a batch of sentences and embed them with the pretrained encoder.

    This function accomplishes two tasks:
    1.  tokenization and sentence-length normalization (padding), using the
        module-level `hf_tokenizer`
    2.  featurization: it calls the module-level `hf_model` to convert the
        token ids to embeddings, without tracking gradients

    Parameters
    ----------
    input_sentences : list of str
        The sentences to encode.
    max_length : int or None
        Forwarded unchanged to the tokenizer's `batch_encode_plus`.

    Returns
    -------
    final_hidden_states : np.ndarray
        First output of the encoder (one embedding per token of every padded sentence).
    attention_mask : np.ndarray
        The tokenizer's attention mask for the padded batch.
    input_ids : np.ndarray
        The tokenizer's token ids for the padded batch.
    """

    # tokenization, encoding and sentence-length normalization
    tokenizer_output = hf_tokenizer.batch_encode_plus(
        input_sentences,
        max_length=max_length,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True)

    input_ids = tokenizer_output['input_ids']
    attention_mask = tokenizer_output['attention_mask']

    input_token_ids = torch.tensor(input_ids)
    input_mask = torch.tensor(attention_mask)

    # featurization
    with torch.no_grad():
        # Take the first output by index rather than unpacking exactly two
        # values: only the token-level hidden states are used, and indexing
        # keeps working when the model returns a different number of outputs
        # (e.g. extra hidden states/attentions) or a dict-like output object.
        final_hidden_states = hf_model(
            input_token_ids, attention_mask=input_mask)[0]

    # convert to numpy to match the type of all other results (all numpy)
    final_hidden_states = final_hidden_states.detach().cpu().numpy()

    return final_hidden_states, np.array(attention_mask), np.array(input_ids)

* * *

#### Modeling

In [29]:
class TorchShallowClassifierModel(nn.Module):
    """
    Dropout followed by a single linear layer, applied to the last dimension
    of the input embeddings.

    Parameters
    ----------
    embed_dim : int
        Size of the last dimension of the input embeddings.
    output_dim : int
        Number of output classes (size of the last dimension of the logits).
    dropout_prob : float
        Dropout probability applied to the embeddings before the linear layer.
    """
    def __init__(self,
            embed_dim,
            output_dim,
            dropout_prob):
        super(TorchShallowClassifierModel, self).__init__()

        self.embed_dim = embed_dim
        self.output_dim = output_dim

        # Graph
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier_layer = nn.Linear(embed_dim, output_dim)

        # `xavier_uniform_` is the in-place initializer; the name without the
        # trailing underscore is deprecated and only forwards to this one.
        torch.nn.init.xavier_uniform_(self.classifier_layer.weight)
        if self.classifier_layer.bias is not None:
            self.classifier_layer.bias.data.zero_()

    def forward(self, X):
        """
        X : pair [embeddings, attention_mask]
            Only the embeddings are used; the mask is accepted so that this
            model shares its input format with the tagger classes.

        Returns the logits, with the same leading dimensions as the embeddings
        and `output_dim` as the last dimension.
        """
        # separate the feature vectors (embeddings) from the attention_mask
        X, attention_mask = X

        X = self.dropout(X)
        logits = self.classifier_layer(X)

        return logits


In [85]:
from torch_model_base import TorchModelBase
from utils import progress_bar

class TorchShallowSequenceTagger(TorchModelBase):
    """
    Token-level (sequence tagging) classifier on top of pre-computed embeddings.

    Featurization:
        Takes the embeddings already pre-computed.

    Classification:
        The simplest token classifier uses just a linear layer.
        The Pytorch linear layer can take as input a tensor of any number of dimensions
        and only the last dimension needs to be specified as input dimension.

        https://pytorch.org/docs/master/nn.html#linear

    Configuration
    -------------
    `config` is a dict with the keys "input_embedding_size",
    "hidden_dropout_prob", "batch_size", "lr", "l2_strength", "max_iter",
    "device" and, optionally, "class_weights" (one weight per class, used by
    the cross-entropy loss).

    NOTE(review): `self.optimizer`, `self.errors` and `self.dev_predictions`
    are not defined in this class; they are presumably provided by
    `TorchModelBase` -- confirm against that module.
    """
    def __init__(self, config, **kwargs):
        super(TorchShallowSequenceTagger, self).__init__(**kwargs)

        self.config = config
        self.input_embedding_size = config["input_embedding_size"]
        self.hidden_dropout_prob = config["hidden_dropout_prob"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.l2_strength = config["l2_strength"]
        self.max_iter = config["max_iter"]
        self.device = config["device"]
        self.class_weights = config.get("class_weights", None)
        if self.class_weights is not None:
            # Store the tensor on the instance: the loss needs a tensor, and
            # assigning the conversion to a local would silently discard it.
            self.class_weights = torch.as_tensor(
                self.class_weights, dtype=torch.float)

    def define_graph(self):
        """
        This is a shallow model, so it does not really define a graph here
        but it instantiates a model class with the classifier top.

        Requires `self.class2index`, which is set by `compute_class2index`
        (called from `fit`).
        """
        self.num_classes = len(self.class2index)   # class2index is set in fit()
        print(f"define_graph: num_classes: {self.num_classes}")
        return TorchShallowClassifierModel(
            self.input_embedding_size,
            self.num_classes,
            self.hidden_dropout_prob)

    def forward(self, X=None):
        """
        Run the classification model on `X`.

        X : pair [token embeddings, attention_mask], passed unchanged to the
            underlying model.

        Returns a 1-tuple `(logits,)`. The loss is not computed here; see
        `compute_loss`.
        """
        # Call the module itself (not `.forward`) so that torch's module call
        # machinery, including any registered hooks, is used.
        logits = self.model(X)
        outputs = (logits,)

        return outputs

    def compute_loss(self, logits, attention_mask, labels):
        """
        Cross-entropy loss between `logits` and `labels`.

        logits : tensor whose last dimension is `self.num_classes`
        attention_mask : tensor or None
            When given, positions where the mask is not 1 get the loss's
            `ignore_index` as label, so they do not contribute to the loss.
        labels : tensor of class indices with one entry per logits row once
            flattened; padded positions may hold any value when they are
            masked out.
        """
        weight = self.class_weights
        if weight is not None:
            # The weights must live on the same device as the logits.
            weight = torch.as_tensor(
                weight, dtype=torch.float, device=logits.device)
        loss_fct = CrossEntropyLoss(weight=weight)
        active_logits = logits.view(-1, self.num_classes)

        if attention_mask is not None:
            # computes a boolean mask (flat boolean array), one element for each token of each row
            active_loss_mask = attention_mask.view(-1) == 1
            active_labels = torch.where(
                active_loss_mask, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
            )
        else:
            active_labels = labels.view(-1)

        # computes the loss between labels and logits
        loss = loss_fct(active_logits, active_labels)

        return loss

    def fit(self, X, y, **kwargs):
        """Standard `fit` method.

        fit() expects embeddings in X and strings in y.
        The class itself is in charge of encoding the labels.

        Parameters
        ----------
        X : [embeddings, attention_mask]
        y : array-like, a list of lists of string [['O', 'B-fromcity']]
        kwargs : dict
            For passing other parameters. If 'X_dev' is included,
            then predictions on it are stored every 10 epochs; use
            `dev_iter` to control this number.

        Returns
        -------
        self

        """

        ################################################################
        # Model definition
        ################################################################

        # Graph: built only once; the label mapping is computed at the same time
        if not hasattr(self, "model"):

            self.compute_class2index(y)  # expects strings, must run before tensorizing y
            self.model = self.define_graph()

        # Prime the model for training
        self.model.to(self.device)
        self.model.train()

        optimizer = self.optimizer(
            self.model.parameters(),
            lr=self.lr,
            weight_decay=self.l2_strength)

        ################################################################
        # Data
        ################################################################
        # separate the feature vectors (embeddings) from the attention_mask
        X, attention_mask = X
        print(f"X: {type(X)},     attention_mask: {type(attention_mask)},    y: [{len(y)}, {len(y[0])}]")

        # Incremental predictions: X_dev is itself a pair [X_dev, attention_mask_dev]
        # and is handed unchanged to predict(); no dev loss is computed.
        X_dev = kwargs.get('X_dev')
        dev_iter = kwargs.get('dev_iter', 10)

        # encode labels (label vectorization). must run before tensorizing y
        y = self.encode_labels(y)
        y = self.pad_to_max_length(y, X.shape[:2]) # y must have the shape of first 2 dims of X

        # cast data into PyTorch tensors
        X = torch.FloatTensor(X)
        attention_mask = torch.tensor(attention_mask, dtype=torch.bool)
        y = torch.tensor(y, dtype=torch.long)

        # Wrap data into a dataset and use a Dataloader for batching
        dataset = torch.utils.data.TensorDataset(X, attention_mask, y)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size,
            shuffle=True, pin_memory=True)

        ################################################################
        # Training process (Gradient Descent)
        ################################################################
        for iteration in range(1, self.config["max_iter"]+1):
            epoch_error = 0.0
            for i, (X_batch, m_batch, y_batch) in enumerate(dataloader):

                # load the batch input tensors onto the training device
                X_batch = X_batch.to(self.device, non_blocking=True)
                m_batch = m_batch.to(self.device, non_blocking=True)

                # run the model on the batch
                logits = self.model([X_batch, m_batch])

                # load the batch label tensors onto the training device
                y_batch = y_batch.to(self.device, non_blocking=True)

                # compute the loss, the gradients and update the weights
                err = self.compute_loss(logits, m_batch, y_batch)
                epoch_error += err.item()
                optimizer.zero_grad()
                err.backward()
                optimizer.step()

            # Incremental predictions where possible:
            if X_dev is not None and iteration > 0 and iteration % dev_iter == 0:
                self.dev_predictions[iteration] = self.predict(X_dev)
                # predict() switches the model to eval mode; switch back
                self.model.train()
            self.errors.append(epoch_error)
            progress_bar(
                "Finished epoch {} of {}; error is {}".format(
                    iteration, self.config["max_iter"], epoch_error))
        return self

    def predict_flat(self, X):
        """Predicted classes for the examples in `X`. In flat format for metric functions.

        Parameters
        ----------
        X : pair [inputs, attention_mask], as accepted by `predict`

        Returns
        -------
        probs_flat : numpy.array(num_unmasked_tokens, num_classes)
        preds_flat : numpy.array(num_unmasked_tokens)

        Only the positions where the attention mask equals 1 are kept.
        """

        probs, preds, attention_mask = self.predict(X)

        # one boolean per token position
        active = np.asarray(attention_mask).reshape(-1) == 1

        preds_flat = preds.reshape(-1)[active]

        # Keep the class axis while flattening: the mask has one entry per
        # token, not one per (token, class) pair.
        probs_flat = probs.reshape(-1, probs.shape[-1])[active]

        return probs_flat, preds_flat

    def predict(self, X):
        """Predicted classes for the examples in `X`.

        Parameters
        ----------
        X : pair [inputs, attention_mask]
            The mask is not used to compute the predictions here; it is
            returned unchanged so that callers can filter the padding.

        Returns
        -------
        probs: numpy.array(batch_size, max_sequence_length, num_classes)
        pred_class: numpy.array(batch_size, max_sequence_length) of class names
        attention_mask: the mask received in `X`

        """
        # get the attention_mask to return it
        _, attention_mask = X

        # compute probabilities
        probs = self.predict_proba(X)

        # compute predicted class, maximizing across the last dimension (classes)
        _, pred_class_idx = torch.max(probs, dim=-1)
        pred_class_idx = pred_class_idx.detach().cpu().numpy()

        # decode the class indices to class names (IOB_tag)
        preds = [[self.index2class[i] for i in row_class_idx]
                              for row_class_idx in pred_class_idx]
        preds = np.array(preds)

        return probs.detach().cpu().numpy(), preds, attention_mask

    def predict_proba(self, X):
        """Predicted probabilities for the examples in `X`.

        Parameters
        ----------
        X : pair [inputs, attention_mask]

        Returns
        -------
        torch.tensor(batch_size, max_sequence_length, num_classes),
        on `self.device`, computed without gradient tracking.

        """

        # Graph:
        if not hasattr(self, "model"):
            # self.class2index must be defined in this case
            self.model = self.define_graph()

        # prime the model for prediction-only mode
        self.model.eval()
        with torch.no_grad():
            self.model.to(self.device)

            # cast input data into PyTorch tensors
            x, attention_mask = X
            x, attention_mask = torch.tensor(x), torch.tensor(attention_mask, dtype=torch.bool)

            # load the input tensors onto the device
            x = x.to(self.device)
            attention_mask = attention_mask.to(self.device)

            # call forward
            output = self.forward(
                X=[x, attention_mask])

            logits = output[0]

            # normalize scores along the last dimension (classes)
            probs = nn.Softmax(dim=-1)(logits)

            return probs  # tensor (no_grad)

    def compute_class2index(self, y):
        """
        Build `self.class2index` and `self.index2class` from the labels in `y`.

        y: 2-D list of (lists of) strings (iob_tags, not indices)

        expects strings, must run before tensorizing y,
        must run before defining the graph because it determines
        the size of the network output (num_classes)

        Note:
        if the input type is incorrect and the resulting number of classes is
        incorrect (very high) it will likely cause a CUDA-OUT-OF-MEMORY error
        """

        # flat list of iob labels
        iob_labels = []
        for y_row in y:
            iob_labels.extend(y_row)

        # create mapping: classes are indexed in sorted order
        classes = sorted(set(iob_labels))
        self.class2index = dict(zip(classes, range(len(classes))))
        self.index2class = {i:c for c, i in self.class2index.items()}

    def encode_labels(self, y):
        """
        Map every label string in `y` to its class index.

        y: 2-D list of (lists of) strings (iob_tags, not indices)

        expects strings, must run before tensorizing y.
        Returns a list of lists of ints with the same (jagged) shape as `y`.
        """
        tag_id_matrix = []
        for iob_tags in y:
            tag_ids = [self.class2index[iob_tag] for iob_tag in iob_tags]
            tag_id_matrix.append(tag_ids)

        return  tag_id_matrix

    def pad_to_max_length(self, jagged_matrix, output_shape):
        """
        Copy the rows of `jagged_matrix` into a zero-filled array of shape
        `output_shape`; positions past the end of each row stay 0.
        """
        padded_matrix = np.zeros(shape=output_shape)
        for i, row in enumerate(jagged_matrix):
            padded_matrix[i, :len(row)] = row
        return padded_matrix

#### Batch Training and Prediction of the Shallow Slot Filling model (the Shallow model uses BERT embeddings without fine-tuning)
* * *

### Labelling: Alignment, Encoding, Normalize the length of the sequences, Class Weights

#### Compute class weights

In [86]:
def compute_class_weights(class_ids):
    """
    Compute one weight per class from the class frequencies.

    Parameters
    ----------
    class_ids : 1D sequence/array of non-negative integer class ids,
        one class id for each example

    Returns
    -------
    np.ndarray of shape (max(class_ids) + 1,)
        Entry `c` is the fraction of examples that do NOT belong to class `c`,
        so rare classes get weights close to 1 and frequent classes get
        smaller weights. (The fraction of examples that DO belong to each
        class is `1 - weights`.) An empty input yields an empty array.
    """
    class_ids = np.asarray(class_ids, dtype=np.int64).reshape(-1)
    n_examples = class_ids.shape[0]
    if n_examples == 0:
        return np.zeros(0)

    # encode the class_ids as one-hot: one row per example, one column per
    # class (ids start at 0, hence max + 1 columns), with a single 1 per row
    class_matrix = np.zeros(shape=(n_examples, class_ids.max() + 1))
    class_matrix[np.arange(n_examples), class_ids] = 1

    # set the weights as the fraction of negative labels (0) for each class (each column)
    class_weights = np.sum(class_matrix == 0, axis=0) / n_examples

    return class_weights

#### Label and Sub-token Alignment (call WordPiece for each token, output word_to_tok_map and aligned_labels)

In [87]:
def word_start_label_aligner(sentence, word_labels=None):
    """
    Aligns the IOB labels to the word-starting tokens in the list of
    sub-word tokens returned by the WordPiece tokenizer (`hf_tokenizer`).

    Parameters
    ----------
    sentence : str
        Words separated by single spaces.
    word_labels : list of str or None
        One label per word. When None, no labels are produced.

    Returns
    -------
    token_labels : list of str or None
        One label per sub-token, including "O" for the added "[CLS]" and
        "[SEP]" tokens: the first sub-token of every word gets the word's
        label, the rest get the padding label 'X'. None when `word_labels`
        is None.
    word_start_indices : list of int
        For every word, the index in `tokens` of its first sub-token.
    tokens : list of str
        "[CLS]", the sub-tokens of every word, then "[SEP]".
    """
    has_labels = word_labels is not None

    # Diagnostic only: report (but do not fail on) a word/label count mismatch.
    if has_labels and len(sentence.split()) != len(word_labels):
        print(f"sentence: {len(sentence.split())}, word_labels: {len(word_labels)}")
        print(f"sentence: {sentence.split()}, word_labels: {word_labels}")

    word_start_indices = []
    tokens = ["[CLS]"]
    token_labels = ["O"] if has_labels else None

    # NOTE(review): the words are obtained with split(' ') while the check
    # above counts them with split(); the two differ when the sentence has
    # leading, trailing or repeated spaces -- confirm the data never does.
    for word_idx, word in enumerate(sentence.split(' ')):
        word_start_indices.append(len(tokens))
        word_tokens = hf_tokenizer.tokenize(word)  # tokenize ONE word
        tokens.extend(word_tokens)
        if has_labels:
            token_labels.append(word_labels[word_idx])
            if len(word_tokens) > 1:
                token_labels.extend(["X"]*(len(word_tokens)-1))

    tokens.append("[SEP]")
    if has_labels:
        token_labels.append("O")
    return token_labels, word_start_indices, tokens

In [88]:
def sequence_tagging_label_aligner(sentences, labels):
    """
    Apply `word_start_label_aligner` to every (sentence, word_labels) pair.

    Returns three parallel lists with one entry per sentence:
    the token-level labels, the word-start indices and the tokens.
    """
    aligned = [
        word_start_label_aligner(sentence, word_labels)
        for sentence, word_labels in zip(sentences, labels)
    ]

    label_matrix = [token_labels for token_labels, _, _ in aligned]
    word_start_matrix = [word_starts for _, word_starts, _ in aligned]
    token_matrix = [tokens for _, _, tokens in aligned]

    return label_matrix, word_start_matrix, token_matrix

### Metrics

In [89]:
def flatten_predict_output(y_true, preds, attention_mask):
    """
    Flatten gold labels and predictions so they can be fed to a
    classification report for sequence tagging.

    y_true : list of list of strings (IOB_tags)
        Variable-length rows: token-aligned but not length-padded.

    preds : np.array(batch_size, max_sentence_length)
        Class predicted by the model for each token of each sentence,
        padding positions included.

    attention_mask : np.array(batch_size, max_sentence_length)
        Positions equal to 1 are real tokens; everything else is padding
        and is dropped from the predictions.

    Returns (y_flat, preds_flat): a flat list of gold labels and a flat
    array of the predictions at the unmasked positions.
    """
    # concatenate the per-sentence gold labels
    y_flat = [iob_tag for iob_tags in y_true for iob_tag in iob_tags]

    # keep only the predictions at non-padding positions
    keep = attention_mask.reshape(-1) == 1
    preds_flat = preds.reshape(-1)[keep]

    return y_flat, preds_flat

In [90]:
def sequence_tagging_classification_report(y, predict_output, digits=3):
    """
    Print a per-class classification report for a sequence-tagging prediction.

    y : non-padded token_label_matrix
        list of variable-length lists of strings (IOB_tags),
        token-aligned but not length-padded

    predict_output : sequence whose first three items are
        (probs, preds, attention_mask); preds and attention_mask are
        np.array(batch_size, max_sentence_length), and the mask is used
        to exclude the padding tokens

    digits : number of digits shown in the report

    Returns None; the report is printed.
    """
    _probs, preds, attention_mask = predict_output[:3]

    flat = flatten_predict_output(y, preds, attention_mask)
    report = classification_report(flat[0], flat[1], digits=digits)

    print(report)


In [91]:
def sequence_tagging_macro_f1(y, predict_output, digits=3):
    """
    Score a sequence-tagging prediction with `utils.safe_macro_f1`.

    y : non-padded token_label_matrix
        list of variable-length lists of strings (IOB_tags),
        token-aligned but not length-padded

    predict_output : sequence whose first three items are
        (probs, preds, attention_mask); preds and attention_mask are
        np.array(batch_size, max_sentence_length), and the mask is used
        to exclude the padding tokens

    digits : accepted for signature compatibility with the report function;
        not used here
    """
    _probs, preds, attention_mask = predict_output[:3]

    flat_gold, flat_pred = flatten_predict_output(y, preds, attention_mask)
    score = utils.safe_macro_f1(flat_gold, flat_pred)

    return score

### A feed-forward experiment with the ATIS module
* * *

It is straightforward to conduct experiments like the above using `atis.experiment`, which will enable you to do a wider range of experiments without writing or copy-pasting a lot of code. 

In [92]:
def sequence_tagging_fit(X,  y):
    """
    Build a TorchShallowSequenceTagger with a fixed configuration and fit it.

        X : contains the embeddings and the attention_mask
            X[0]: embeddings, array of shape [batch_size, max_sentence_length, 768]

            X[1]: attention_mask, array of shape [batch_size, max_sentence_length]
            it is used to mask the padding tokens in X
            each value is 0 or 1.

        y: list of lists of class strings (IOB_tags), one list per sentence;
            the tagger encodes and pads the labels itself

    Returns the fitted tagger.
    """

    # configures the sequence tagging layer
    sequence_tagging_config = {
        "input_embedding_size": 768,
        "hidden_dropout_prob": 0.4,
        "class_weights": None,
        "batch_size": 64,
        "lr": 1e-3,
        "l2_strength": 0,
        "max_iter": 50,   # keep small during debug
        # use the GPU when there is one, otherwise fall back to the CPU
        "device": "cuda" if torch.cuda.is_available() else "cpu"
    }

    # instantiates the network
    shallow_sf = TorchShallowSequenceTagger(sequence_tagging_config)

    # Fit
    return shallow_sf.fit(X, y)

In [93]:
def sequence_tagging_phi(sentences):
    """
    Batch feature function: tokenizes, encodes, pads and embeds `sentences`
    with `batch_encoder_vectorizer`, and returns the embeddings together with
    the attention mask (the token ids are dropped).
    """
    embeddings, mask, _token_ids = batch_encoder_vectorizer(sentences)
    return embeddings, mask

In [94]:
%%time 
# Run the shallow slot-filling experiment: features come from
# sequence_tagging_phi, labels are aligned to sub-tokens by
# sequence_tagging_label_aligner, and the model is trained by
# sequence_tagging_fit on the train reader and assessed on the dev reader.
exp_results = atis.experiment(
    ATIS_HOME,
    phi=None,
    batch_phi=sequence_tagging_phi,
    label_alignment_func=sequence_tagging_label_aligner,
    train_func=sequence_tagging_fit,
    train_reader=atis.train_reader, 
    assess_reader=atis.dev_reader, 
    class_func=atis.slot_filling_func,   # label selector
    metrics_report_func=sequence_tagging_classification_report,
    score_func=sequence_tagging_macro_f1,
    vectorize=False)  

define_graph: num_classes: 121
X: <class 'numpy.ndarray'>,     attention_mask: <class 'numpy.ndarray'>,    y: [4478, 16]
X: (4478, 63, 768),    attention_mask: (4478, 63),    y: (4478, 63)
X: torch.Size([4478, 63, 768]),    attention_mask: torch.Size([4478, 63]),    y: torch.Size([4478, 63])


Finished epoch 50 of 50; error is 4.8276680856943135

len(y_true): 500
preds.shape: (500, 47)
attention_mask.shape: (500, 47)
y_flat (flattened): 9348
preds_flat.shape (flattened): (23500,)
preds_flat.shape (masked): (9348,)
                              precision    recall  f1-score   support

             B-aircraft_code      0.000     0.000     0.000         1
              B-airline_code      1.000     0.778     0.875         9
              B-airline_name      0.969     1.000     0.984        62
              B-airport_code      1.000     0.750     0.857         4
              B-airport_name      1.000     0.750     0.857         4
 B-arrive_date.date_relative      0.000     0.000     0.000         2
      B-arrive_date.day_name      0.000     0.000     0.000        10
    B-arrive_date.day_number      0.500     0.250     0.333         4
    B-arrive_date.month_name      1.000     0.250     0.400         4
B-arrive_date.today_relative      0.000     0.000     0.000         1
      B-arrive_time.end_time      1.000     0.667     0.80

# Fine Tuning

In [95]:
class BertSequenceTaggingModel(nn.Module):
    """
    Pretrained BERT encoder followed by dropout and a linear layer that
    produces one vector of class logits per token.

    Parameters
    ----------
    output_dim : int
        Number of output classes.
    hidden_dropout_prob : float
        Dropout probability applied to the encoder's hidden states.
    weights_name : str
        Name of the pretrained BERT weights to load.
    """
    def __init__(self,
            output_dim,
            hidden_dropout_prob,
            weights_name='bert-base-cased'):
        super(BertSequenceTaggingModel, self).__init__()

        self.output_dim = output_dim
        self.weights_name = weights_name

        # Graph
        self.bert = BertModel.from_pretrained(self.weights_name)
        self.embed_dim = self.bert.embeddings.word_embeddings.embedding_dim
        self.dropout = nn.Dropout(hidden_dropout_prob)
        self.classifier_layer = nn.Linear(self.embed_dim, output_dim)

        # init weights; `xavier_uniform_` is the in-place initializer, the
        # name without the trailing underscore is deprecated
        torch.nn.init.xavier_uniform_(self.classifier_layer.weight)
        if self.classifier_layer.bias is not None:
            self.classifier_layer.bias.data.zero_()

    def forward(self, X):
        """Here, `X` is a pair of tensors
        consisting of the token_ids (an index into the BERT embedding)
        and the attention_mask (a 1 or 0 indicating whether the token
        is masked). The `fit` method will
        train all these parameters against a softmax objective.

        Returns the per-token class logits.
        """
        # separates the indices from the mask
        indices, mask = X
        # Type conversion, since the tagger's fit() casts its inputs to a
        # FloatTensor, but we need Long for `bert`.
        indices = indices.long()

        # graph execution: take the first output (the token-level hidden
        # states) by index instead of unpacking exactly two values, so this
        # keeps working if the encoder returns a different number of outputs
        # or a dict-like output object
        final_hidden_states = self.bert(indices, attention_mask=mask)[0]

        h = self.dropout(final_hidden_states)

        logits = self.classifier_layer(h)

        return logits

In [96]:
class BertSequenceTagging(TorchShallowSequenceTagger):
    """
    Sequence tagger that fine-tunes BERT: same training and prediction code
    as `TorchShallowSequenceTagger`, but the graph is a
    `BertSequenceTaggingModel` and the inputs are token ids, not embeddings.
    """
    def __init__(self, weights_name, *args, **kwargs):
        # the tokenizer and the model are loaded from the same weights name
        self.weights_name = weights_name
        self.tokenizer = BertTokenizer.from_pretrained(self.weights_name)
        super(BertSequenceTagging, self).__init__(*args, **kwargs)

    def define_graph(self):
        """Builds the BERT-based graph; overrides the parent's method, which
        `fit` calls. Requires `self.class2index` (set in `fit`).

        Returns the model, switched to training mode.
        """
        self.num_classes = len(self.class2index)
        tagging_model = BertSequenceTaggingModel(
            weights_name=self.weights_name,
            hidden_dropout_prob=self.hidden_dropout_prob,
            output_dim=self.num_classes)
        tagging_model.train()
        return tagging_model

    def encode(self, X, max_length=None):
        """Tokenizes the list of strings `X` with the model's tokenizer.

        In fine-tuning we deal with token ids (indices), not embeddings.

        Returns
        -------
        [indices, mask]: two np.arrays built from the tokenizer's
        'input_ids' and 'attention_mask' for the padded batch.
        """
        encoded = self.tokenizer.batch_encode_plus(
            X,
            max_length=max_length,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True)

        return [np.array(encoded[key])
                for key in ('input_ids', 'attention_mask')]


Here's a self-contained illustration, starting from the raw data:

In [97]:
# Read the train and dev splits; each reader is materialized into a list of
# (sentence, labels) pairs, with the labels selected by atis.slot_filling_func.
hf_train = list(atis.train_reader(ATIS_HOME, class_func=atis.slot_filling_func))
hf_dev = list(atis.dev_reader(ATIS_HOME, class_func=atis.slot_filling_func))

# Split the pairs into parallel tuples of sentences and labels.
X_hf_sentence_train, y_train = zip(*hf_train)
X_hf_sentence_dev, y_dev = zip(*hf_dev)

Our model has some standard fine-tuning parameters:

In [98]:
# configures the sequence tagging layer
sequence_tagging_config = {
    "input_embedding_size": None, # not needed for fine-tuning
    "hidden_dropout_prob": 0.4,
    "class_weights": None,
    "batch_size": 32,
    "lr": 0.0002,   # eta
    "l2_strength": 0,
    "max_iter": 4,   # keep small during debug
    # use the GPU when there is one, otherwise fall back to the CPU
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}


# the tagger loads its tokenizer from the same weights name
hf_fine_tune_mod = BertSequenceTagging(
    'bert-base-cased', 
    config=sequence_tagging_config)

Now we can encode them; this step packs together the indices and mask information:

In [99]:
# Each result is a pair [token ids, attention mask] of numpy arrays,
# the input format expected by fit() and predict().
X_indices_mask_train = hf_fine_tune_mod.encode(X_hf_sentence_train)

X_indices_mask_dev = hf_fine_tune_mod.encode(X_hf_sentence_dev)

In [100]:
# Scratch check of the tensor casts used by fit(): a toy [embeddings, mask]
# pair of numpy arrays is converted to a FloatTensor and a bool tensor.
# NOTE: this rebinds the names X, x and m; nothing below depends on them.
X = [np.ones(shape=(2, 2, 5)), np.zeros(shape=(2, 2))]
x, m = X
X = [torch.FloatTensor(x),torch.tensor(m, dtype=torch.bool)]
X

[tensor([[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]],
 
         [[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]), tensor([[False, False],
         [False, False]])]

Training this model is resource intensive. Be patient – it will be worth the wait! (This experiment takes about 10 minutes on a machine with an NVIDIA RTX 2080 Max-Q GPU.)

In [101]:
# Align the word-level labels to the sub-word tokens (adds labels for
# [CLS]/[SEP] and 'X' for non-initial sub-tokens).
# NOTE: y_train and y_dev are rebound from word-level to token-level labels,
# so this cell is not idempotent -- re-run the cell that reads the data
# before running this one a second time.
y_train, _, _ =  sequence_tagging_label_aligner(X_hf_sentence_train, y_train)

y_dev, _, _ =  sequence_tagging_label_aligner(X_hf_sentence_dev, y_dev)

In [102]:
# fit() returns the model itself; assign it to `_` so that only the timing is shown.
%time _ = hf_fine_tune_mod.fit(X_indices_mask_train, y_train)

X: <class 'numpy.ndarray'>,     attention_mask: <class 'numpy.ndarray'>,    y: [4478, 16]
X: (4478, 63),    attention_mask: (4478, 63),    y: (4478, 63)
X: torch.Size([4478, 63]),    attention_mask: torch.Size([4478, 63]),    y: torch.Size([4478, 63])


Finished epoch 4 of 4; error is 4.129124558996409

CPU times: user 3min 30s, sys: 52.6 s, total: 4min 22s
Wall time: 4min 21s


Finally, some predictions on the dev set:

In [103]:
# predict() returns a (probs, preds, attention_mask) triple.
hf_fine_tune_preds = hf_fine_tune_mod.predict(X_indices_mask_dev)

In [104]:
# The report function prints the report itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
sequence_tagging_classification_report(y_dev, hf_fine_tune_preds, digits=3)

len(y_true): 500
preds.shape: (500, 47)
attention_mask.shape: (500, 47)
y_flat (flattened): 9348
preds_flat.shape (flattened): (23500,)
preds_flat.shape (masked): (9348,)
                              precision    recall  f1-score   support

             B-aircraft_code      1.000     1.000     1.000         1
              B-airline_code      0.900     1.000     0.947         9
              B-airline_name      1.000     1.000     1.000        62
              B-airport_code      1.000     0.750     0.857         4
              B-airport_name      1.000     1.000     1.000         4
 B-arrive_date.date_relative      1.000     0.500     0.667         2
      B-arrive_date.day_name      1.000     0.500     0.667        10
    B-arrive_date.day_number      1.000     1.000     1.000         4
    B-arrive_date.month_name      1.000     0.750     0.857         4
B-arrive_date.today_relative      0.000     0.000     0.000         1
      B-arrive_time.end_time      0.500     0.333     0.40