# SLU Joint Model

In [1]:
__author__ = "Adrian Sarno, Jennifer Arnold"
__version__ = "CS224u, Stanford, Spring 2020"

In [2]:
# Set all the random seeds for reproducibility. Only the
# system and torch seeds are relevant for this notebook.
import utils
utils.fix_random_seeds()

In [3]:
# Python imports
import os
import numpy as np
from sklearn.metrics import classification_report
import logging
# Silence everything below ERROR on the root logger (keeps library chatter
# out of the notebook output).
logger = logging.getLogger()
# Use setLevel() instead of assigning `.level` directly: setLevel validates
# the level and resets the logging module's cached isEnabledFor() results.
logger.setLevel(logging.ERROR)

In [4]:
# torch imports
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

In [5]:
# HuggingFace import
from transformers import BertTokenizer, BertModel, BertPreTrainedModel

In [6]:
# local imports
import atis
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNClassifierModel

In [7]:
# CUDA test: report the interpreter version, the torch version, whether a
# CUDA device is visible and, if so, the name of device 0.
import sys
import torch

cuda_available = torch.cuda.is_available()
print(sys.version)
print(torch.__version__, cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))

3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]
1.4.0 True
GeForce GTX 1080


In [8]:
# Name of the pre-trained HuggingFace BERT checkpoint shared by the tokenizer
# and the (frozen) featurization model below.
hf_weights_name = 'bert-base-cased'
# Alternative: hf_weights_name = 'bert-base-uncased'
# (with it the tokenizer splits words into sub-words less often)

# Tokenizer and encoder must be loaded from the same checkpoint name so the
# token ids produced by one index the vocabulary of the other.
hf_tokenizer = BertTokenizer.from_pretrained(hf_weights_name)
hf_model = BertModel.from_pretrained(hf_weights_name)

In [9]:
# Directory of the ATIS data, relative to the current working directory;
# passed to the `atis` readers below.
ATIS_HOME = os.path.join("data", "atis")

* * *

### Featurization

#### Batching (normalizing sentence lengths)

In [10]:
def batch_encoder_vectorizer(input_sentences, max_length=None):
    """
    Tokenize a batch of sentences and featurize them with the module-level
    BERT model (`hf_tokenizer` / `hf_model`).

    This function accomplishes two tasks:
    1.  tokenization and sentence-length normalization (padding)
    2.  featurization: it calls the BERT model to convert tokens to embeddings

    Parameters
    ----------
    input_sentences : iterable of str
        The raw sentences of the batch.
    max_length : int or None
        Forwarded to the tokenizer's `batch_encode_plus`.

    Returns
    -------
    final_hidden_states : np.array
        Output of the last BERT layer, one vector per (sub-word) token.
    attention_mask : np.array
        The tokenizer's attention mask for the padded batch.
    input_ids : np.array
        The tokenizer's token ids for the padded batch.
    """

    # tokenization, encoding and sentence-length normalization
    tokenizer_output = hf_tokenizer.batch_encode_plus(
        list(input_sentences),  # accept any iterable (e.g. the tuples produced by zip)
        max_length=max_length,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True)

    input_token_ids = torch.tensor(tokenizer_output['input_ids'])
    input_mask = torch.tensor(tokenizer_output['attention_mask'])

    # featurization (no gradients: the encoder is used as a frozen feature extractor)
    with torch.no_grad():
        model_outputs = hf_model(input_token_ids, attention_mask=input_mask)

    # The first output is the final hidden states. Index it instead of
    # tuple-unpacking: unpacking breaks when the model returns a dict-like
    # output object (iteration would then yield keys), and the pooled
    # [CLS] output is not used here anyway.
    final_hidden_states = model_outputs[0]

    # convert to numpy to match the type of all other results (all numpy)
    final_hidden_states = final_hidden_states.detach().cpu().numpy()

    return (final_hidden_states,
            np.array(tokenizer_output['attention_mask']),
            np.array(tokenizer_output['input_ids']))

* * *

#### Modeling

In [11]:
class TorchJointSluModel(nn.Module):
    """
    Shallow joint SLU head: two linear classifiers over the same (dropped-out)
    token embeddings.

    - the IOB (slot filling) classifier is applied to every token;
    - the intent classifier is applied to the mean of the token embeddings
      of each sentence.

    Parameters
    ----------
    embed_dim : int
        Size of the last dimension of the input embeddings.
    output_iob_dim : int
        Number of IOB classes (output size of the token classifier).
    output_intent_dim : int
        Number of intent classes (output size of the sentence classifier).
    dropout_prob : float
        Dropout probability applied to the input embeddings.
    """
    def __init__(self,
            embed_dim,
            output_iob_dim,
            output_intent_dim,
            dropout_prob):
        super(TorchJointSluModel, self).__init__()

        self.embed_dim = embed_dim
        self.output_iob_dim = output_iob_dim
        self.output_intent_dim = output_intent_dim

        # Graph
        self.dropout = nn.Dropout(dropout_prob)

        self.iob_classifier_layer = nn.Linear(embed_dim, output_iob_dim)
        self.init_weights(self.iob_classifier_layer)

        self.intent_classifier_layer = nn.Linear(embed_dim, output_intent_dim)
        self.init_weights(self.intent_classifier_layer)

    def init_weights(self, layer):
        """Xavier-uniform initialization of `layer.weight`; bias set to zero."""
        # `xavier_uniform` (no trailing underscore) is deprecated; the
        # in-place `xavier_uniform_` is the supported spelling.
        nn.init.xavier_uniform_(layer.weight)
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    def forward(self, X):
        """
        Parameters
        ----------
        X : pair (embeddings, attention_mask)
            `embeddings` is indexed (sentence, token, feature); the mask is
            accepted for interface compatibility but not used here.

        Returns
        -------
        logits_iob : one score vector per token
        logits_intent : one score vector per sentence
        """
        # separate the feature vectors (embeddings) from the attention_mask
        X, attention_mask = X

        X = self.dropout(X)

        logits_iob = self.iob_classifier_layer(X)

        # create single embedding per sentence: mean over the token dimension.
        # NOTE(review): the mean includes padding positions because the
        # attention mask is not applied here.
        sentence_X = X.mean(dim=1)

        logits_intent = self.intent_classifier_layer(sentence_X)

        return logits_iob, logits_intent


In [12]:
from torch_model_base import TorchModelBase
from utils import progress_bar

class TorchJointSlu(TorchModelBase):
    """
    Joint slot-filling (IOB token tagging) and intent-detection classifier
    trained on top of pre-computed token embeddings.

    Featurization:
        Takes the embeddings already pre-computed.

    Classification:
        The simplest token classifier uses just a linear layer.
        The Pytorch linear layer can take as input a tensor of any number of dimensions
        and only the last dimension needs to be specified as input dimension.

        https://pytorch.org/docs/master/nn.html#linear

    Parameters
    ----------
    config : dict
        Required keys: "input_embedding_size", "hidden_dropout_prob",
        "batch_size", "lr", "l2_strength", "max_iter", "device".
        Optional key: "class_weights" (one weight per class, see `compute_loss`).
    **kwargs
        Forwarded to `TorchModelBase.__init__`.

    Notes
    -----
    `fit` uses `self.optimizer`, `self.errors` and `self.dev_predictions`;
    these are presumably created by `TorchModelBase.__init__` (not defined
    in this class) -- TODO confirm.
    """
    def __init__(self, config, **kwargs):
        super(TorchJointSlu, self).__init__(**kwargs)

        self.config = config
        self.input_embedding_size = config["input_embedding_size"]
        self.hidden_dropout_prob = config["hidden_dropout_prob"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.l2_strength = config["l2_strength"]
        self.max_iter = config["max_iter"]
        self.device = config["device"]
        self.class_weights = config.get("class_weights", None)
        if self.class_weights is not None:
            # Keep the tensor on the instance: the loss needs a tensor, and a
            # conversion stored only in a local variable would be lost.
            self.class_weights = torch.as_tensor(
                self.class_weights, dtype=torch.float)

    def define_graph(self):
        """
        This is a shallow model, so it does not really define a graph here
        but it instantiates a model class with the classifier top.

        Requires `self.iob_class2index` and `self.intent_class2index`
        (set by `compute_class2index`, which `fit` calls first).
        """
        self.num_iob_classes = len(self.iob_class2index)   # class2index is set in fit()
        self.num_intent_classes = len(self.intent_class2index)   # class2index is set in fit()
        print(f"define_graph: num_iob_classes: {self.num_iob_classes}")
        print(f"define_graph: num_intent_classes: {self.num_intent_classes}")
        return TorchJointSluModel(
            self.input_embedding_size,
            self.num_iob_classes,
            self.num_intent_classes,
            self.hidden_dropout_prob)

    def compute_loss(self, logits, attention_mask, labels):
        """
        Cross-entropy loss between `logits` and `labels`.

        Parameters
        ----------
        logits : tensor whose last dimension indexes the classes
        attention_mask : tensor or None
            When given, positions where the mask is not 1 are excluded from
            the loss (their label is replaced by the loss's `ignore_index`).
        labels : long tensor of class indices, one per logits row

        Notes
        -----
        This method is called for both heads (IOB and intent), which have a
        different number of classes. `self.class_weights` is therefore only
        applied when its length equals the number of classes in `logits`;
        for the other head the loss is unweighted.
        """
        num_classes = logits.shape[-1]

        weight = None
        if self.class_weights is not None:
            candidate = torch.as_tensor(
                self.class_weights, dtype=torch.float, device=logits.device)
            if candidate.numel() == num_classes:
                weight = candidate

        loss_fct = CrossEntropyLoss(weight=weight)

        active_logits = logits.reshape(-1, num_classes)
        flat_labels = labels.reshape(-1)

        if attention_mask is not None:
            active_loss_mask = attention_mask.reshape(-1) == 1
            # padding positions get ignore_index so they do not contribute
            active_labels = torch.where(
                active_loss_mask,
                flat_labels,
                torch.full_like(flat_labels, loss_fct.ignore_index))
        else:
            active_labels = flat_labels

        # computes the loss between labels and logits
        return loss_fct(active_logits, active_labels)

    def fit(self, X, y, **kwargs):
        """Standard `fit` method.

        fit() expects embeddings in X and strings in y.
        The class itself is in charge of encoding the labels.

        Parameters
        ----------
        X : [embeddings, attention_mask]
        y : [y_iob, y_intent]
            y_iob is a list of lists of strings, e.g. [['O', 'B-fromcity']];
            y_intent is a list of strings (one per sentence).
        kwargs : dict
            For passing other parameters. If 'X_dev' is included,
            then dev predictions are stored every 10 epochs; use
            `dev_iter` to control this number.

        Returns
        -------
        self

        """

        ################################################################
        # Model definition
        ################################################################

        # Graph:
        if not hasattr(self, "model"):
            self.compute_class2index(y)  # expects strings, must run before tensorizing y
            self.model = self.define_graph()

        # Prime the model for training
        self.model.to(self.device)
        self.model.train()

        # Default is torch.optim.Adam
        optimizer = self.optimizer(
            self.model.parameters(),
            lr=self.lr,
            weight_decay=self.l2_strength)

        ################################################################
        # Data
        ################################################################
        # separate the feature vectors (embeddings) from the attention_mask
        X, attention_mask = X

        # Incremental performance: X_dev is an [embeddings, attention_mask]
        # pair that is passed to predict() as is (no dev loss is computed).
        X_dev = kwargs.get('X_dev')
        dev_iter = kwargs.get('dev_iter', 10)

        # separate the token labels from the sentence label
        y_iob, y_intent = y

        # encode labels (label vectorization). must run before tensorizing y
        y_iob = self.encode_iob_labels(y_iob)
        y_iob = self.pad_to_max_length(y_iob, X.shape[:2]) # y must have the shape of first 2 dims of X
        y_intent = self.encode_intent_labels(y_intent)

        # cast data into PyTorch tensors
        X = torch.as_tensor(X)
        attention_mask = torch.as_tensor(attention_mask, dtype=torch.bool)
        y_iob = torch.as_tensor(y_iob, dtype=torch.long)
        y_intent = torch.as_tensor(y_intent, dtype=torch.long)

        # Wrap data into a dataset and use a Dataloader for batching.
        # Pinned memory only makes sense when batches are copied to a GPU.
        use_cuda = torch.device(self.device).type == "cuda"
        dataset = torch.utils.data.TensorDataset(X, attention_mask, y_iob, y_intent)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size,
            shuffle=True, pin_memory=use_cuda)

        ################################################################
        # Training process (Gradient Descent)
        ################################################################
        max_iter = self.config["max_iter"]
        for iteration in range(1, max_iter + 1):
            epoch_error = 0.0
            for X_batch, m_batch, y_iob_batch, y_intent_batch in dataloader:
                # load the batch tensors into device memory
                X_batch = X_batch.to(self.device, non_blocking=True)
                m_batch = m_batch.to(self.device, non_blocking=True)
                y_iob_batch = y_iob_batch.to(self.device, non_blocking=True)
                y_intent_batch = y_intent_batch.to(self.device, non_blocking=True)

                # forward pass (the mask is only used by the loss)
                logits_iob, logits_intent = self.model([X_batch, m_batch])

                # joint loss: masked token-level loss + sentence-level loss
                err_iob = self.compute_loss(logits_iob, m_batch, y_iob_batch)
                err_intent = self.compute_loss(logits_intent, None, y_intent_batch)
                err = err_iob + err_intent
                epoch_error += err.item()

                # backprop pass
                optimizer.zero_grad()
                err.backward()
                optimizer.step()

            # Incremental predictions where possible:
            if X_dev is not None and iteration % dev_iter == 0:
                self.dev_predictions[iteration] = self.predict(X_dev)
                self.model.train()  # predict() switched the model to eval mode
            self.errors.append(epoch_error)
            progress_bar(
                "Finished epoch {} of {}; error is {}".format(
                    iteration, max_iter, epoch_error))
        return self

    def predict_flat(self, X):
        """Predicted classes for the examples in `X`. In flat format for metric functions.

        Parameters
        ----------
        X : [embeddings, attention_mask]

        Returns
        -------
        dict with keys:
            "probs_iob_flat": np.array(num_unmasked_tokens, num_iob_classes)
            "preds_iob_flat": np.array(num_unmasked_tokens)
            "probs_intent", "preds_intent", "attention_mask": as returned
            by `predict`.

        """

        predict_output = self.predict(X)

        probs_iob = predict_output["probs_iob"]
        preds_iob = predict_output["preds_iob"]
        attention_mask = predict_output["attention_mask"]

        # one boolean per (sentence, token) position, True for real tokens
        keep = np.asarray(attention_mask).reshape(-1) == 1

        # Flatten the first two dimensions only (keep the class dimension)
        # so that the mask selects whole probability rows.
        probs_iob_flat = probs_iob.reshape(-1, probs_iob.shape[-1])[keep]
        preds_iob_flat = preds_iob.reshape(-1)[keep]

        return {"probs_iob_flat": probs_iob_flat,
                "preds_iob_flat": preds_iob_flat,
                "probs_intent": predict_output["probs_intent"],
                "preds_intent": predict_output["preds_intent"],
                "attention_mask": attention_mask}

    def predict(self, X):
        """Predicted classes for the examples in `X`.

        Parameters
        ----------
        X : [embeddings, attention_mask]
            The mask is not used to compute the predictions; it is returned
            unchanged in the result so callers can filter padding positions.

        Returns
        -------
        dict with keys:
            "probs_iob": np.array(batch_size, max_sequence_length, num_iob_classes)
            "preds_iob": np.array(batch_size, max_sequence_length) of class names
            "probs_intent": np.array(batch_size, num_intent_classes)
            "preds_intent": np.array(batch_size) of class names
            "attention_mask": the mask received in `X`

        """
        # get the attention_mask to return it
        _, attention_mask = X

        # compute probabilities
        probs_iob, probs_intent = self.predict_proba(X)

        # compute predicted class, maximizing across the last dimension (classes)
        pred_iob_class_idx = probs_iob.argmax(dim=-1).detach().cpu().numpy()
        pred_intent_class_idx = probs_intent.argmax(dim=-1).detach().cpu().numpy()

        # decode the class indices to class names
        preds_iob = np.array(
            [[self.iob_index2class[i] for i in row_class_idx]
             for row_class_idx in pred_iob_class_idx])
        preds_intent = np.array(
            [self.intent_index2class[i] for i in pred_intent_class_idx])

        # detach prob tensors
        probs_iob = probs_iob.detach().cpu().numpy()
        probs_intent = probs_intent.detach().cpu().numpy()

        return {"probs_iob": probs_iob,
                "preds_iob": preds_iob,
                "probs_intent": probs_intent,
                "preds_intent": preds_intent,
                "attention_mask": attention_mask}

    def predict_proba(self, X):
        """Predicted probabilities for the examples in `X`.

        Parameters
        ----------
        X : [embeddings, attention_mask]

        Returns
        -------
        probs_iob : torch.tensor(batch_size, max_sequence_length, num_iob_classes)
        probs_intent : torch.tensor(batch_size, num_intent_classes)

        """

        # Graph:
        if not hasattr(self, "model"):
            # the class2index mappings must be defined in this case
            self.model = self.define_graph()

        # prime the model for prediction-only mode
        self.model.eval()
        with torch.no_grad():
            self.model.to(self.device)

            # cast input data into PyTorch tensors on the model's device
            x, attention_mask = X
            x = torch.as_tensor(x).to(self.device)
            attention_mask = torch.as_tensor(
                attention_mask, dtype=torch.bool).to(self.device)

            # forward pass
            logits_iob, logits_intent = self.model([x, attention_mask])

            # normalize the scores along the last dimension (classes)
            probs_iob = torch.softmax(logits_iob, dim=-1)
            probs_intent = torch.softmax(logits_intent, dim=-1)

            return probs_iob, probs_intent  # tensors (no_grad)

    def compute_class2index(self, y):
        """
        Build the class-name <-> index mappings for both heads.

        y: [y_iob, y_intent]
            y_iob is a 2-D list of (lists of) strings (iob_tags, not indices);
            y_intent is a 1-D list of strings.

        expects strings, must run before tensorizing y,
        must run before defining the graph because it determines
        the output sizes of the network (number of classes)

        Note:
        if the input type is incorrect and it results in a number of classes that is
        incorrect (very high) it will likely cause a CUDA-OUT-OF-MEMORY error
        """
        # separate the token labels from the sentence label
        y_iob, y_intent = y

        # create iob mapping (sorted so the indices are deterministic)
        iob_classes = sorted({tag for y_row in y_iob for tag in y_row})
        self.iob_class2index = {c: i for i, c in enumerate(iob_classes)}
        self.iob_index2class = {i: c for c, i in self.iob_class2index.items()}

        # create intent mapping
        intent_classes = sorted(set(y_intent))
        self.intent_class2index = {c: i for i, c in enumerate(intent_classes)}
        self.intent_index2class = {i: c for c, i in self.intent_class2index.items()}

    def encode_intent_labels(self, y_intent):
        """
        y_intent: 1-D list of strings (intent names)

        expects strings, must run before tensorizing y.
        Raises KeyError for a label not seen by `compute_class2index`.
        """
        return [self.intent_class2index[label] for label in y_intent]

    def encode_iob_labels(self, y):
        """
        y: 2-D list of (lists of) strings (iob_tags, not indices)

        expects strings, must run before tensorizing y.
        Raises KeyError for a tag not seen by `compute_class2index`.
        """
        return [[self.iob_class2index[iob_tag] for iob_tag in iob_tags]
                for iob_tags in y]

    def pad_to_max_length(self, jagged_matrix, output_shape):
        """
        Right-pad the rows of `jagged_matrix` with 0 into an integer array
        of shape `output_shape`.

        The padding value 0 is also a valid class index; `fit` relies on the
        attention mask (see `compute_loss`) to exclude padded positions.

        Raises ValueError when a row is longer than the target width.
        """
        output_shape = tuple(output_shape)
        padded_matrix = np.zeros(shape=output_shape, dtype=np.int64)
        for i, row in enumerate(jagged_matrix):
            if len(row) > output_shape[1]:
                raise ValueError(
                    f"row {i} has {len(row)} labels but the padded width is "
                    f"{output_shape[1]}")
            padded_matrix[i, :len(row)] = row
        return padded_matrix

#### Batch Training and Prediction of the Shallow Slot Filling model (the shallow model uses BERT embeddings without fine-tuning)
* * *

### Labelling: Alignment, Encoding, Normalize the length of the sequences, Class Weights

#### Compute class weights

In [13]:
def compute_class_weights(class_ids):
    """
    Compute one weight per class: the fraction of examples that do NOT
    belong to that class (so rarer classes get larger weights).

    Parameters
    ----------
    class_ids : 1-D array-like of non-negative ints
        Contains one class_id for each example.

    Returns
    -------
    class_weights : np.array(max(class_ids) + 1)
        `class_weights[c]` is the fraction of examples whose class is not `c`.
        (The fraction of examples that are in class `c` is `1 - class_weights[c]`.)
        An empty input yields an empty array.
    """
    class_ids = np.asarray(class_ids, dtype=np.int64).reshape(-1)
    num_examples = class_ids.shape[0]
    if num_examples == 0:
        return np.zeros(0)

    # encode the class_ids as onehot: one row per example, one column per
    # class (ids are 0-based, so max id + 1 columns are needed)
    num_classes = int(class_ids.max()) + 1
    class_matrix = np.zeros(shape=(num_examples, num_classes))
    class_matrix[np.arange(num_examples), class_ids] = 1

    # weight of each class (column) = fraction of negative labels (0)
    class_weights = np.sum(class_matrix == 0, axis=0) / num_examples

    return class_weights

#### Label and Sub-token Alignment (call WordPiece for each token, output word_to_tok_map and aligned_labels)

In [14]:
def word_start_label_aligner(sentence, word_labels=None):
    """
    Aligns the IOB labels to the word-starting tokens in the list of
    sub-word tokens returned by the WordPiece tokenizer (`hf_tokenizer`).

    Parameters
    ----------
    sentence : str
        Whitespace-separated words.
    word_labels : list of str or None
        One label per word. When None, no labels are produced.

    Returns
    -------
    token_labels : list of str or None
        One label per sub-token, including "O" for [CLS] and [SEP]: the first
        sub-token of every word gets the word's label, the rest get the
        padding label 'X'. None when `word_labels` is None.
    word_start_indices : list of int
        For each word, the index in `tokens` of its first sub-token.
    tokens : list of str
        "[CLS]", the sub-tokens of every word, "[SEP]".
    """
    # Split once, on any whitespace run, so that the length check and the
    # loop agree and repeated spaces do not produce empty "words" that would
    # shift the labels.
    words = sentence.split()
    has_labels = word_labels is not None

    word_start_indices = []
    tokens = ["[CLS]"]
    token_labels = ["O"] if has_labels else None

    if has_labels and len(words) != len(word_labels):
        # diagnostic only; a shortage of labels still fails in the loop below
        print(f"sentence: {len(words)}, word_labels: {len(word_labels)}")
        print(f"sentence: {words}, word_labels: {word_labels}")

    for word_idx, word in enumerate(words):
        word_start_indices.append(len(tokens))
        word_tokens = hf_tokenizer.tokenize(word)  # tokenize ONE word
        tokens.extend(word_tokens)
        if has_labels:
            token_labels.append(word_labels[word_idx])
            if len(word_tokens) > 1:
                token_labels.extend(["X"] * (len(word_tokens) - 1))

    tokens.append("[SEP]")
    if has_labels:
        token_labels.append("O")
    return token_labels, word_start_indices, tokens

In [15]:
def sequence_tagging_label_aligner(sentences, labels):
    """
    Run `word_start_label_aligner` on every (sentence, word_labels) pair.

    Returns three parallel lists, with one entry per sentence:
    the aligned token labels, the word-start indices and the tokens.
    """
    aligned_rows = [
        word_start_label_aligner(sentence, word_labels)
        for sentence, word_labels in zip(sentences, labels)
    ]

    label_matrix = [row[0] for row in aligned_rows]
    word_start_matrix = [row[1] for row in aligned_rows]
    token_matrix = [row[2] for row in aligned_rows]

    return label_matrix, word_start_matrix, token_matrix

### Metrics

In [16]:
def flatten_predict_output(y_true, preds, attention_mask):
    """
    Flatten gold labels and predictions so they can be fed to a
    classification report for sequence tagging.

    y_true : list of list of strings (IOB_tags)
        Variable-length lists, token-aligned but not length-padded.

    preds : np.array(batch_size, max_sentence_length)
        Class predicted by the model for each token of each sentence
        (padding positions included).

    attention_mask : np.array(batch_size, max_sentence_length)
        Positions equal to 1 are kept; all others (padding) are dropped
        from `preds`.

    Returns the pair (flat gold labels, flat masked predictions).
    Shapes are printed along the way for debugging.
    """

    print(f"len(y_true): {len(y_true)}")
    print(f"preds.shape: {preds.shape}")
    print(f"attention_mask.shape: {attention_mask.shape}")

    # gold labels: concatenate the per-sentence lists
    gold_flat = [tag for sentence_tags in y_true for tag in sentence_tags]
    print(f"y_flat (flattened): {len(gold_flat)}")

    # predictions: flatten first, then drop the padding positions
    all_positions = preds.flatten()
    print(f"preds_flat.shape (flattened): {all_positions.shape}")
    is_real_token = attention_mask.flatten() == 1
    kept_positions = all_positions[is_real_token]
    print(f"preds_flat.shape (masked): {kept_positions.shape}")

    return gold_flat, kept_positions

In [17]:
def sequence_tagging_classification_report(y, predict_output, digits=3):
    """
    Print a per-class classification report for sequence tagging.

    y : list of list of strings (IOB_tags)
        Token-aligned gold labels, variable length (not padded).

    predict_output : sequence whose first three items are
        (probs, preds, attention_mask); `preds` and `attention_mask` are
        (batch_size, max_sentence_length) arrays. The mask is used to drop
        padding positions from `preds`.

    digits : number of decimals shown in the report.
    """
    token_probs, token_preds, pad_mask = predict_output[:3]
    for label, array in (("probs", token_probs),
                         ("preds", token_preds),
                         ("attention_mask", pad_mask)):
        print(f"cr-{label}: {array.shape}")

    gold_flat, predicted_flat = flatten_predict_output(y, token_preds, pad_mask)

    report = classification_report(gold_flat, predicted_flat, digits=digits)
    print(report)


In [18]:
def sequence_tagging_macro_f1(y, predict_output, digits=3):
    """
    Macro-F1 for sequence tagging, computed with `utils.safe_macro_f1`
    on the flattened, padding-free labels and predictions.

    y : list of list of strings (IOB_tags)
        Token-aligned gold labels, variable length (not padded).

    predict_output : sequence whose first three items are
        (probs, preds, attention_mask); `preds` and `attention_mask` are
        (batch_size, max_sentence_length) arrays.

    digits : accepted for signature parity with the report function;
        not used here.
    """
    _probs, token_preds, pad_mask = predict_output[:3]

    gold_flat, predicted_flat = flatten_predict_output(y, token_preds, pad_mask)

    score = utils.safe_macro_f1(gold_flat, predicted_flat)
    return score

# Experiment with Shallow SLU Joint Model

* * *

In [19]:
def evaluate_shallow_slu_joint_model(
    X_train, 
    y_train,
    X_dev, 
    y_dev):
    """
    Train a `TorchJointSlu` model from scratch and print its dev metrics
    for both tasks (slot filling and intent detection).

    fit() expects embeddings in X and strings in y.
    The class itself is in charge of encoding (a.k.a. 'vectorizing') the labels.

    Parameters
    ----------
    X_train, X_dev : pair (embeddings, attention_mask)
    y_train, y_dev : pair (iob_tags_matrix, intents)
        iob_tags_matrix is a list of lists of strings, intents a sequence
        of strings.
    """

    # configure the joint model
    joint_config = {
        "input_embedding_size": 768,
        "hidden_dropout_prob": 0.4,
        "objective_weights": None,  # loss penalizes more SF or ID
        "class_weights": None,
        "batch_size": 32,
        "lr": 1e-3,
        "l2_strength": 0,
        "max_iter": 10,
        # fall back to the CPU when no GPU is available instead of failing
        "device": "cuda" if torch.cuda.is_available() else "cpu"
    }
    
    # Shallow network
    joint_model = TorchJointSlu(joint_config)   

    # unpack train data
    X_train, m_train = X_train
    y_iob_tags_matrix_train, y_intent_train = y_train

    # unpack eval data
    X_dev, m_dev = X_dev
    y_iob_tags_matrix_dev, y_intent_dev = y_dev
    
    # Fit
    # we need to pass embeddings in X and strings in y
    joint_model.fit(
        X=[X_train, m_train],
        y=[y_iob_tags_matrix_train, y_intent_train],
        X_dev=[X_dev, m_dev],
        y_dev=[y_iob_tags_matrix_dev, y_intent_dev]
    )

    # Predict
    predict_output = joint_model.predict(
                        X=[X_dev, m_dev])
    
    probs_iob = predict_output["probs_iob"]
    preds_iob = predict_output["preds_iob"]
    attention_mask = predict_output["attention_mask"]
    
    # slot filling (token-level) metrics
    sequence_tagging_classification_report(y_iob_tags_matrix_dev, (probs_iob, preds_iob, attention_mask))
    print("SF NON-VERBOSE MACRO-F1:", sequence_tagging_macro_f1(y_iob_tags_matrix_dev, 
                                                             (probs_iob, preds_iob, attention_mask)))
    
    # intent (sentence-level) metrics
    print("\n\n\nINTENT:")
    print(classification_report(y_intent_dev, predict_output["preds_intent"]))
    print("INTENT NON-VERBOSE MACRO-F1:", utils.safe_macro_f1(y_intent_dev,  predict_output["preds_intent"]))

In [20]:
def read_dataset():
    """
    Load the ATIS train and dev splits and turn them into model inputs.

    For each split: read (sentence, (iob_tags, intent)) examples, compute
    BERT token embeddings and attention masks for the sentences, and align
    the word-level IOB tags to the sub-word tokens.

    Returns
    -------
    X_train, X_dev : pair (final_hidden_states, attention_mask)
    y_train, y_dev : pair (aligned iob tag lists, intents)
    """
    # read ATIS
    label_func = atis.slot_filling_and_intent_func
    train_examples = list(atis.train_reader(ATIS_HOME, class_func=label_func))
    dev_examples = list(atis.dev_reader(ATIS_HOME, class_func=label_func))

    # split sentences from labels, then iob tags from intents
    sentences_train, labels_train = zip(*train_examples)
    sentences_dev, labels_dev = zip(*dev_examples)

    word_tags_train, intents_train = zip(*labels_train)
    word_tags_dev, intents_dev = zip(*labels_dev)

    # token-level representations for every sentence (token ids not needed)
    hidden_train, mask_train, _ids_train = batch_encoder_vectorizer(sentences_train)
    hidden_dev, mask_dev, _ids_dev = batch_encoder_vectorizer(sentences_dev)

    # align the labels to the sub-word tokens (only the labels are kept)
    token_tags_train = sequence_tagging_label_aligner(sentences_train, word_tags_train)[0]
    token_tags_dev = sequence_tagging_label_aligner(sentences_dev, word_tags_dev)[0]

    # package data
    return (
        (hidden_train, mask_train),
        (hidden_dev, mask_dev),
        (token_tags_train, intents_train),
        (token_tags_dev, intents_dev),
    )

In [21]:
def experiment_shallow_slu_joint_model():
    """Load the dataset, then train and evaluate the shallow joint SLU model."""
    train_inputs, dev_inputs, train_labels, dev_labels = read_dataset()

    evaluate_shallow_slu_joint_model(
        train_inputs, train_labels, dev_inputs, dev_labels)

In [22]:
# Run the shallow joint-model experiment end to end: read the dataset,
# train, and print the dev-set reports.
experiment_shallow_slu_joint_model()

define_graph: num_iob_classes: 121
define_graph: num_intent_classes: 21


Finished epoch 10 of 10; error is 50.60976545512676

cr-probs: (500, 47, 121)
cr-preds: (500, 47)
cr-attention_mask: (500, 47)
len(y_true): 500
preds.shape: (500, 47)
attention_mask.shape: (500, 47)
y_flat (flattened): 9348
preds_flat.shape (flattened): (23500,)
preds_flat.shape (masked): (9348,)
                              precision    recall  f1-score   support

             B-aircraft_code      0.000     0.000     0.000         1
              B-airline_code      1.000     0.667     0.800         9
              B-airline_name      0.968     0.968     0.968        62
              B-airport_code      1.000     0.750     0.857         4
              B-airport_name      0.000     0.000     0.000         4
 B-arrive_date.date_relative      0.000     0.000     0.000         2
      B-arrive_date.day_name      0.000     0.000     0.000        10
    B-arrive_date.day_number      0.000     0.000     0.000         4
    B-arrive_date.month_name      0.000     0.000     0.000         4
B-arrive_date.today_relative      0.000     0.000     

# Fine Tuning

In [23]:
class BertJointSluModel(nn.Module):
    """
    Joint SLU model for fine-tuning: a BERT encoder followed by two linear
    classifiers, one per token (IOB tags) and one per sentence (intent).

    Parameters
    ----------
    output_iob_dim : int
        Number of IOB classes.
    output_intent_dim : int
        Number of intent classes.
    dropout_prob : float
        Dropout probability applied to the BERT hidden states.
    weights_name : str
        Name of the pre-trained BERT weights to load.
    """
    def __init__(self,
            output_iob_dim,
            output_intent_dim,
            dropout_prob,
            weights_name='bert-base-cased'):
        super(BertJointSluModel, self).__init__()

        self.weights_name = weights_name
        self.output_iob_dim = output_iob_dim
        self.output_intent_dim = output_intent_dim

        # Graph
        self.bert = BertModel.from_pretrained(self.weights_name)
        # size of the vectors in the final hidden states, which feed the
        # classifiers below
        self.embed_dim = self.bert.config.hidden_size

        self.dropout = nn.Dropout(dropout_prob)

        self.iob_classifier_layer = nn.Linear(self.embed_dim, output_iob_dim)
        self.init_weights(self.iob_classifier_layer)

        self.intent_classifier_layer = nn.Linear(self.embed_dim, output_intent_dim)
        self.init_weights(self.intent_classifier_layer)

    def init_weights(self, layer):
        """Xavier-uniform initialization of `layer.weight`; bias set to zero."""
        # `xavier_uniform` (no trailing underscore) is deprecated; the
        # in-place `xavier_uniform_` is the supported spelling.
        nn.init.xavier_uniform_(layer.weight)
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    def forward(self, X):
        """Here, `X` is a pair consisting of the token_ids (indices into the
        BERT embedding) and the attention_mask (1 for real tokens, 0 for
        padding). The `fit` method will train all these parameters against
        a softmax objective.

        Returns
        -------
        logits_iob : one score vector per token
        logits_intent : one score vector per sentence
        """
        # separates the indices from the mask
        indices, mask = X
        # BERT needs integer (Long) token ids, whatever type the caller
        # produced.
        indices = indices.long()

        # graph execution. The first output is the final hidden states;
        # index it instead of tuple-unpacking, which breaks when the model
        # returns a dict-like output object. The pooled [CLS] output is not
        # used.
        bert_outputs = self.bert(indices, attention_mask=mask)
        final_hidden_states = bert_outputs[0]

        h = self.dropout(final_hidden_states)

        logits_iob = self.iob_classifier_layer(h)

        # create single embedding per sentence: mean over the token dimension.
        # NOTE(review): the mean includes padding positions because the mask
        # is not applied here.
        sentence_h = h.mean(dim=1)

        logits_intent = self.intent_classifier_layer(sentence_h)

        return logits_iob, logits_intent

In [24]:
class BertJointSlu(TorchJointSlu):
    """`TorchJointSlu` variant whose graph is a `BertJointSluModel`.

    The instance keeps the name of the pretrained weights and a matching
    `BertTokenizer`, which `encode` uses to turn raw sentences into
    token ids and attention masks.

    """
    def __init__(self, weights_name, *args, **kwargs):
        # Both attributes are set before the base-class constructor runs.
        self.weights_name = weights_name
        self.tokenizer = BertTokenizer.from_pretrained(weights_name)
        super().__init__(*args, **kwargs)

    def define_graph(self):
        """Build the BERT-based graph; overrides the version used by `fit`.

        Returns
        -------
        BertJointSluModel, switched to training mode.

        """
        # The class2index mappings are expected to have been set in fit().
        self.num_iob_classes = len(self.iob_class2index)
        self.num_intent_classes = len(self.intent_class2index)
        print(f"define_graph: num_iob_classes: {self.num_iob_classes}")
        print(f"define_graph: num_intent_classes: {self.num_intent_classes}")

        graph_kwargs = dict(
            output_iob_dim=self.num_iob_classes,
            output_intent_dim=self.num_intent_classes,
            dropout_prob=self.hidden_dropout_prob,
            weights_name=self.weights_name,
        )
        graph = BertJointSluModel(**graph_kwargs)
        graph.train()  # training-mode flag
        return graph

    def encode(self, X, max_length=None):
        """Tokenize the strings in `X` with the model's tokenizer.

        Special tokens are added and the sequences are padded.

        Parameters
        ----------
        X : list of str
        max_length : int or None
            Forwarded unchanged to the tokenizer.

        Returns
        -------
        list `[indices, mask]` of two np.array: the token ids and the
        attention mask produced by the tokenizer.

        """
        # Fine-tuning works on token ids (indices), not on embeddings.
        encoded = self.tokenizer.batch_encode_plus(
            X,
            max_length=max_length,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
        )
        return [np.array(encoded[field])
                for field in ('input_ids', 'attention_mask')]

Here's a self-contained illustration, starting from the raw data:

In [28]:
def read_dataset_for_fine_tunning(hf_fine_tune_mod):
    """Load the ATIS train/dev splits and prepare them for fine-tuning.

    Parameters
    ----------
    hf_fine_tune_mod : object with an `encode(sentences)` method
        Used to turn the raw sentences into model inputs.

    Returns
    -------
    X_train, X_dev
        Whatever `hf_fine_tune_mod.encode` returns for each split.
    y_train, y_dev
        Pairs `(aligned IOB label matrix, intent labels)` per split.

    """
    label_func = atis.slot_filling_and_intent_func

    # 1. Read both splits as lists of (sentence, labels) examples.
    train_examples = list(atis.train_reader(ATIS_HOME, class_func=label_func))
    dev_examples = list(atis.dev_reader(ATIS_HOME, class_func=label_func))

    # 2. Unzip sentences from labels, then IOB tags from intents.
    sentences_train, labels_train = zip(*train_examples)
    sentences_dev, labels_dev = zip(*dev_examples)
    iob_tags_train, intent_train = zip(*labels_train)
    iob_tags_dev, intent_dev = zip(*labels_dev)

    # 3. Align the IOB labels to the sub-word tokens; only the first of
    #    the three values returned by the aligner is kept.
    iob_matrix_train, _, _ = sequence_tagging_label_aligner(
        sentences_train, iob_tags_train)
    iob_matrix_dev, _, _ = sequence_tagging_label_aligner(
        sentences_dev, iob_tags_dev)

    # 4. Encode the sentences into model inputs.
    X_train = hf_fine_tune_mod.encode(sentences_train)
    X_dev = hf_fine_tune_mod.encode(sentences_dev)

    # 5. Package inputs and label pairs.
    return (
        X_train,
        X_dev,
        (iob_matrix_train, intent_train),
        (iob_matrix_dev, intent_dev),
    )

Our model has some standard fine-tuning parameters:

In [34]:
# Configuration handed to `BertJointSlu` for fine-tuning
joint_slu_config = {
    "input_embedding_size": None, # not needed for fine-tuning
    "hidden_dropout_prob": 0.4,
    "class_weights": None,
    "batch_size": 32,
    "lr": 0.0002,   # eta
    "l2_strength": 0,
    "max_iter": 8,   # keep small during debug
    # Use the GPU when one is present, otherwise fall back to the CPU so
    # the notebook still runs on machines without CUDA.
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}


hf_fine_tune_mod = BertJointSlu(
    'bert-base-cased', 
    config=joint_slu_config)

In [35]:
# Encoded train/dev inputs plus their (IOB matrix, intent) label pairs.
(X_train, X_dev, y_train, y_dev) = read_dataset_for_fine_tunning(
    hf_fine_tune_mod
)

In [36]:
# Fine-tune on the training split. `%time` reports how long `fit` takes;
# its return value is discarded into `_`.
%time _ = hf_fine_tune_mod.fit(X_train, y_train)

define_graph: num_iob_classes: 121
define_graph: num_intent_classes: 21


Finished epoch 8 of 8; error is 31.448452878743413

CPU times: user 9min 23s, sys: 2min 26s, total: 11min 49s
Wall time: 11min 48s


Finally, some predictions on the dev set:

In [37]:
# Run the fine-tuned model on the dev inputs. The result is read by
# string key afterwards ("probs_iob", "preds_iob", "attention_mask",
# "preds_intent").
predict_output = hf_fine_tune_mod.predict(X_dev)

In [38]:
# Slot-filling (IOB) outputs from the prediction dict.
probs_iob, preds_iob, attention_mask = (
    predict_output[key]
    for key in ("probs_iob", "preds_iob", "attention_mask")
)

# Gold labels for the dev split: aligned IOB matrix and intents.
y_iob_tags_matrix_dev, y_intent_dev = y_dev

# Both slot-filling metrics take the same (probs, preds, mask) triple.
iob_outputs = (probs_iob, preds_iob, attention_mask)
sequence_tagging_classification_report(y_iob_tags_matrix_dev, iob_outputs)
slot_macro_f1 = sequence_tagging_macro_f1(y_iob_tags_matrix_dev, iob_outputs)
print("SF NON-VERBOSE MACRO-F1:", slot_macro_f1)


# Intent classification report and macro-F1.
print("\n\n\nINTENT:")
preds_intent = predict_output["preds_intent"]
print(classification_report(y_intent_dev, preds_intent))
intent_macro_f1 = utils.safe_macro_f1(y_intent_dev, preds_intent)
print("INTENT NON-VERBOSE MACRO-F1:", intent_macro_f1)

cr-probs: (500, 47, 121)
cr-preds: (500, 47)
cr-attention_mask: (500, 47)
len(y_true): 500
preds.shape: (500, 47)
attention_mask.shape: (500, 47)
y_flat (flattened): 9348
preds_flat.shape (flattened): (23500,)
preds_flat.shape (masked): (9348,)
                              precision    recall  f1-score   support

             B-aircraft_code      1.000     1.000     1.000         1
              B-airline_code      0.778     0.778     0.778         9
              B-airline_name      0.984     0.984     0.984        62
              B-airport_code      1.000     0.750     0.857         4
              B-airport_name      0.333     0.500     0.400         4
 B-arrive_date.date_relative      0.333     0.500     0.400         2
      B-arrive_date.day_name      0.857     0.600     0.706        10
    B-arrive_date.day_number      1.000     0.500     0.667         4
    B-arrive_date.month_name      1.000     1.000     1.000         4
B-arrive_date.today_relative      0.000     0.000     