# SLU Models -- Intent Determination (ATIS Dataset)

In [1]:
__author__ = "Adrian Sarno, Jennifer Arnold"
__version__ = "CS224u, Stanford, Spring 2020"

In [2]:
# Set all the random seeds for reproducibility. Only the
# system and torch seeds are relevant for this notebook.
import utils
utils.fix_random_seeds()

In [3]:
import sys; print(sys.version)
import torch; print(torch.__version__, torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
1.4.0 False


In [4]:
import os
import numpy as np
import torch.nn as nn
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNClassifierModel
from torch_rnn_classifier import TorchRNNClassifier
from sklearn.metrics import classification_report
from transformers import BertModel, BertTokenizer
import logging
logger = logging.getLogger()
logger.level = logging.ERROR

import atis

In [5]:
hf_weights_name = 'bert-base-cased'

hf_tokenizer = BertTokenizer.from_pretrained(hf_weights_name)
hf_model = BertModel.from_pretrained(hf_weights_name)

### Tokenization Example (Wordpiece tokenizer)

In [16]:
example_texts = [
    "1000",
    "1,000,000.00",
    "night flight from oakland to new york"]

for e in example_texts:
    ids = hf_tokenizer.encode(e, add_special_tokens=True)
    print(ids, hf_tokenizer.convert_ids_to_tokens(ids))

[101, 6087, 102] ['[CLS]', '1000', '[SEP]']
[101, 122, 117, 1288, 117, 1288, 119, 3135, 102] ['[CLS]', '1', ',', '000', ',', '000', '.', '00', '[SEP]']
[101, 1480, 3043, 1121, 8760, 1931, 1106, 1207, 26063, 4661, 102] ['[CLS]', 'night', 'flight', 'from', 'oak', '##land', 'to', 'new', 'yo', '##rk', '[SEP]']


# Utility Functions

### Featurization 

In [None]:
def bert_single_sentence_featurizer(bert_tokenizer, bert_model, sentence):
    # we tokenize a single sentence with encode(), so no need for mask
    input_ids = bert_tokenizer.encode(sentence, add_special_tokens=True)
    X = torch.tensor([input_ids])
    with torch.no_grad():
        final_hidden_states, cls_output = bert_model(X)
        return final_hidden_states.squeeze(0).numpy()    # squeeze batch dimension ( the batch size is 1)

In [6]:
def bert_single_sentence_featurizer(sentence):
    s = sentence
    input_ids = hf_tokenizer.encode(s, add_special_tokens=True)
    X = torch.tensor([input_ids])
    with torch.no_grad():
        final_hidden_states, cls_output = hf_model(X)
        return final_hidden_states.squeeze(0).numpy() 

In [7]:
def bert_single_sentence_featurizer_for_classification(sentence):
    reps = bert_single_sentence_featurizer(sentence)
    reps = reps.mean(axis=0)
    return reps   

### Batching (normalizing sentence lengths)

In [None]:
input_sentences = np.zeros(shape=4013)
for k in range(0, len(input_sentences)-1000, 1000):
    batch_sentences = input_sentences[k:k+1000]
    print(len(batch_sentences))

In [None]:
def classifier_featurizer(hf_tokenizer, hf_model, input_sentences):
    states_list = []
    print(len(input_sentences))
    for k in range(0, len(input_sentences)-1000, 1000):
        batch_sentences = input_sentences[k:k+1000]
        print(len(batch_sentences))
        input_token_ids, _, final_hidden_states_avg, _ =\
        bert_batch_featurizer(hf_tokenizer, hf_model, batch_sentences)
        states_list.append(final_hidden_states_avg)
        print(final_hidden_states_avg.shape)
    return torch.cat(states_list, dim=0)

In [8]:
def bert_batch_featurizer(hf_tokenizer, hf_model, input_sentences):
    tokenizer_output = hf_tokenizer.batch_encode_plus(
        input_sentences, 
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True)
    
    X_input_token_ids = torch.tensor(tokenizer_output['input_ids'])
    X_input_mask = torch.tensor(tokenizer_output['attention_mask'])
    with torch.no_grad():    
        final_hidden_states, cls_output = hf_model(
            X_input_token_ids, attention_mask=X_input_mask)        
        
    # get a sentence representation for the classifier
    row_list = []
    for row_hidden_states, row_mask in zip(final_hidden_states, X_input_mask):
        row_hidden_states_avg = row_hidden_states[row_mask == 1].mean(axis=0)
        row_list.append(row_hidden_states_avg)
    final_hidden_states_avg = torch.stack(row_list, dim=0)
        
    return tokenizer_output, final_hidden_states, final_hidden_states_avg, cls_output

In [None]:
input_token_ids, final_hidden_states, final_hidden_states_avg, cls_output =\
bert_batch_featurizer(hf_tokenizer, hf_model, example_texts)

print("tables returned: {},  sentences(rows): {},  tokens(columns): {}".format(len(input_token_ids), len(input_token_ids['input_ids']), len(input_token_ids['input_ids'][0])))
print(input_token_ids.keys())
print('input_token_ids:', input_token_ids['input_ids'][0])
print('attention_mask:', input_token_ids['attention_mask'][0])
final_hidden_states.shape, cls_output.shape

print(final_hidden_states_avg.shape)

# SLU Models
* * *

## Intent Determination

### Data Loading

In [9]:
ATIS_HOME = os.path.join("data", "atis")

Prepare dataset (already split into train, dev, test)

In [10]:
atis_train = list(atis.train_reader(ATIS_HOME, class_func=atis.intent_class_func))
atis_dev = list(atis.dev_reader(ATIS_HOME, class_func=atis.intent_class_func))

In [11]:
atis_train[5]

("i'm looking for a flight from charlotte to las vegas that stops in st. louis hopefully a dinner flight how can i find that out",
 'atis_flight')

Separate sentences and labels into separate lists

In [12]:
X_atis_sentences_train, y_atis_train = zip(*atis_train)
X_atis_sentences_dev, y_atis_dev = zip(*atis_dev)

In [13]:
y_atis_train[5]

'atis_flight'

### Featurization

Obtain a sentence-level representation  

In [None]:
%time input_token_ids_train, sf_hidden_states_train, final_hidden_states_avg_train, cls_output_train =\
bert_batch_featurizer(hf_tokenizer, hf_model, X_atis_sentences_train)

In [None]:
%time input_token_ids_dev, sf_hidden_states_dev, final_hidden_states_avg_dev, cls_output_dev =\
bert_batch_featurizer(hf_tokenizer, hf_model, X_atis_sentences_dev)

In [None]:
final_hidden_states_avg_train.shape

In [None]:
final_hidden_states_avg_dev.shape

Create a summary of the whole sentence by averaging the sequence of hidden-states for the whole input sequence.

    """
    - cls_output (torch.FloatTensor: of shape (batch_size, hidden_size))
    The cls_output variable contains the sentence level embedding returned from the transformer.Model forward method. 
    This is what the HuggingFace documentation says about it:
    Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained 
    from the next sentence prediction (classification) objective during pre-training.
    This output is usually not a good summary of the semantic content of the input, you’re often better with averaging or pooling the sequence of hidden-states for the whole input sequence.
    """

### Model Fitting

Now that all the examples are featurized, we can fit a model and evaluate it:

In [None]:
id_classifier = TorchShallowNeuralClassifier(max_iter=100, hidden_dim=300)

In [None]:
%time _ = id_classifier.fit(X_atis_sentences_train, y_atis_train)

In [None]:
hf_preds = id_classifier.predict(X_dev)

# sklearn classification report
print(classification_report(y_atis_dev, hf_preds, digits=3))

In [None]:
# DSTC8
id_classifier = TorchShallowNeuralClassifier(max_iter=500, hidden_dim=50)
%time _ = id_classifier.fit(final_hidden_states_avg_train, y_dstc8_train[:len(final_hidden_states_avg_train)])
hf_preds = id_classifier.predict(final_hidden_states_avg_dev)

# sklearn classification report
print(classification_report(y_dstc8_dev[:len(final_hidden_states_avg_dev)], hf_preds, digits=3))

* * * 

### A feed-forward experiment with the ATIS module

It is straightforward to conduct experiments like the above using `atis.experiment`, which will enable you to do a wider range of experiments without writing or copy-pasting a lot of code. 

In [14]:
def fit_shallow_network(X, y, **kwargs):
    mod = TorchShallowNeuralClassifier(
        max_iter=500, hidden_dim=50)
    mod.fit(X, y)
    return mod

In [15]:
%%time 
_ = atis.experiment(
    atis_home=ATIS_HOME,
    phi=bert_single_sentence_featurizer_for_classification,  # featurizer
    batch_phi=None,
    label_alignment_func=None,
    train_func=fit_shallow_network,
    train_reader=atis.train_reader, 
    assess_reader=atis.dev_reader, 
    class_func=atis.intent_class_func,
    vectorize=False)  # Pass in the BERT hidden state directly!

Finished epoch 500 of 500; error is 0.0024766515707597136

                               precision    recall  f1-score   support

            atis_abbreviation      1.000     0.882     0.938        17
                atis_aircraft      0.769     0.909     0.833        11
                 atis_airfare      0.881     0.974     0.925        38
atis_airfare#atis_flight_time      0.000     0.000     0.000         1
                 atis_airline      0.833     0.833     0.833        18
                 atis_airport      1.000     0.333     0.500         3
                atis_capacity      0.500     1.000     0.667         1
                    atis_city      0.000     0.000     0.000         1
                atis_distance      1.000     0.333     0.500         3
                  atis_flight      0.978     0.978     0.978       357
     atis_flight#atis_airfare      1.000     0.500     0.667         2
               atis_flight_no      0.000     0.000     0.000         0
             atis_flight_time      0.750     0.667     0.706         9
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### BERT fine-tuning with Hugging Face

The above experiments are quite successful – BERT gives us a reliable boost compared to other methods we've explored for the ATIS task. However, we might expect to do even better if we fine-tune the BERT parameters as part of fitting our ATIS classifier. To do that, we need to incorporate the Hugging Face BERT model into our classifier. This too is quite straightforward.

The most important step is to create an `nn.Module` subclass that has, for its parameters, both the BERT model and parameters for our own classifier:

In [17]:
class HfBertClassifierModel(nn.Module):
    def __init__(self, n_classes, weights_name='bert-base-cased'):
        super().__init__()
        self.n_classes = n_classes
        self.weights_name = weights_name
        self.bert = BertModel.from_pretrained(self.weights_name)
        self.hidden_dim = self.bert.embeddings.word_embeddings.embedding_dim
        # The only new parameters -- the classifier layer:
        self.W = nn.Linear(self.hidden_dim, self.n_classes)
        
    def forward(self, X):
        """Here, `X` is an np.array in which each element is a pair 
        consisting of an index into the BERT embedding and a 1 or 0
        indicating whether the token is masked. The `fit` method will 
        train all these parameters against a softmax objective.
        
        """
        indices = X[: , 0, : ]
        # Type conversion, since the base class insists on
        # casting this as a FloatTensor, but we ned Long
        # for `bert`.
        indices = indices.long()
        mask = X[: , 1, : ]      
        (final_hidden_states, cls_output) = self.bert(
            indices, attention_mask=mask)
        
        # reps = cls_output
        reps = final_hidden_states.mean(axis=1)  # for better performance
        return self.W(reps)

For the training and prediction interface, we can somewhat opportunistically subclass `TorchShallowNeuralClassifier` so that we don't have to write any of our own data-handling, training, or prediction code:

In [18]:
class HfBertClassifier(TorchShallowNeuralClassifier):
    def __init__(self, weights_name, *args, **kwargs):
        self.weights_name = weights_name
        self.tokenizer = BertTokenizer.from_pretrained(self.weights_name)
        super().__init__(*args, **kwargs)
        
    def define_graph(self):
        """This method is used by `fit`. We override it here to use our
        new BERT-based graph.
        
        """
        bert = HfBertClassifierModel(
            self.n_classes_, weights_name=self.weights_name)
        
        # The following does not train the model
        bert.train()   # this just sets a flag that enables modification of pretrained weights
        return bert
    
    def encode(self, X, max_length=None):
        """The `X` is a list of strings. We use the model's tokenizer
        to get the indices and mask information.
        
        Returns
        -------
        list of [index, mask] pairs, where index is an int and mask
        is 0 or 1.
        
        """
        data = self.tokenizer.batch_encode_plus(
            X, 
            max_length=max_length,
            add_special_tokens=True, 
            pad_to_max_length=True,
            return_attention_mask=True)
        indices = data['input_ids']
        mask = data['attention_mask']
        return [[i, m] for i, m in zip(indices, mask)]

Here's a self-contained illustration, starting from the raw data:

In [19]:
hf_train = list(atis.train_reader(ATIS_HOME, class_func=atis.intent_class_func))
hf_dev = list(atis.dev_reader(ATIS_HOME, class_func=atis.intent_class_func))

X_hf_sentence_train, y_hf_train = zip(*hf_train)
X_hf_sentence_dev, y_hf_dev = zip(*hf_dev)

In [20]:
print(X_hf_sentence_dev[1].split())
print(y_hf_dev[1][0])

len(X_hf_sentence_dev[1].split()), len(y_hf_dev[1][0])

['show', 'me', 'all', 'round', 'trip', 'flights', 'between', 'houston', 'and', 'las', 'vegas']
a


(11, 1)

Our model has some standard fine-tuning parameters:

In [21]:
hf_fine_tune_mod = HfBertClassifier(
    'bert-base-cased', 
    batch_size=16, # Crucial; large batches will eat up all your memory!
    max_iter=4, 
    eta=0.00002)

Now we can encode them; this step packs together the indices and mask information:

In [22]:
X_hf_indices_train = hf_fine_tune_mod.encode(X_hf_sentence_train)

X_hf_indices_dev = hf_fine_tune_mod.encode(X_hf_sentence_dev)

Training this model is resource intensive. Be patient – it will be worth the wait! (This experiment takes about 10 minutes on a machine with an NVIDIA RTX 2080 Max-Q GPU.)

In [23]:
%time _ = hf_fine_tune_mod.fit(X_hf_indices_train, y_hf_train)

Finished epoch 4 of 4; error is 5.8049565391265792

CPU times: user 7h 35min 27s, sys: 32min 32s, total: 8h 8min
Wall time: 1h 7min 50s


Finally, some predictions on the dev set:

In [24]:
hf_fine_tune_preds = hf_fine_tune_mod.predict(X_hf_indices_dev)

In [25]:
print(classification_report(hf_fine_tune_preds, y_hf_dev, digits=3))

                               precision    recall  f1-score   support

            atis_abbreviation      0.941     1.000     0.970        16
                atis_aircraft      1.000     0.846     0.917        13
                 atis_airfare      1.000     0.905     0.950        42
atis_airfare#atis_flight_time      0.000     0.000     0.000         0
                 atis_airline      1.000     0.947     0.973        19
                 atis_airport      0.667     1.000     0.800         2
                atis_capacity      1.000     1.000     1.000         1
                    atis_city      1.000     1.000     1.000         1
                atis_distance      1.000     1.000     1.000         3
                  atis_flight      0.980     0.997     0.989       351
     atis_flight#atis_airfare      1.000     0.667     0.800         3
             atis_flight_time      1.000     0.900     0.947        10
             atis_ground_fare      1.000     1.000     1.000         3
     

The above is just one of the many possible ways to fine-tune BERT using our course modules or new modules you write. The crux of it is creating an `nn.Module` that combines the BERT parameters with your model's new parameters.

* * *