# Part 1: Load Dataset + Quickstart of Pretrained Backbones

In [None]:
import pandas as pd
import mxnet as mx
import gluonnlp
from gluonnlp.utils import set_seed
mx.npx.set_np()
set_seed(123)

## Load the Dataset

Let's download two datasets from the [GLUE benchmark](https://gluebenchmark.com/):
- The Stanford Sentiment Treebank (SST-2)
- Semantic Textual Similarity Benchmark (STS-B)

We will use these two throughout the tutorial.

To download the datasets, we will simply use the `nlp_data` command. The downloaded datasets are preprocessed into the [parquet](https://parquet.apache.org/) format, which can be loaded with [pandas](https://pandas.pydata.org/).

In [2]:
!nlp_data prepare_glue --benchmark glue -t sst
!nlp_data prepare_glue --benchmark glue -t sts
!ls glue

Downloading glue to "glue". Selected tasks = sst
Processing sst...
Found!
Downloading glue to "glue". Selected tasks = sts
Processing sts...
Found!
sst  sts


In [3]:
train_df = pd.read_parquet('glue/sst/train.parquet')
valid_df = pd.read_parquet('glue/sst/dev.parquet')

In [4]:
train_df.head(10)

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0
5,that 's far too tragic to merit such superfici...,0
6,demonstrates that the director of such hollywo...,1
7,of saucy,1
8,a depressed fifteen-year-old 's suicidal poetry,0
9,are more deeply thought through than in most `...,1


In [5]:
train_df = pd.read_parquet('glue/sts/train.parquet')
valid_df = pd.read_parquet('glue/sts/dev.parquet')

In [6]:
train_df.head(10)

Unnamed: 0,sentence1,sentence2,genre,score
0,A plane is taking off.,An air plane is taking off.,main-captions,5.0
1,A man is playing a large flute.,A man is playing a flute.,main-captions,3.8
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,main-captions,3.8
3,Three men are playing chess.,Two men are playing chess.,main-captions,2.6
4,A man is playing the cello.,A man seated is playing the cello.,main-captions,4.25
5,Some men are fighting.,Two men are fighting.,main-captions,4.25
6,A man is smoking.,A man is skating.,main-captions,0.5
7,The man is playing the piano.,The man is playing the guitar.,main-captions,1.6
8,A man is playing on a guitar and singing.,A woman is playing an acoustic guitar and sing...,main-captions,2.2
9,A person is throwing a cat on to the ceiling.,A person throws a cat on the ceiling.,main-captions,5.0


## Quickstart of Pretrained Backbones

A bunch of recent papers, especially [BERT](https://arxiv.org/pdf/1810.04805.pdf), have led a new trend for solving NLP problems: 1) pretrain a backbone model on a large corpus, 2) finetune the backbone on a specific NLP task.

GluonNLP provides an interface for accessing pretrained backbone models.

In [7]:
# Look up a pretrained backbone by name.
from gluonnlp.models import get_backbone
model_name = 'google_en_cased_bert_base'
# get_backbone returns a 5-tuple: the model class, its configuration, the
# matching tokenizer and the path of the weights (all printed in the next
# cell). The fifth element is not needed here and is discarded.
model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(model_name)

In [8]:
print('- Model Class:')
print(model_cls)
print('\n- Configuration:')
print(cfg)
print('\n- Tokenizer:')
print(tokenizer)
print('\n- Path of the weights:')
print(local_params_path)

- Model Class:
<class 'gluonnlp.models.bert.BertModel'>

- Configuration:
INITIALIZER:
  bias: ['zeros']
  embed: ['truncnorm', 0, 0.02]
  weight: ['truncnorm', 0, 0.02]
MODEL:
  activation: gelu
  attention_dropout_prob: 0.1
  compute_layout: auto
  dtype: float32
  hidden_dropout_prob: 0.1
  hidden_size: 3072
  layer_norm_eps: 1e-12
  layout: NT
  max_length: 512
  num_heads: 12
  num_layers: 12
  num_token_types: 2
  pos_embed_type: learned
  units: 768
  vocab_size: 28996
VERSION: 1

- Tokenizer:
HuggingFaceWordPieceTokenizer(
   vocab_file = /home/ubuntu/.mxnet/models/nlp/google_en_cased_bert_base/vocab-c1defaaa.json
   unk_token = [UNK], sep_token = [SEP], cls_token = [CLS]
   pad_token = [PAD], mask_token = [MASK]
   clean_text = True, handle_chinese_chars = True
   strip_accents = False, lowercase = False
   wordpieces_prefix = ##
   vocab = Vocab(size=28996, unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]", sep_token="[SEP]", mask_token="[MASK]")
)

- Path of the weight

To create a new backbone model in Gluon, you can just use the following commands:

In [9]:
# Build the network architecture described by the configuration.
backbone = model_cls.from_cfg(cfg)
backbone.hybridize()
# Fill the freshly built network with the weights stored at local_params_path.
backbone.load_parameters(local_params_path)
print(backbone)

BertModel(
  (encoder): BertTransformer(
    (all_layers): HybridSequential(
      (0): TransformerEncoderLayer(
        (dropout_layer): Dropout(p = 0.1, axes=())
        (attn_qkv): Dense(768 -> 2304, linear)
        (attention_proj): Dense(768 -> 768, linear)
        (attention_cell): MultiHeadAttentionCell(
           query_units=768,
           num_heads=12,
           attention_dropout=0.1,
           scaled=True,
           normalized=False,
           layout="NTK",
           use_einsum=False,
           dtype=float32
        )
        (layer_norm): LayerNorm(eps=1e-12, axis=-1, center=True, scale=True, in_channels=768)
        (ffn): PositionwiseFFN(
        	units=768,
        	hidden_size=3072,
        	activation_dropout=0.0,
        	activation=gelu,
        	dropout=0.1,
        	normalization=layer_norm,
        	layer_norm_eps=1e-12,
        	pre_norm=False,
        	dtype=float32
        )
      )
      (1): TransformerEncoderLayer(
        (dropout_layer): Dropout(p =

You can directly use the `backbone` to extract contextual embeddings:

In [10]:
text_input = 'GluonNLP helps practitioners solve NLP problems.'
# Encode the sentence into a list of integer token ids.
token_ids = tokenizer.encode(text_input, int)
# Surround the ids with [CLS] ... [SEP]; the extra list nesting adds a leading
# batch axis of size 1.
token_ids = mx.np.array([[tokenizer.vocab.cls_id] + token_ids + [tokenizer.vocab.sep_id]])
# Single-sentence input: every token gets type 0.
# NOTE(review): this array is 1-D (seq_length,) while token_ids is 2-D
# (1, seq_length); presumably the backbone broadcasts it -- confirm.
token_types = mx.np.array([0] * len(token_ids[0]))
# Length of the only sample in the batch (there is no padding here).
valid_length = mx.np.array([len(token_ids[0])])
print('Token IDs=', token_ids)
print('Token Types=', token_types)
print('Valid Length=', valid_length)
# The backbone returns two arrays; their shapes are printed in the next cells.
mlm_embeddings, cls_embedding = backbone(token_ids, token_types, valid_length)

Token IDs= [[  101.   144.  7535.  1320. 20734.  2101.  6618. 16681.  9474. 21239.
   2101.  2645.   119.   102.]]
Token Types= [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Valid Length= [14.]


In [11]:
print(cls_embedding.shape)
print(cls_embedding)

(1, 768)
[[-0.57272696  0.35954106  0.9992546  -0.970326    0.90788835  0.93265927
   0.92036456 -0.99692816 -0.8863071  -0.63977534  0.9243325   0.9911606
  -0.9990826  -0.9992712   0.8418865  -0.9091129   0.9655406  -0.5252916
  -0.9997559  -0.77614385 -0.6188369  -0.99916065  0.18092567  0.96592814
   0.90377444  0.06120971  0.95132345  0.99970615  0.80378264 -0.5750337
   0.11013095 -0.9629547   0.9514687  -0.9962427   0.07688154  0.4343017
   0.86048555 -0.19628136  0.8895614  -0.959134   -0.56514496 -0.88191223
   0.66912913 -0.5302047   0.93865705  0.02508944 -0.02480373 -0.11807826
  -0.11206948  0.99931514 -0.85191596  0.74189985 -0.99826777  0.91011924
   0.96564406  0.41362172  0.9757109   0.10942441 -0.9995759   0.10298529
   0.9245132   0.30454144  0.8265128   0.06407724  0.3205736  -0.38333276
  -0.8760772  -0.02513863 -0.45231512  0.17965323 -0.08799271  0.24097772
   0.92556524 -0.80420023  0.02304729 -0.8587904   0.01149753 -0.9991972
   0.9223567   0.99969673  0.85430

In [12]:
print(mlm_embeddings.shape)
print(mlm_embeddings)

(1, 14, 768)
[[[ 0.6234134  -0.07291081 -0.13999614 ... -0.22016764  0.27736056
    0.05864484]
  [ 0.672965   -0.6641126   0.23682861 ... -0.04360078  0.05051699
   -0.04530283]
  [ 0.55150485 -0.3550599   0.5144368  ...  0.64902544 -0.00809672
    0.20128815]
  ...
  [ 0.22296664  0.14223897 -0.05181786 ...  0.10244231 -0.605693
   -0.03737953]
  [ 1.2198194  -0.51079684  0.3234357  ...  0.20491412  0.6366281
   -0.59139895]
  [ 1.1782554  -0.5468905   0.40015438 ...  0.22755586  0.58317333
   -0.5389005 ]]]


Apart from BERT, GluonNLP provides other backbone models, including recent ones such as [XLMR](https://arxiv.org/pdf/1911.02116.pdf), [ALBERT](https://arxiv.org/pdf/1909.11942.pdf), [ELECTRA](https://openreview.net/pdf?id=r1xMH1BtvB), and [MobileBERT](https://arxiv.org/pdf/2004.02984.pdf). We can use `list_backbone_names` to list all the backbones that are supported in GluonNLP.

In [13]:
from gluonnlp.models import list_backbone_names
list_backbone_names()

['google_albert_base_v2',
 'google_albert_large_v2',
 'google_albert_xlarge_v2',
 'google_albert_xxlarge_v2',
 'google_en_cased_bert_base',
 'google_en_cased_bert_large',
 'google_en_cased_bert_wwm_large',
 'google_en_uncased_bert_base',
 'google_en_uncased_bert_large',
 'google_en_uncased_bert_wwm_large',
 'google_multi_cased_bert_base',
 'google_zh_bert_base',
 'gluon_electra_small_owt',
 'google_electra_base',
 'google_electra_large',
 'google_electra_small',
 'google_uncased_mobilebert',
 'fairseq_roberta_base',
 'fairseq_roberta_large',
 'fairseq_xlmr_base',
 'fairseq_xlmr_large',
 'fairseq_bart_base',
 'fairseq_bart_large']

With the help of `list_backbone_names`, let's generate a table that shows the number of parameters of each backbone model.

In [None]:
from gluonnlp.utils.misc import count_parameters
# Collect (backbone name, total number of parameters) for every supported backbone.
param_num_l = []
for name in list_backbone_names():
    print(name)
    # Use loop-local names so that the notebook-level `model_cls`, `cfg`,
    # `tokenizer` and `local_params_path` keep pointing at the backbone loaded
    # earlier instead of being silently rebound to the last entry of the list.
    # load_backbone=False: the weights are not needed, only the configuration.
    backbone_cls, backbone_cfg, _, _, _ = get_backbone(name, load_backbone=False)
    model = backbone_cls.from_cfg(backbone_cfg)
    model.hybridize()
    # Parameters must exist before they can be counted.
    model.initialize()
    total_num_params, _ = count_parameters(model.collect_params())
    param_num_l.append((name, total_num_params))
# Render the collected numbers as a table (last expression of the cell).
pd.DataFrame(param_num_l, columns=['Model', '#Params'])

google_albert_base_v2
google_albert_large_v2


  'will not be correct.'.format(k))
  'will not be correct.'.format(k))
  'will not be correct.'.format(k))


google_albert_xlarge_v2
google_albert_xxlarge_v2
google_en_cased_bert_base
google_en_cased_bert_large
google_en_cased_bert_wwm_large
google_en_uncased_bert_base
google_en_uncased_bert_large
google_en_uncased_bert_wwm_large
google_multi_cased_bert_base
google_zh_bert_base
gluon_electra_small_owt
google_electra_base
google_electra_large
google_electra_small
google_uncased_mobilebert
fairseq_roberta_base
fairseq_roberta_large
fairseq_xlmr_base


### Quick Start with BERT

Let's load the BERT model first. The architecture of BERT is illustrated as follows:

In [None]:
model_name = 'google_en_cased_bert_base'
# Fetch the class, configuration, tokenizer and weight path for this model
# explicitly, so this section does not depend on whichever backbone an earlier
# cell happened to load last.
model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(model_name)


In [None]:
print(model_cls)
print(local_params_path)
print(cfg)

In [None]:
backbone = model_cls.from_cfg(cfg)
backbone.hybridize()
backbone.load_parameters(local_params_path)

In [None]:
print(backbone)

In [None]:
text_input = 'GluonNLP helps practitioners solve NLP problems.'
# [CLS] tokens [SEP] with a leading batch axis -> shape (1, seq_length).
# Token ids are indices, so store them as integers rather than the default float.
token_ids = mx.np.array([[tokenizer.vocab.cls_id] + tokenizer.encode(text_input, int) + [tokenizer.vocab.sep_id]],
                        dtype='int32')
# Single-sentence input: every token has type 0. Built with the same
# (1, seq_length) shape as token_ids instead of relying on broadcasting a 1-D array.
token_types = mx.np.zeros_like(token_ids)
# One entry per sample in the batch: the number of tokens of that sample.
valid_length = mx.np.array([token_ids.shape[1]], dtype='int32')
print('Token IDs=', token_ids)
print('Token Types=', token_types)
print('Valid Length=', valid_length)
# The backbone returns two arrays: the first is printed as `mlm_embeddings`
# and the second as `cls_embedding` in the following cells.
mlm_embeddings, cls_embedding = backbone(token_ids, token_types, valid_length)

In [None]:
print(mlm_embeddings.shape)
print(mlm_embeddings)

In [None]:
print(cls_embedding.shape)
print(cls_embedding)

### Usage of Tokenizer and Vocab in GluonNLP

In [None]:
print(tokenizer)

In [None]:
# Show the two output modes of tokenizer.encode on the same sentence.
original_string = "GluonNLP helps practitioners solve NLP problems."
print('Original:')
print('\t', original_string)
print('To string tokens:')
# Without an output type, encode is expected to return string tokens.
print('\t', tokenizer.encode(original_string))
print('To integer values:')
# Passing `int` requests integer token ids instead.
print('\t', tokenizer.encode(original_string, int))
print('Vocabulary of the tokenizer:')
print('\t', tokenizer.vocab)

In [None]:
# Token -> id lookup.
print(tokenizer.vocab['helps'])
# Id -> token lookup.
# NOTE(review): 6618 is presumably the id of 'helps' for the cased BERT
# vocabulary (it appears in the token ids printed earlier) -- confirm.
print(tokenizer.vocab.all_tokens[6618])
# Special tokens known to the vocabulary.
print(tokenizer.vocab.special_tokens)
print(tokenizer.vocab.cls_token)
print(tokenizer.vocab.mask_token)
print(tokenizer.vocab.unk_token)
# NOTE(review): the emoji is presumably out of vocabulary, so this line should
# show how unknown input is tokenized -- confirm.
print(tokenizer.encode('😁 means smile'))

### Load Other Backbone Models

#### - [ALBERT](https://arxiv.org/pdf/1909.11942.pdf):

In [None]:
model_cls, cfg, tokenizer, local_params_path, _ = get_backbone('google_albert_large_v2')
backbone = model_cls.from_cfg(cfg)
# Load the pretrained weights so that `backbone` is usable after this cell,
# as the MobileBERT example in this section does.
backbone.load_parameters(local_params_path)
print(cfg)
print()
print(tokenizer)

#### - [ELECTRA](https://arxiv.org/pdf/2003.10555.pdf):

In [None]:
model_cls, cfg, tokenizer, local_params_path, _ = get_backbone('google_electra_base')
backbone = model_cls.from_cfg(cfg)
# Load the pretrained weights so that `backbone` is usable after this cell,
# as the MobileBERT example in this section does.
backbone.load_parameters(local_params_path)
print(cfg)

#### - [MobileBERT](https://arxiv.org/pdf/2004.02984.pdf)

In [None]:
# Fetch MobileBERT, build it from its configuration and load its pretrained weights.
model_cls, cfg, tokenizer, local_params_path, _ = get_backbone('google_uncased_mobilebert')
backbone = model_cls.from_cfg(cfg)
backbone.load_parameters(local_params_path)
print(cfg)

## Write Model for Text Prediction

**TODO:** Insert a figure describing how to build the network.


### Example-1: Sentiment Analysis

In [None]:
def preprocess_data(df, feature_columns, label_column, tokenizer, max_length=128, use_label=True):
    """Turn every row of a DataFrame into token ids, token types and a valid length.

    Each sample is laid out as

        Token IDs   = [CLS] token_ids1 [SEP] token_ids2 [SEP]
        Token types =   0       0        0       1        1

    Parameters
    ----------
    df
        The DataFrame to convert; it is iterated row by row.
    feature_columns
        Name (str) or list of names of the text columns to encode.
    label_column
        Name of the label column. Only read when ``use_label`` is True.
    tokenizer
        Tokenizer providing ``encode(text, int)`` and ``vocab.cls_id`` / ``vocab.sep_id``.
    max_length
        Length budget of one sample, including the [CLS] and [SEP] tokens.
    use_label
        Whether to attach the label to each sample.

    Returns
    -------
    samples
        A list with one entry per row: ``((token_ids, token_types, valid_length), label)``
        when ``use_label`` is True, otherwise ``(token_ids, token_types, valid_length)``.
    """
    columns = [feature_columns] if isinstance(feature_columns, str) else feature_columns
    vocab = tokenizer.vocab
    cls_id, sep_id = vocab.cls_id, vocab.sep_id
    # One [CLS] plus one [SEP] per column are added on top of the text tokens.
    num_special_tokens = len(columns) + 1
    samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        segments = [tokenizer.encode(row[column], int) for column in columns]
        # NOTE(review): presumably get_trimmed_lengths decides how many tokens of
        # each segment to keep so that the total fits the budget -- confirm.
        kept_lengths = get_trimmed_lengths([len(segment) for segment in segments],
                                           max_length=max_length - num_special_tokens,
                                           do_merge=True)
        token_ids = [cls_id]
        token_types = [0]
        for position, (kept, segment) in enumerate(zip(kept_lengths, segments)):
            token_ids.extend(segment[:kept])
            token_ids.append(sep_id)
            # Token types alternate 0, 1, 0, ... per segment; the +1 covers the [SEP].
            token_types.extend([position % 2] * (kept + 1))
        feature = (token_ids, token_types, len(token_ids))
        samples.append((feature, row[label_column]) if use_label else feature)
    return samples

In [None]:
class TextPredictionNet(nn.HybridBlock):
    """A backbone model followed by a dense output projection."""

    def __init__(self, backbone, in_units, out_units):
        """Construct the text prediction network.

        Parameters
        ----------
        backbone
            The backbone model. It is called as
            ``backbone(data, token_types, valid_length)`` and must return two outputs.
        in_units
            The units of the features extracted by the backbone model.
        out_units
            The number of output units.
        """
        super().__init__()
        self.backbone = backbone
        self.out_proj = nn.Dense(units=out_units, in_units=in_units, flatten=False)

    def hybrid_forward(self, F, data, token_types, valid_length):
        """Run the backbone and project its second output.

        Parameters
        ----------
        F
        data
            The input data.
            The shape is (batch_size, seq_length)
        token_types
            The type of each token.
        valid_length
            The valid length of each sample.
            Shape is (batch_size,)

        Returns
        -------
        out
            Shape is (batch_size, out_units)
        """
        # Only the second backbone output is used; the first one is ignored.
        _first_out, pooled = self.backbone(data, token_types, valid_length)
        return self.out_proj(pooled)

    def initialize_with_pretrained_backbone(self, backbone_params_path, ctx=None):
        """Load the backbone weights from a file and initialize the output layer.

        Parameters
        ----------
        backbone_params_path
            Path of the parameter file that is loaded into the backbone.
        ctx
            Forwarded as ``ctx`` to both the parameter loading and the
            initialization of the output projection.
        """
        self.backbone.load_parameters(backbone_params_path, ctx=ctx)
        self.out_proj.initialize(ctx=ctx)

In [None]:
# Build an (unloaded) MobileBERT backbone; its weights are loaded further down.
model_cls, cfg, tokenizer, local_params_path, _ = get_backbone('google_uncased_mobilebert')
backbone = model_cls.from_cfg(cfg)

# The feature size fed to the output layer is the backbone's `units`.
# NOTE(review): the 2 output units are presumably the two SST-2 sentiment classes -- confirm.
net = TextPredictionNet(backbone, backbone.units, 2)
net.hybridize()
# Contexts on which the network is placed; reused by the training and prediction cells.
ctx_l = get_mxnet_available_ctx()
# Loads the pretrained backbone weights and initializes the new output layer.
net.initialize_with_pretrained_backbone(local_params_path, ctx_l)

In [None]:
train_df = pd.read_parquet('glue/sst/train.parquet')
# Fixed seed so that the same training subset is drawn on every run.
rng_state = np.random.RandomState(123)
train_perm = rng_state.permutation(len(train_df))
# Just use 2000 samples for training
train_df = train_df.iloc[train_perm[:2000]]
valid_df = pd.read_parquet('glue/sst/dev.parquet')
# Training samples keep their label: ((token_ids, token_types, valid_length), label).
train_processed = preprocess_data(train_df,
                                  feature_columns=['sentence'],
                                  label_column='label',
                                  tokenizer=tokenizer,
                                  use_label=True)
# Dev samples are features only; the dev labels are read from valid_df when
# the accuracy is computed.
dev_processed = preprocess_data(valid_df,
                                feature_columns=['sentence'],
                                label_column='label',
                                tokenizer=tokenizer,
                                use_label=False)

After processing,
- Train Sample: `((token_ids, token_types, valid_length), label)`
- Valid Sample: `(token_ids, token_types, valid_length)`

We construct the batchify function based on this observation.

In [None]:
print(train_processed[0])
print(dev_processed[0])

In [None]:
# The nesting mirrors the sample layout:
#   train sample: ((token_ids, token_types, valid_length), label)
#   dev sample:   (token_ids, token_types, valid_length)
# NOTE(review): presumably Pad pads the variable-length id/type lists to a common
# length and Stack stacks the per-sample scalars -- confirm.
train_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()),
                          bf.Stack())
dev_batchify = bf.Group(bf.Pad(), bf.Pad(), bf.Stack())

Next, we write the training loop. We use a triangular learning rate schedule: a linear warmup followed by a linear decay.

In [None]:
def train(batch_size, dataset, batchify_function, net, ctx_l,
          num_epochs, lr=1E-4, wd=0.01, max_grad_norm=1.0, warmup_ratio=0.1):
    """Fine-tune `net` on `dataset` with AdamW, gradient clipping and a warmup/decay schedule.

    Parameters
    ----------
    batch_size
        Total batch size over all devices. Must be divisible by ``len(ctx_l)``.
    dataset
        The training samples, each ``((token_ids, token_types, valid_length), label)``.
    batchify_function
        Function used by the DataLoader to merge samples into a batch.
    net
        The network to train. It is called as ``net(token_ids, token_types, valid_length)``.
    ctx_l
        List of contexts; each receives ``batch_size // len(ctx_l)`` samples per update.
    num_epochs
        Number of epochs. The number of updates is
        ``(len(dataset) // batch_size) * num_epochs``.
    lr
        Peak learning rate of the schedule.
    wd
        Weight decay passed to the optimizer.
    max_grad_norm
        Threshold passed to ``clip_grad_global_norm``.
    warmup_ratio
        Fraction of all updates used for the linear warmup.

    Returns
    -------
    None
        The parameters of `net` are updated in place; progress is printed.
    """
    assert batch_size % len(ctx_l) == 0
    per_device_batch_size = batch_size // len(ctx_l)
    epoch_num_updates = len(dataset) // batch_size
    max_update = epoch_num_updates * num_epochs
    warmup_steps = int(np.ceil(max_update * warmup_ratio))
    dataloader = DataLoader(dataset,
                            batch_size=per_device_batch_size,
                            batchify_fn=batchify_function,
                            num_workers=4,
                            shuffle=True)
    # Repeat the loader endlessly and hand out one per-device batch per context at a time.
    dataloader = grouper(repeat(dataloader), len(ctx_l))
    # pwr=1 with linear warmup: the rate rises from 0 to `lr` over the warmup
    # steps and is configured to end at final_lr=0 after max_update updates.
    lr_scheduler = PolyScheduler(max_update=max_update,
                                 base_lr=lr,
                                 warmup_begin_lr=0.0,
                                 pwr=1,
                                 final_lr=0.0,
                                 warmup_steps=warmup_steps,
                                 warmup_mode='linear')
    optimizer_params = {'learning_rate': lr,
                        'wd': wd,
                        'lr_scheduler': lr_scheduler}
    trainer = mx.gluon.Trainer(net.collect_params(),
                               'adamw',
                               optimizer_params)
    # Only parameters that receive gradients take part in the norm clipping.
    params = [p for p in net.collect_params().values() if p.grad_req != 'null']
    # Running sums since the last log line.
    log_loss = 0
    log_gnorm = 0
    log_step = 0
    # Log roughly ten times per epoch.
    log_interval = int(epoch_num_updates * 0.1)
    for i in range(max_update):
        sample_l = next(dataloader)
        loss_l = []
        for sample, ctx in zip(sample_l, ctx_l):
            (token_ids, token_types, valid_length), label = sample
            # Move to the corresponding context
            token_ids = mx.np.array(token_ids, ctx=ctx)
            token_types = mx.np.array(token_types, ctx=ctx)
            valid_length = mx.np.array(valid_length, ctx=ctx)
            label = mx.np.array(label, ctx=ctx)
            with mx.autograd.record():
                scores = net(token_ids, token_types, valid_length)
                logits = mx.npx.log_softmax(scores, axis=-1)
                # Negative log-likelihood of the correct class.
                loss = - mx.npx.pick(logits, label)
                # Divide by the number of devices so the per-device means sum
                # up to the average over the whole batch.
                loss_l.append(loss.mean() / len(ctx_l))
        for loss in loss_l:
            loss.backward()
        # Reduce the gradients across devices explicitly so they can be clipped
        # before the update.
        trainer.allreduce_grads()
        # Begin Norm Clipping
        total_norm, ratio, is_finite = clip_grad_global_norm(params, max_grad_norm)
        # NOTE(review): 1.0 is presumably used because the loss is already
        # averaged above, so no further rescaling is wanted -- confirm.
        trainer.update(1.0)
        step_loss = sum([loss.asnumpy() for loss in loss_l])
        log_loss += step_loss
        log_gnorm += total_norm
        log_step += 1
        # Also log on the very last update so the final numbers are always shown.
        if log_step >= log_interval or i == max_update - 1:
            print('[Iter {} / {}] avg nll = {}, avg gradient norm = {}'.format(i + 1, max_update, log_loss / log_step, log_gnorm / log_step))
            log_loss = 0
            log_gnorm = 0
            log_step = 0

In [None]:
train(32, train_processed, train_batchify, net, ctx_l, 3, lr=1E-4)

In [None]:
def predict(batch_size, dataset, batchify_function, net, ctx_l):
    """Compute softmax probabilities for every sample of `dataset`.

    Parameters
    ----------
    batch_size
        Total batch size over all devices. Must be divisible by ``len(ctx_l)``.
    dataset
        The samples to score, each ``(token_ids, token_types, valid_length)``.
    batchify_function
        Function used by the DataLoader to merge samples into a batch.
    net
        The network. It is called as ``net(token_ids, token_types, valid_length)``.
    ctx_l
        List of contexts; the per-device batches are distributed over them.

    Returns
    -------
    pred
        A numpy array with the per-batch softmax outputs concatenated along
        axis 0, in dataset order (the loader does not shuffle).
    """
    num_devices = len(ctx_l)
    assert batch_size % num_devices == 0
    loader = DataLoader(dataset,
                        batch_size=batch_size // num_devices,
                        batchify_fn=batchify_function,
                        shuffle=False)
    prob_chunks = []
    for device_batches in grouper(loader, num_devices):
        for device_batch, ctx in zip(device_batches, ctx_l):
            # A group may contain None entries; those are skipped.
            if device_batch is not None:
                token_ids, token_types, valid_length = device_batch
                # Move the inputs to the corresponding context.
                token_ids = mx.np.array(token_ids, ctx=ctx)
                token_types = mx.np.array(token_types, ctx=ctx)
                valid_length = mx.np.array(valid_length, ctx=ctx)
                scores = net(token_ids, token_types, valid_length)
                prob_chunks.append(mx.npx.softmax(scores, axis=-1).asnumpy())
    return np.concatenate(prob_chunks, axis=0)

In [None]:
# Class probabilities for every dev sample, in the order of valid_df.
pred = predict(64, dev_processed, dev_batchify, net, ctx_l)
# The predicted class is the one with the highest probability; accuracy is the
# fraction of dev samples whose prediction equals the label.
accuracy = (pred.argmax(axis=-1) == valid_df['label']).sum() / len(valid_df)
print('Accuracy of the Dev Set=', accuracy)