<center><h2>ALTeGraD 2022<br>Lab Session 2: Transfer learning for NLP</h2> 27 / 10 / 2022<br> M. Kamal Eddine, H. Abdine<br><br>


<b>Student name:</b> Sicheng MAO

</center>

<br><br>
In this lab we will:
* Implement and pretrain a language model with transformer architecture.
* Use the pretrained model (transfer learning) to perform a sentiment analysis task, which consists of classifying book reviews as positive or negative.
* Compare the performance of the pretrained model to a model trained from scratch.
 <br>
 
<b>The deadline for this lab is November 14, 2022 11:59 PM.</b> More details about the submission and the architecture for this lab can be found in the handout PDF.

In [None]:
import math

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### The Model

In [None]:
class TransformerModel(nn.Module):
    """Transformer encoder backbone: embedding, positional encoding, encoder stack.

    Args:
        ntoken: the size of the vocabulary.
        nhead: the number of heads in the multi-head attention layers.
        nhid: the hidden dimension of the model. It is used as the embedding
            dimension, as ``d_model`` and as ``dim_feedforward``.
        nlayers: the number of ``nn.TransformerEncoderLayer`` in the encoder.
        dropout: the dropout value, applied both in the positional encoding
            and in every encoder layer.
    """

    def __init__(self, ntoken, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = "Transformer"
        self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=nhid)
        # PositionalEncoding is defined in a later cell; it only has to exist
        # by the time a TransformerModel is instantiated. The dropout value is
        # forwarded so that `dropout=0` really disables dropout everywhere.
        self.pos_encoder = PositionalEncoding(nhid=nhid, dropout=dropout)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=nhid, nhead=nhead, dim_feedforward=nhid, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, num_layers=nlayers
        )
        self.nhid = nhid
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a causal attention mask of shape [sz, sz].

        Entries strictly above the diagonal are ``-inf`` (future positions are
        hidden); every other entry is ``0.0``.
        """
        return torch.triu(torch.full((sz, sz), float("-inf")), diagonal=1)

    def init_weights(self):
        """Re-initialise the embedding weights uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        """Encode a batch of token index sequences.

        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, nhid]
        """
        # Embeddings are scaled by sqrt(nhid) before the positional signal is added.
        src = self.encoder(src) * math.sqrt(self.nhid)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        return output


class ClassificationHead(nn.Module):
    """A single linear layer that maps hidden vectors to class scores."""

    def __init__(self, nhid, nclasses):
        super().__init__()
        self.decoder = nn.Linear(in_features=nhid, out_features=nclasses)
        self.init_weights()

    def init_weights(self):
        """Zero the bias and draw the weights uniformly from [-0.1, 0.1]."""
        bound = 0.1
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src):
        """Apply the linear layer to the last dimension of ``src``."""
        return self.decoder(src)
    
class Model(nn.Module):
    """Transformer backbone followed by a linear classification head.

    Args:
        ntoken: the size of the vocabulary.
        nhead: the number of attention heads.
        nhid: the hidden dimension of the model.
        nlayers: the number of encoder layers.
        nclasses: the output dimension of the head (the vocabulary size for
            language modeling, the number of labels for classification).
        dropout: the dropout value forwarded to the backbone.
    """

    def __init__(self, ntoken, nhead, nhid, nlayers, nclasses, dropout=0.5):
        super(Model, self).__init__()
        # `dropout` must be passed on explicitly; otherwise the backbone
        # silently falls back to its own default value.
        self.base = TransformerModel(ntoken, nhead, nhid, nlayers, dropout)
        self.classifier = ClassificationHead(nhid, nclasses)

    def forward(self, src, src_mask):
        """Return head scores for every position.

        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            Tensor of shape [seq_len, batch_size, nclasses]
        """
        # base model
        x = self.base(src, src_mask)
        # classifier model
        output = self.classifier(x)
        return output

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    Args:
        nhid: the embedding dimension (even or odd).
        dropout: the dropout probability applied after the addition.
        max_len: the number of positions for which the table is precomputed.
    """

    def __init__(self, nhid, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, nhid)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, nhid, 2).float() * (-math.log(10000.0) / nhid)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd nhid there is one cosine column fewer than sine columns,
        # so the angles are trimmed to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(angles[:, : nhid // 2])
        # Stored as [max_len, 1, nhid] so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # Registered as a buffer: it follows the module across devices and is
        # part of the state dict, but it is not a trainable parameter.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` where ``seq_len = x.size(0)``."""
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)

Let's verify that our model works by running a single inference step.

In [None]:
ntokens = 100  # the size of vocabulary
nhid = 200  # hidden dimension
nlayers = 4  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0  # the dropout value

model = Model(ntokens, nhead, nhid, nlayers, ntokens, dropout).to(device)
# The model reads its input as [seq_len, batch_size], so this tensor is
# treated as a single time step for a batch of six sequences.
dummy_input = torch.tensor([[2, 6, 2, 5, 43, 21]]).to(device)
# Size the causal mask from the input instead of hard-coding the length.
src_mask = model.base.generate_square_subsequent_mask(dummy_input.size(0)).to(device)
# Call the module itself rather than .forward() so registered hooks also run.
out = model(dummy_input, src_mask)

print(out.shape)  # expected: [seq_len, batch_size, ntokens] = [1, 6, 100]

torch.Size([1, 6, 100])


In [None]:
# Scores produced for the first position of the first batch element.
print(out[0][0])

tensor([ 0.0240, -0.5466,  0.1815,  0.1748,  0.8017, -0.5656, -0.0887,  0.2141,
         0.9976, -0.4842, -0.0672,  0.8824,  0.5316,  1.0324, -0.7642,  0.3703,
         0.5367, -0.5249,  0.1036,  0.5801, -0.4989, -0.9560, -0.1651,  1.1669,
         0.4758,  0.8491,  0.0785, -0.3663, -0.0288,  0.3055,  0.0033,  0.7649,
         0.5026, -0.2414,  0.1216, -0.1953, -2.0676,  0.4640,  1.2271,  0.1394,
        -0.9316, -0.0174, -0.8105, -0.7888,  1.5579,  0.5271,  2.2154, -0.3891,
        -0.4305,  0.3774,  0.3409,  0.3291,  0.9109,  1.6863,  0.3618, -0.7223,
        -0.6035, -1.4683, -0.7683,  0.8661,  0.3306, -0.2432, -1.5032,  1.0471,
        -0.1976,  0.3164,  0.1305, -0.6207, -0.4299, -0.8531, -1.0503, -0.0069,
         1.2915, -0.7423,  0.1569, -0.8888, -0.0588, -0.3371,  0.6364,  0.8523,
        -0.9431, -0.6721,  0.5300,  1.9028, -0.2463,  0.4552,  0.9895, -0.8640,
        -0.6995, -0.8975, -0.7785,  0.1270,  0.9556,  0.3297,  0.0050, -0.4078,
         0.6364, -1.9100,  0.4490, -0.69

## Vocabulary and Tokenization

In [None]:
!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/dict.txt
!head -5 dict.txt

--2022-11-04 17:35:07--  https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/dict.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 577587 (564K) [text/plain]
Saving to: ‘dict.txt.1’


2022-11-04 17:35:07 (39.8 MB/s) - ‘dict.txt.1’ saved [577587/577587]

▁d 1
es 1
▁l 1
en 1
on 1


In [None]:
path_vocab = "dict.txt"
# The first four indices are reserved for the special tokens.
token2ind = {"<sos>": 0, "<pad>": 1, "<eos>": 2, "<oov>": 3}
# The vocabulary contains non-ASCII pieces (e.g. "▁d"), so the encoding is
# stated explicitly instead of relying on the platform default.
with open(path_vocab, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f):
        # Each line is "<piece> <count>"; only the piece is kept. split()
        # already removes the surrounding whitespace.
        word = line.split()[0]
        token2ind[word] = idx + 4

# Reverse mapping, used to turn predicted indices back into pieces.
ind2token = {idx: word for (word, idx) in token2ind.items()}

print(ind2token[1111])

▁trop


### Data Loader


In [None]:
# Look up a piece, falling back to 3 (the index reserved for "<oov>").
token2ind.get('=',3)

3

In [None]:
import numpy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset


class Dataset(Dataset):
    """One-document-per-line dataset for language modeling or classification.

    The class keeps the name ``Dataset`` (shadowing the imported torch base
    class) because other cells refer to it by that name.

    Args:
        path_documents: text file with one whitespace-tokenized document per line.
        path_labels: text file with one integer label per line; required when
            ``task == "classification"``.
        token2ind: mapping from token to index. It must contain the special
            tokens "<sos>", "<eos>" and "<oov>".
        max_len: maximum length of a source sequence, "<sos>" included.
        task: "language_modeling" or "classification".

    Raises:
        ValueError: if labels are missing or do not match the documents.
    """

    def __init__(
        self,
        path_documents,
        path_labels=None,
        token2ind=None,
        max_len=512,
        task="language_modeling",
    ):
        self.task = task
        self.max_len = max_len
        # A None default avoids sharing one mutable dict across instances.
        self.token2ind = {} if token2ind is None else token2ind
        with open(path_documents, "r", encoding="utf-8") as f1:
            self.documents = [line.strip() for line in f1]
        self.labels = []
        if task == "classification":
            if path_labels is None:
                raise ValueError("path_labels is required for the classification task")
            with open(path_labels, "r", encoding="utf-8") as f1:
                self.labels = [int(line.strip()) for line in f1]
            if len(self.labels) != len(self.documents):
                raise ValueError(
                    "got {} labels for {} documents".format(
                        len(self.labels), len(self.documents)
                    )
                )

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, index):
        """Return ``{"source_sequence": LongTensor, "target": LongTensor}``.

        The source is "<sos>" followed by the (truncated) document. For
        language modeling the target is the source shifted left by one with
        "<eos>" appended; for classification it is the one-element label.
        """
        # One slot of max_len is reserved for the leading "<sos>".
        sequence = self.documents[index].split()[: self.max_len - 1]
        oov_index = self.token2ind["<oov>"]
        # Use the vocabulary given to this dataset (not a notebook global) and
        # prepend "<sos>", matching how sequences are built at inference time.
        source_sequence = [self.token2ind["<sos>"]]
        source_sequence.extend(
            self.token2ind.get(token, oov_index) for token in sequence
        )
        if self.task == "language_modeling":
            target = source_sequence[1:]
            target.append(self.token2ind["<eos>"])
        elif self.task == "classification":
            target = [self.labels[index]]
        else:
            raise ValueError("unknown task: {!r}".format(self.task))
        sample = {
            "source_sequence": torch.tensor(source_sequence),
            "target": torch.tensor(target),
        }
        return sample


def MyCollator(batch):
    """Collate samples into padded ``(source_sequences, flat_target)`` tensors.

    Sources and targets are each padded with the "<pad>" index of the
    notebook-level ``token2ind`` so that all sequences of the batch share one
    length; the padded targets are then flattened to one dimension.
    """
    pad_index = token2ind["<pad>"]
    sources = []
    targets = []
    for sample in batch:
        sources.append(sample["source_sequence"])
        targets.append(sample["target"])
    padded_sources = pad_sequence(sources, padding_value=pad_index)
    padded_targets = pad_sequence(targets, padding_value=pad_index)
    return padded_sources, padded_targets.reshape(-1)


def get_loader(
    path_documents,
    path_labels=None,
    token2ind=None,
    max_len=512,
    batch_size=32,
    task="language_modeling",
):
    """Build a shuffled ``DataLoader`` over the documents of one file.

    Args:
        path_documents: path of the tokenized documents, one per line.
        path_labels: path of the labels (classification only).
        token2ind: mapping from token to index.
        max_len: maximum source length, forwarded to the dataset.
        batch_size: number of samples per batch.
        task: "language_modeling" or "classification".

    Returns:
        A DataLoader yielding the ``(source_sequences, target)`` pairs built
        by ``MyCollator``. The last incomplete batch is dropped.
    """
    dataset = Dataset(
        path_documents,
        path_labels=path_labels,
        # A None default avoids a shared mutable default argument.
        token2ind={} if token2ind is None else token2ind,
        # Forward the caller's value; it used to be hard-coded to 512.
        max_len=max_len,
        task=task,
    )
    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=MyCollator,
        pin_memory=True,
        drop_last=True,
    )
    return data_loader

## The Training

In [None]:
from tqdm import tqdm

In [None]:
def train(
    path_data_train,
    path_labels_train=None,
    path_data_valid=None,
    save_interval=-1,
    log_interval=5,
    task="language_modeling",
    batch_size=32,
):
    """Train the notebook-level ``model`` for one pass over the training data.

    The function relies on names defined in other cells: ``model``,
    ``optimizer``, ``criterion``, ``device``, ``token2ind`` and ``epoch`` (the
    latter only appears in the progress line).

    Args:
        path_data_train: path of the training documents.
        path_labels_train: path of the training labels (classification only).
        path_data_valid: unused; kept for interface compatibility.
        save_interval: unused; kept for interface compatibility.
        log_interval: print and record the mean loss every this many steps.
        task: "language_modeling" or "classification".
        batch_size: number of samples per batch.

    Returns:
        The list of mean losses recorded at each logging step.
    """
    model.train()
    data_loader = get_loader(
        path_data_train,
        path_labels_train,
        token2ind,
        task=task,
        batch_size=batch_size,
    )
    losses = []
    total_loss = 0.0
    steps_since_log = 0

    for idx, (source, target) in tqdm(enumerate(data_loader), total=len(data_loader)):
        optimizer.zero_grad()
        source = source.to(device)
        target = target.to(device)
        src_mask = model.base.generate_square_subsequent_mask(source.size(0)).to(
            device
        )
        output = model(source, src_mask)
        if task == "classification":
            # One prediction per sequence: keep only the scores of the last
            # time step, so the output has one row per target label.
            # NOTE(review): for sequences shorter than the longest one in the
            # batch this last step is a padding position — confirm this is
            # the intended pooling.
            output = output[-1]
        output = output.reshape(-1, output.shape[-1])
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # prevent exploding gradient
        optimizer.step()
        total_loss += loss.item()
        steps_since_log += 1
        if idx % log_interval == 0 and idx > 0:
            # Average over the steps actually accumulated: the first window
            # holds log_interval + 1 steps because logging starts at idx > 0.
            cur_loss = total_loss / steps_since_log
            print(
                "| epoch {:3d} | {:5d}/{:5d} steps | "
                "loss {:5.5f} | ppl {:8.3f}".format(
                    epoch, idx, len(data_loader), cur_loss, math.exp(cur_loss),
                )
            )
            losses.append(cur_loss)
            total_loss = 0.0
            steps_since_log = 0
    return losses

In [None]:
# Hyper-parameters of the language model.
dropout = 0  # the dropout value
nhead = 2  # the number of heads in the multiheadattention models
nlayers = 4  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
ntokens = len(token2ind)  # the size of vocabulary

nclasses = 2  # for classification task only

# For language modeling the head predicts the next token, so its output
# dimension is the vocabulary size (nclasses = ntokens).
model = Model(
    ntoken=ntokens,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    nclasses=ntokens,
    dropout=dropout,
)
model = model.to(device)

In [None]:
# Optimization parameters.

lr = 0.0003  # learning rate
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=token2ind["<pad>"])
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/pretraining_subset.txt
# Local name of the pretraining corpus fetched by the wget command above.
path_data_train = "pretraining_subset.txt"

--2022-11-04 17:35:08--  https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/pretraining_subset.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10146460 (9.7M) [text/plain]
Saving to: ‘pretraining_subset.txt.1’


2022-11-04 17:35:08 (229 MB/s) - ‘pretraining_subset.txt.1’ saved [10146460/10146460]



In [None]:
# Vocabulary size, special tokens included.
len(token2ind)

50001

In [None]:
#pretraining on a tiny subset
# `epoch` is deliberately a module-level loop variable: train() reads it when
# formatting its progress line. The list of losses returned by train() is
# discarded here.
log_interval = 500
epochs = 2
for epoch in range(1, epochs + 1): #5
    train(
        path_data_train,
        save_interval=-1,
        task='language_modeling', # fill me
        batch_size=16,
        log_interval=log_interval,
    )

 16%|█▌        | 504/3125 [00:26<02:22, 18.35it/s]

| epoch   1 |   500/ 3125 steps | loss 7.58990 | ppl 1978.122


 32%|███▏      | 1003/3125 [00:53<01:40, 21.19it/s]

| epoch   1 |  1000/ 3125 steps | loss 6.84134 | ppl  935.744


 48%|████▊     | 1503/3125 [01:17<01:17, 21.05it/s]

| epoch   1 |  1500/ 3125 steps | loss 6.57576 | ppl  717.488


 64%|██████▍   | 2003/3125 [01:42<00:54, 20.63it/s]

| epoch   1 |  2000/ 3125 steps | loss 6.39134 | ppl  596.657


 80%|████████  | 2505/3125 [02:07<00:29, 21.32it/s]

| epoch   1 |  2500/ 3125 steps | loss 6.22672 | ppl  506.093


 96%|█████████▌| 3004/3125 [02:31<00:06, 20.09it/s]

| epoch   1 |  3000/ 3125 steps | loss 6.12084 | ppl  455.247


100%|██████████| 3125/3125 [02:37<00:00, 19.78it/s]
 16%|█▌        | 503/3125 [00:25<02:52, 15.21it/s]

| epoch   2 |   500/ 3125 steps | loss 5.86362 | ppl  351.998


 32%|███▏      | 1003/3125 [00:52<01:42, 20.73it/s]

| epoch   2 |  1000/ 3125 steps | loss 5.78315 | ppl  324.781


 48%|████▊     | 1503/3125 [01:17<01:30, 17.85it/s]

| epoch   2 |  1500/ 3125 steps | loss 5.76092 | ppl  317.642


 64%|██████▍   | 2003/3125 [01:44<00:50, 22.26it/s]

| epoch   2 |  2000/ 3125 steps | loss 5.70088 | ppl  299.132


 80%|████████  | 2502/3125 [02:07<00:29, 21.05it/s]

| epoch   2 |  2500/ 3125 steps | loss 5.67662 | ppl  291.961


 96%|█████████▌| 3004/3125 [02:33<00:05, 21.95it/s]

| epoch   2 |  3000/ 3125 steps | loss 5.64476 | ppl  282.805


100%|██████████| 3125/3125 [02:39<00:00, 19.54it/s]


In [None]:
# Show the module tree of the model.
print(model)

Model(
  (base): TransformerModel(
    (encoder): Embedding(50001, 200)
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
          )
          (linear1): Linear(in_features=200, out_features=200, bias=True)
          (dropout): Dropout(p=0.5, inplace=False)
          (linear2): Linear(in_features=200, out_features=200, bias=True)
          (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.5, inplace=False)
          (dropout2): Dropout(p=0.5, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): No

In [None]:
# Number of weights in the embedding matrix: vocabulary size x hidden dimension.
50001 * 200

10000200

In [None]:
# List every parameter tensor with its shape and whether it is trainable.
for param_name, weights in model.named_parameters():
  print(param_name, weights.size(), weights.requires_grad)

base.encoder.weight torch.Size([50001, 200]) True
base.transformer_encoder.layers.0.self_attn.in_proj_weight torch.Size([600, 200]) True
base.transformer_encoder.layers.0.self_attn.in_proj_bias torch.Size([600]) True
base.transformer_encoder.layers.0.self_attn.out_proj.weight torch.Size([200, 200]) True
base.transformer_encoder.layers.0.self_attn.out_proj.bias torch.Size([200]) True
base.transformer_encoder.layers.0.linear1.weight torch.Size([200, 200]) True
base.transformer_encoder.layers.0.linear1.bias torch.Size([200]) True
base.transformer_encoder.layers.0.linear2.weight torch.Size([200, 200]) True
base.transformer_encoder.layers.0.linear2.bias torch.Size([200]) True
base.transformer_encoder.layers.0.norm1.weight torch.Size([200]) True
base.transformer_encoder.layers.0.norm1.bias torch.Size([200]) True
base.transformer_encoder.layers.0.norm2.weight torch.Size([200]) True
base.transformer_encoder.layers.0.norm2.bias torch.Size([200]) True
base.transformer_encoder.layers.1.self_attn.

## Text Generation


In [None]:
!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/pretrained_model_4layers.pt

model = Model(ntokens, nhead, nhid, nlayers, ntokens).to(device)

# Load the checkpoint. map_location places the tensors on the current device,
# so a checkpoint saved on a GPU can also be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load('pretrained_model_4layers.pt', map_location=device)
# Load the state dict (the returned report is displayed as the cell output).
model.load_state_dict(checkpoint['model_state_dict'])

--2022-11-04 17:40:26--  https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/pretrained_model_4layers.pt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88093955 (84M) [application/octet-stream]
Saving to: ‘pretrained_model_4layers.pt.1’


2022-11-04 17:40:31 (340 MB/s) - ‘pretrained_model_4layers.pt.1’ saved [88093955/88093955]



<All keys matched successfully>

In [None]:
!pip install sentencepiece   # uncomment this if you are using google colab

!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/sentencepiece.french.model

import sentencepiece as spm

# `s` is the tokenizer used by the text generation helpers below.
s = spm.SentencePieceProcessor(model_file='sentencepiece.french.model') #load sentencepiece model

#examples
# Round trip: split a sentence into pieces, then join the pieces back into text.
encoded = s.encode_as_pieces("Bonjour les amis!")
decoded = s.decode_pieces(encoded)
print(encoded)
print(decoded)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2022-11-04 17:40:34--  https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/sentencepiece.french.model
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115362 (1.1M) [application/octet-stream]
Saving to: ‘sentencepiece.french.model.1’


2022-11-04 17:40:34 (57.9 MB/s) - ‘sentencepiece.french.model.1’ saved [1115362/1115362]

['▁Bonjour', '▁les', '▁amis', '!']
Bonjour les amis!


In [None]:
def infer_next_token(sent):
    """Predict the token that follows ``sent`` with the notebook-level ``model``.

    Args:
        sent: the input text; it is split into pieces with the tokenizer ``s``.

    Returns:
        A pair ``(next_token_ind, out)``: the index of the highest scoring
        token at the last position, and the full model output for the
        single-sequence batch.
    """
    model.eval()
    sent_pieces = s.encode_as_pieces(sent)
    # Pieces missing from the vocabulary map to "<oov>" instead of raising.
    oov_index = token2ind['<oov>']
    source = [token2ind['<sos>']] + [token2ind.get(el, oov_index) for el in sent_pieces]
    source = torch.tensor(source).to(device)
    source = source.reshape(-1, 1)  # [seq_len, batch_size=1]
    src_mask = model.base.generate_square_subsequent_mask(source.size(0)).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        out = model(source, src_mask)
    # Greedy choice: best score at the last position of the only sequence.
    next_token_ind = out[-1, 0].argmax().item()
    return next_token_ind, out


def infer_next_tokens(sent, max_len=50):
    """Greedily extend ``sent`` one token at a time.

    Generation stops when "<eos>" is predicted or after ``max_len`` tokens.

    Args:
        sent: the text to continue.
        max_len: the maximum number of tokens to generate.

    Returns:
        The input text followed by the generated continuation.
    """
    tokens = s.encode_as_pieces(sent)
    for _ in range(max_len):
        # Feed the whole sentence generated so far, not only the last token.
        next_token_ind, _ = infer_next_token(s.decode_pieces(tokens))
        next_token = ind2token[next_token_ind]
        if next_token == '<eos>':
            break
        tokens.append(next_token)
    return s.decode_pieces(tokens)

SyntaxError: ignored

In [None]:
# Prompt to be continued by the language model.
sent = "Bonjour les"
infer_next_tokens(sent)

### Supervised task

In [None]:
!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/train.review.spm
!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/train.label
!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/test.review.spm
!wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/test.label

# Training split: tokenized reviews and their labels.
path_data_train = "train.review.spm"
path_labels_train = "train.label"

# The test split is used as the validation set in the cells below.
path_data_valid = "test.review.spm"
path_labels_valid = "test.label"

In [None]:
# a function to evaluate the validation accuracy of the model.
def evaluate_accuracy(data_loader):
    """Return the classification accuracy of the notebook-level ``model``.

    Args:
        data_loader: an iterable of ``(source_sequences, target)`` batches, with
            sources of shape [seq_len, batch_size] and one label per sequence.

    Returns:
        The fraction of correctly classified samples, or 0.0 when the loader
        yields no sample.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for source, target in data_loader:
            source = source.to(device)
            target = target.to(device)
            src_mask = model.base.generate_square_subsequent_mask(
                source.size(0)
            ).to(device)
            output = model(source, src_mask)
            # One prediction per sequence, read from the last time step.
            predictions = output[-1].argmax(dim=-1)
            correct += (predictions == target).sum().item()
            total += target.size(0)
    return correct / total if total else 0.0

In [None]:
#save the base model to be loaded later in the fine-tuning phase
# Only the weights of model.base are stored; the classification head is left out.
torch.save({"model_state_dict": model.base.state_dict(),}, "pretrained_model_4layers_no_class_head.pt")

In [None]:
# Train the classifier twice (from scratch, then from the pretrained base)
# and record the validation accuracy after every epoch of each run.
from_scratch_settings = [True, False]

from_scratch_valid_acc = []
pretrained_valid_acc = []
lr = 0.0001

# The validation loader does not depend on the run, so it is built only once.
valid_loader = get_loader(
    path_data_valid,
    path_labels_valid,
    token2ind=token2ind,
    batch_size=20,
    task='classification',
)

for from_scratch in from_scratch_settings:
    # model, criterion, optimizer and epoch are module-level names read by train().
    model = Model(ntokens, nhead, nhid, nlayers, 2, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    if not from_scratch:
        print("=====PRETRAINED MODEL======")
        # Load the checkpoint; map_location keeps this working on a machine
        # without the device the checkpoint was saved from.
        checkpoint = torch.load(
            "pretrained_model_4layers_no_class_head.pt", map_location=device
        )
        # Only the base is restored; the classification head stays freshly initialised.
        model.base.load_state_dict(checkpoint['model_state_dict'])
    else:
        print("=====Trainig FROM SCRATCH======")
    accuracies = from_scratch_valid_acc if from_scratch else pretrained_valid_acc
    epochs = 15
    for epoch in range(1, epochs + 1):
        train(
            path_data_train,
            path_labels_train,
            save_interval=-1,
            task='classification',
            batch_size=8,
            log_interval=50,
        )
        accuracies.append(evaluate_accuracy(valid_loader))
    print()

In [None]:
#Visualize the accuracy