# Lab 01 : Language Model with Transformers -- solution

### Task   

The goal is to learn to predict the next word from an input sequence with a language model transformer.

The dataset is PTB.



In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

### GPU

It is recommended to run this code on GPU:<br> 
* Time for 1 epoch on GPU : 48 sec w/ Google Colab Tesla P100-PCIE-16GB <br>

In [2]:
# check if GPU or mps is available

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


### Download Penn Tree Bank

The tensor train_data consists of 20 columns of 46,479 words.<br>
The tensor test_data consists of 20 columns of 4,121 words.

In [4]:
data_path

'../../data/'

In [3]:
from utils import check_ptb_dataset_exists
data_path=check_ptb_dataset_exists()

train_data  =  torch.load(data_path+'ptb/train_data.pt')
test_data   =  torch.load(data_path+'ptb/test_data.pt')

print(  train_data.size()  )
print(  test_data.size()   )

torch.Size([46479, 20])
torch.Size([4121, 20])


In [6]:
! pwd

/Users/atoukoffikougbanhoun/Desktop/WorkSpace/LLM/base_model/base/lab01_language_model


### Some constants associated with the data set

In [5]:
bs = 20
vocab_size = 10000

### Make an attention net class

In [6]:

def generate_positional_encoding(seq_length, dim):
    assert dim == 2* (dim//2) # check if dim is divisible by 2
    pe = torch.zeros(seq_length, dim)
    position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / dim))
    pe[:,0::2] = torch.sin(position * div_term)
    pe[:,1::2] = torch.cos(position * div_term)
    return pe        
   
class AttentionHead(nn.Module):
    def __init__(self, d, d_head, dropout):
        super().__init__()
        self.LN_MHA = nn.LayerNorm(d_head)
        self.LN_MLP = nn.LayerNorm(d_head)
        self.query = nn.Linear(d, d_head, bias=False) # query embedding layer
        self.key = nn.Linear(d, d_head, bias=False) # key embedding layer
        self.value = nn.Linear(d, d_head) # value embedding layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H): # size(H)=[batch_size, seq_length, d]
        batch_size = H.size(0); batch_len = H.size(1)
        H_in = H # Add residual connection (RC)
        # Compute a single attention head H = Softmax( QK^T / d^0.5 ) V
        Q = self.query(H) # size=[batch_size, batch_length, d]        
        K = self.key(H) # size=[batch_size, batch_length, d]
        V = self.value(H) # size=[batch_size, batch_length, d]
        attention_score = Q @ K.transpose(2,1) * Q.size(2)**-0.5 # QK^T/sqrt(d), (B,L,d) @ (B,d,L) => (B,L,L), size=[batch_size, batch_length, batch_length)
        mask = torch.tril(torch.ones(batch_len,batch_len)).long().to(device) # mask to use previous tokens only : { token(<=t) }, size=[batch_len,batch_len]
        attention_score = attention_score.masked_fill(mask==0, value=float('-inf')) # softmax(-inf)=0 prevents using next tokens for prediction, size=(batch_size, batch_len, batch_len)
        attention_score = torch.softmax(attention_score, dim=2) # sum weights = 1, size=[batch_size, batch_length, batch_len)
        attention_score = self.dropout(attention_score) # dropout attention scores
        H_HA = attention_score @ V # softmax( QK^T / sqrt(d) ) V, (B,L,L) @ (B,L,d) => (B,L,d), size=[batch_size, batch_length, d)
        return H_HA # return prediction scores for next token

class MultipleAttentionHead(nn.Module):
    def __init__(self, d, num_heads, dropout):
        super().__init__()
        d_head = d // num_heads # dim_head = d // num_heads, usually dimension per head is 64
        assert d == d_head * num_heads # check divisibility
        self.MHA = nn.ModuleList([ AttentionHead(d, d_head, dropout) for _ in range(num_heads) ])
        self.WO = nn.Linear(d, d) # combination layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H): # size(H)=[batch_size, seq_length, d]
        batch_size = H.size(0); seq_length = H.size(1)
        H_heads = []
        for HA_layer in self.MHA:
            H_heads.append(HA_layer(H)) # size=[batch_size, seq_length, d_head]
        H_heads = torch.cat(H_heads, dim=2) # size=[batch_size, seq_length, d]            
        H_heads = self.dropout(H_heads) # dropout attention activations
        H = self.WO(H_heads) # size=[batch_size, seq_length, d]
        return H
        
class TransformerBlock(nn.Module):
    def __init__(self, d, num_heads, dropout):
        super().__init__()
        self.LN_MHA = nn.LayerNorm(d)
        self.LN_MLP = nn.LayerNorm(d)
        self.MHA = MultipleAttentionHead(d, num_heads, dropout)
        self.MLP = nn.Sequential(nn.Linear(d,4*d), nn.ReLU(), nn.Dropout(dropout), nn.Linear(4*d,d))        
    def forward(self, H): # size=[batch_size, seq_length, d]
        # Multiple Attention Heads w/ layer normalization (LN), residual connection (RC)
        H = H + self.MHA(self.LN_MHA(H)) # size=[batch_size, seq_length, d]
        # MLP w/ layer normalization (LN), residual connection (RC)
        H = H + self.MLP(self.LN_MLP(H)) # size=[batch_size, seq_length, d]
        return H # size=[batch_size, seq_length, d]
        
        
class Transformer_decoder(nn.Module):
    def __init__(self, d, num_heads, num_blocks, seq_length, dropout):
        super().__init__()
        self.TR_Blocks = nn.ModuleList([ TransformerBlock(d, num_heads, dropout) for _ in range(num_blocks) ]) 
    def forward(self, batch_seq, pos_enc):
        H = batch_seq.transpose(1,0) # size=[batch_size, seq_length, d]
        batch_size = H.size(0); batch_len = H.size(1)
        # Add positional encoding  
        pos_enc = pos_enc.unsqueeze(dim=0) # size=[1,          seq_length, d]
        H = H + pos_enc                    # size=[batch_size, seq_length, d]
        # Apply transformer blocks 
        for TR_Block in self.TR_Blocks:
            H = TR_Block(H)
        # Output
        H = H.permute(1,0,2)  # size=[batch_length, batch_size, d]
        return H # return prediction scores for next token


class ANN(nn.Module):
    
    def __init__(self, d, num_heads, num_blocks, seq_length, dropout):
        super(ANN, self).__init__()
        self.decoder = Transformer_decoder(d, num_heads, num_blocks, seq_length, dropout)
    
    def forward(self, g_seq , pos ):
        h_dec_seq = self.decoder( g_seq , pos )
        return h_dec_seq 
    

class attention_net(nn.Module):

    def __init__(self, d, num_heads, num_blocks, seq_length, dropout):
        super(attention_net, self).__init__()  
        self.layer1 = nn.Embedding( vocab_size  , hidden_size  )
        self.layer2 = ANN(d, num_heads, num_blocks, seq_length, dropout)
        self.layer3 = nn.Linear(    hidden_size , vocab_size   )

    def forward(self, word_seq, pos ):
        g_seq     =   self.layer1( word_seq ) # size=(seq_length, bs, hidden_dim) 
        h_seq     =   self.layer2( g_seq , pos ) # size=(seq_length, bs, hidden_dim) 
        score_seq =   self.layer3( h_seq ) # size=(seq_length, bs, vocab_size)
        return score_seq 


### Function to evaluate the network on the test set

In [7]:
def eval_on_test_set():

    net.eval()

    running_loss=0
    num_batches=0    
       
    for count in range( 0 , 4120-seq_length ,  seq_length) :
               
        minibatch_data =  test_data[ count   : count+seq_length   ]
        minibatch_label = test_data[ count+1 : count+seq_length+1 ]
        pos = generate_positional_encoding(seq_length, hidden_size)
        
        minibatch_data = minibatch_data.to(device)
        minibatch_label = minibatch_label.to(device)
        pos = pos.to(device)

        scores = net( minibatch_data, pos )
        
        minibatch_label = minibatch_label.view(  bs*seq_length ) 
        scores = scores.view(  bs*seq_length , vocab_size)
        
        loss = criterion(scores, minibatch_label) 
        
        running_loss += loss.item()
        num_batches += 1        
    
    total_loss = running_loss/num_batches 
    print('test: exp(loss) = ', math.exp(total_loss)  )


### Build the net. Choose the hidden size to be 128, the number of heads to be 4, and the number of blocks 2. 
### How many parameters in total?

In [8]:
hidden_size = 128 
num_heads = 4
num_blocks = 2
dropout = 0.95
seq_length = 100

net = attention_net(hidden_size, num_heads, num_blocks, seq_length, dropout)
print(net)
utils.display_num_param(net)

attention_net(
  (layer1): Embedding(10000, 128)
  (layer2): ANN(
    (decoder): Transformer_decoder(
      (TR_Blocks): ModuleList(
        (0-1): 2 x TransformerBlock(
          (LN_MHA): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (LN_MLP): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (MHA): MultipleAttentionHead(
            (MHA): ModuleList(
              (0-3): 4 x AttentionHead(
                (LN_MHA): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
                (LN_MLP): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
                (query): Linear(in_features=128, out_features=32, bias=False)
                (key): Linear(in_features=128, out_features=32, bias=False)
                (value): Linear(in_features=128, out_features=32, bias=True)
                (dropout): Dropout(p=0.95, inplace=False)
              )
            )
            (WO): Linear(in_features=128, out_features=128, bias=True)
            (dropout): 

### Send the network to the GPU

In [9]:
net = net.to(device)

### Choose the loss to be the cross-entropy and the optimizer to be Adam, as well as the following important hyperparameters: 
* initial learning rate = 0.001
* sequence length = 30

In [10]:
criterion = nn.CrossEntropyLoss()

my_lr = 0.001
optimizer = torch.optim.Adam(net.parameters(), lr=my_lr)

pos = generate_positional_encoding(seq_length, hidden_size) # size=(seq_length, hidden_dim)

### Do 5 passes through the training set
### Observe the train perplexity and the test perplexity

In [13]:

epochs = 10
start=time.time()
for epoch in range(epochs):

    # divide the learning rate by 3 except after the first epoch
    if epoch >= 2:
        optimizer.param_groups[0]['lr'] /= 1.1 
        my_lr = optimizer.param_groups[0]['lr']
    
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
    for count in range( 0 , 46478-seq_length ,  seq_length):
        
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch and the positional encoding
        minibatch_data = train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]    
        pos = generate_positional_encoding(seq_length, hidden_size) # size=(seq_length, hidden_dim) 
        
        # send them to the gpu
        minibatch_data = minibatch_data.to(device)
        minibatch_label = minibatch_label.to(device)
        pos = pos.to(device)
        
        # forward the minibatch through the net        
        scores = net( minibatch_data, pos ) # size=(seq_length, bs, vocab_size)

        # reshape the scores and labels to huge batch of size bs*seq_length
        scores = scores.view(  bs*seq_length , vocab_size) # size=(seq_length/2.bs, vocab_size)
        minibatch_label = minibatch_label.view(  bs*seq_length ) # size=(seq_length/2.bs, vocab_size)
       
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(scores, minibatch_label)
        
        # backward pass to compute dL/dR, dL/dV and dL/dW
        loss.backward()

        # do one step of stochastic gradient descent: R=R-lr(dL/dR), V=V-lr(dL/dV), ...
        optimizer.step()
        
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set() 



epoch= 0 	 time= 12.252074956893921 	 lr= 0.00021762913579014855 	 exp(loss)= 47.75411980533963
test: exp(loss) =  198.86872910301688

epoch= 1 	 time= 24.949244022369385 	 lr= 0.00021762913579014855 	 exp(loss)= 46.808071795932634
test: exp(loss) =  201.7638837078004

epoch= 2 	 time= 37.64756107330322 	 lr= 0.00019784466890013504 	 exp(loss)= 45.74088648725345
test: exp(loss) =  204.05088999030417

epoch= 3 	 time= 50.35393691062927 	 lr= 0.00017985878990921367 	 exp(loss)= 44.77842558755509
test: exp(loss) =  205.866611376765

epoch= 4 	 time= 63.071797132492065 	 lr= 0.00016350799082655786 	 exp(loss)= 43.917690402149
test: exp(loss) =  207.79286331998063

epoch= 5 	 time= 75.7859559059143 	 lr= 0.0001486436280241435 	 exp(loss)= 43.14814033926362
test: exp(loss) =  209.65413923664576

epoch= 6 	 time= 88.49476408958435 	 lr= 0.00013513057093103952 	 exp(loss)= 42.46079378528039
test: exp(loss) =  211.39795050339245

epoch= 7 	 time= 101.21930289268494 	 lr= 0.0001228459735736723 

### Choose one sentence (taken from the test set)

In [19]:
sentence1 = "some analysts expect oil prices to remain relatively"

sentence2 = "over the next days and weeks they say investors should look for stocks to"

sentence3 = "prices averaging roughly $ N a barrel higher in the third"

sentence4 = "i think my line has been very consistent mrs. hills said at a news"

sentence5 = "this appears particularly true at gm which had strong sales in"

# or make your own sentence.  No capital letter or punctuation allowed. Each word must be in the allowed vocabulary.
sentence6 = "be your self,the world"

# SELECT THE SENTENCE HERE
mysentence = sentence6

### Display the the network prediction for the next word

In [20]:
minibatch_data = utils.sentence2vector(mysentence)
minibatch_data = torch.cat((minibatch_data, minibatch_data), dim=0) # copy-paste the test sequence to use the same attention window size for each word
pos = generate_positional_encoding(minibatch_data.size(0), hidden_size) 

minibatch_data = minibatch_data.to(device)
pos = pos.to(device)   

net.eval()
scores = net( minibatch_data, pos )
scores = scores[-1,:] # select the last score vector for the prediction of the next word from the input sequence
scores = scores[0].unsqueeze(0).unsqueeze(0)

print(mysentence, '... \n')
utils.show_next_word(scores)


You entered a word which is not in the vocabulary.
Make sure that you do not have any capital letters
be your self,the world ... 

37.0%	 <eos>
20.9%	 war
8.2%	 's
6.1%	 series
5.4%	 with
1.9%	 <unk>
1.6%	 and
1.4%	 of
1.2%	 bank
0.8%	 at
0.8%	 for
0.7%	 the
0.7%	 capital
0.7%	 as
0.6%	 by
0.6%	 said
0.5%	 in
0.4%	 new
0.4%	 according
0.4%	 a
0.4%	 private
0.3%	 central
0.3%	 to
0.3%	 that
0.3%	 but
0.2%	 based
0.2%	 quite
0.2%	 mortgage
0.2%	 after
0.2%	 announced
