In [None]:
# Shell escape: print the GPU model, driver and CUDA version visible to this runtime.
!nvidia-smi

Fri Mar 24 12:15:00 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so the files stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Copy the BERT folder from the mounted Drive into the working directory;
# later cells read ./BERT/training.txt, ./BERT/small_training.txt and ./BERT/vocab.txt.
!cp -r "/content/gdrive/My Drive/EVA8_S11_Course_Docs/BERT" "."

In [None]:
from torch.utils.data import Dataset
import torch.nn.functional as F
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn
import numpy as np
import random
import torch
import math
import re

In [None]:
# =============================================================================
# Transformer
# =============================================================================
def attention(q, k, v, mask = None, dropout = None):
    """Scaled dot-product attention.

    Args:
        q: queries, shape (..., len_q, d).
        k: keys, shape (..., len_k, d).
        v: values, shape (..., len_k, d_v).
        mask: optional tensor broadcastable to the (..., len_q, len_k) score
            tensor; positions where ``mask == 0`` are filled with -1e3 before
            the softmax so they receive (almost) no weight.
        dropout: optional callable applied to the attention weights.

    Returns:
        Tensor of shape (..., len_q, d_v): the attention-weighted sum of ``v``.
    """
    #similarity of every query with every key, scaled by sqrt(d)
    scale = math.sqrt(q.shape[-1])
    weights = q.matmul(k.transpose(-2, -1)) / scale

    #mask
    if mask is not None:
        weights = weights.masked_fill(mask == 0, -1e3)

    #normalise over the key axis
    weights = F.softmax(weights, dim = -1)
    if dropout is not None:
        weights = dropout(weights)

    return weights.matmul(v)

class MultiHeadAttention(nn.Module):
    """Multi-head attention with a single fused q/k/v projection.

    Queries are always projected from ``x``. Keys and values are projected
    from ``y`` when it is given (cross-attention) and from ``x`` otherwise
    (self-attention).

    Args:
        n_heads: number of attention heads.
        out_dim: model width; must be divisible by ``n_heads``.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``out_dim`` is not a multiple of ``n_heads``.
    """
    def __init__(self, n_heads, out_dim, dropout=0.1):
        super().__init__()

        if n_heads <= 0 or out_dim % n_heads != 0:
            #otherwise split_heads() would silently mis-shape (or crash on) the projections
            raise ValueError(f'out_dim ({out_dim}) must be divisible by n_heads ({n_heads})')

        #fused projection: rows [0, out_dim) -> q, [out_dim, 2*out_dim) -> k, the rest -> v
        self.linear = nn.Linear(out_dim, out_dim*3)

        self.n_heads = n_heads
        self.out_dim = out_dim
        self.out_dim_per_head = out_dim // n_heads
        self.out = nn.Linear(out_dim, out_dim)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, t):
        """Reshape (BS, SEQ_LEN, out_dim) into (BS, SEQ_LEN, HEAD, out_dim_per_head)."""
        return t.reshape(t.shape[0], -1, self.n_heads, self.out_dim_per_head)

    def forward(self, x, y=None, mask=None):
        """Attend from ``x`` to ``y`` (or to ``x`` itself when ``y`` is None).

        Args:
            x: query source, BS * SEQ_LEN * out_dim.
            y: optional key/value source, BS * SEQ_LEN_Y * out_dim.
            mask: optional mask forwarded to ``attention``; it must broadcast
                against the BS * HEAD * SEQ_LEN * SEQ_LEN_Y score tensor and
                positions equal to 0 are suppressed.

        Returns:
            Tensor of shape BS * SEQ_LEN * out_dim.
        """
        d = self.out_dim

        if y is None or y is x:
            #self-attention: one fused matmul, then cut into q / k / v
            q, k, v = self.linear(x).split(d, dim=-1)
        else:
            #cross-attention: q from x, k and v from y, using the matching
            #slices of the fused projection so no extra parameters are needed
            w, b = self.linear.weight, self.linear.bias
            q = F.linear(x, w[:d], b[:d])
            k, v = F.linear(y, w[d:], b[d:]).split(d, dim=-1)

        #break into n_heads: BS * HEAD * SEQ_LEN * EMBED_SIZE_P_HEAD
        q, k, v = [self.split_heads(t).transpose(1, 2) for t in (q, k, v)]

        #n_heads => attention => merge the heads => mix information
        ctx = attention(q, k, v, mask, self.dropout) # BS * HEAD * SEQ_LEN * EMBED_SIZE_P_HEAD
        ctx = ctx.transpose(1, 2).contiguous().view(ctx.shape[0], -1, d) # BS * SEQ_LEN * EMBED_SIZE_L
        return self.out(ctx)  # BS * SEQ_LEN * EMBED_SIZE

class FeedForward(nn.Module):
    """Position-wise MLP: expand to ``inner_dim``, ReLU, dropout, project back.

    Args:
        inp_dim: size of the last dimension of the input and of the output.
        inner_dim: width of the hidden layer.
        dropout: dropout probability applied after the ReLU.
    """
    def __init__(self, inp_dim, inner_dim, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(inp_dim, inner_dim)
        self.linear2 = nn.Linear(inner_dim, inp_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``; the shape is preserved."""
        hidden = self.linear1(x)      #inp => inner
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)   #inner => inp

class EncoderLayer(nn.Module):
    """One encoder block: LayerNorm -> self-attention and LayerNorm -> feed-forward,
    each wrapped in dropout and added back to its input (residual connection).

    Args:
        n_heads: number of attention heads.
        inner_transformer_size: model width (last dimension of the input).
        inner_ff_size: hidden width of the feed-forward sub-layer.
        dropout: dropout probability used by both sub-layers and both residual branches.
    """
    def __init__(self, n_heads, inner_transformer_size, inner_ff_size, dropout=0.1):
        super().__init__()
        width = inner_transformer_size
        self.mha = MultiHeadAttention(n_heads, width, dropout)
        self.ff = FeedForward(width, inner_ff_size, dropout)
        self.norm1 = nn.LayerNorm(width)
        self.norm2 = nn.LayerNorm(width)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers on ``x``; ``mask`` is handed to the attention sub-layer."""
        #attention branch (normalisation is applied before the sub-layer)
        attn_out = self.mha(self.norm1(x), mask=mask)
        x = x + self.dropout1(attn_out)

        #feed-forward branch
        ff_out = self.ff(self.norm2(x))
        return x + self.dropout2(ff_out)

# Positional Embedding
class PositionalEmbedding(nn.Module):
    """Fixed (non-trainable) sinusoidal position table.

    Column ``j`` of the table holds ``sin(pos / 10000 ** (2 * j / d_model))``
    for even ``j`` and the matching ``cos`` for odd ``j``.

    NOTE: the exponent uses the column index ``j`` directly, so it is twice
    the exponent of the formulation in "Attention Is All You Need". It is kept
    as is so existing results do not change.

    Args:
        d_model: embedding width; odd values are supported.
        max_seq_len: number of positions to precompute.
    """
    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.d_model = d_model

        #the divisors do not depend on the position, so compute them once
        divisors = [10000 ** ((2 * j) / d_model) for j in range(d_model)]

        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for j, div in enumerate(divisors):
                #even columns use sin, odd columns use cos; iterating over the
                #real columns avoids writing past the end when d_model is odd
                wave = math.sin if j % 2 == 0 else math.cos
                pe[pos, j] = wave(pos / div)

        #buffer: moves with .to()/.cuda() and is saved in the state dict, but is not a parameter
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return the first ``x.size(1)`` rows of the table, shape (1, seq_len, d_model)."""
        return self.pe[:,:x.size(1)] #x.size(1) = seq_len

In [None]:
class Transformer(nn.Module):
    """Encoder-only transformer language model (token ids in, vocabulary logits out).

    Args:
        n_code: number of stacked encoder layers.
        n_heads: attention heads per layer.
        embed_size: model width.
        inner_ff_size: hidden width of each feed-forward sub-layer.
        n_embeddings: vocabulary size (rows of the embedding table and size of the output).
        seq_len: number of positions in the positional table.
        dropout: dropout probability used inside the encoder layers.
        saved_embed: optional pretrained embedding table (tensor, numpy array or
            nested list) of shape (n_embeddings, embed_size). When given, it is
            copied into ``self.embeddings.weight`` after initialisation.

    Raises:
        ValueError: if ``saved_embed`` does not have shape (n_embeddings, embed_size).
    """
    def __init__(self, n_code, n_heads, embed_size, inner_ff_size, n_embeddings, seq_len, dropout=.1, saved_embed = None):
        super().__init__()

        #model input
        self.embeddings = nn.Embedding(n_embeddings, embed_size)
        self.pe = PositionalEmbedding(embed_size, seq_len)

        #backbone
        self.encoders = nn.ModuleList(
            [EncoderLayer(n_heads, embed_size, inner_ff_size, dropout) for _ in range(n_code)]
        )

        #language model
        self.norm = nn.LayerNorm(embed_size)
        self.linear = nn.Linear(embed_size, n_embeddings, bias=False)

        #optional warm start of the token embeddings (done last so the random
        #initialisation order of the layers above is unchanged)
        if saved_embed is not None:
            pretrained = torch.as_tensor(saved_embed)
            if tuple(pretrained.shape) != (n_embeddings, embed_size):
                raise ValueError(
                    f'saved_embed has shape {tuple(pretrained.shape)}, '
                    f'expected {(n_embeddings, embed_size)}'
                )
            with torch.no_grad():
                self.embeddings.weight.copy_(pretrained)

    def forward(self, x, mask=None):
        """Compute vocabulary logits for every position.

        Args:
            x: integer token ids, BS * SEQ_LEN.
            mask: optional attention mask handed unchanged to every encoder layer.

        Returns:
            Logits of shape BS * SEQ_LEN * n_embeddings.
        """
        x = self.embeddings(x)
        x = x + self.pe(x)
        for encoder in self.encoders:
            x = encoder(x, mask=mask)
        x = self.norm(x)
        return self.linear(x)

In [None]:
# =============================================================================
# Dataset
# =============================================================================
class SentencesDataset(Dataset):
    """Fixed-length token sequences for the masked-word prediction task.

    Each item is built by concatenating tokenized sentences, starting at the
    requested index and wrapping around the corpus, until ``seq_len`` tokens
    are available. Every token is then independently replaced by ``<mask>``
    with probability ``p_random_mask``.

    Attributes:
        sentences: list of tokenized sentences (lists of words).
        vocab: word -> id mapping; the three special tags are appended after the given words.
        rvocab: id -> word mapping.
        seq_len: length of every returned sequence.
        IGNORE_IDX / OUT_OF_VOCAB_IDX / MASK_IDX: ids of the special tags.
    """
    #Init dataset
    def __init__(self, sentences, vocab, seq_len):
        self.sentences = sentences
        words = list(vocab) + ['<ignore>', '<oov>', '<mask>']
        self.vocab = {e:i for i, e in enumerate(words)}
        self.rvocab = {v:k for k,v in self.vocab.items()}
        self.seq_len = seq_len

        #special tags
        self.IGNORE_IDX = self.vocab['<ignore>'] #replacement tag for tokens to ignore
        self.OUT_OF_VOCAB_IDX = self.vocab['<oov>'] #replacement tag for unknown words
        self.MASK_IDX = self.vocab['<mask>'] #replacement tag for the masked word prediction task

    #fetch data
    def __getitem__(self, index, p_random_mask=0.15):
        """Return one training sample.

        Args:
            index: index of the first sentence to use.
            p_random_mask: per-token probability of being replaced by ``<mask>``.

        Returns:
            dict with two int64 tensors of length ``seq_len``:
            'input'  - token ids, with masked positions set to MASK_IDX;
            'target' - the original id at masked positions and IGNORE_IDX
                       everywhere else.
        """
        n_sentences = len(self)

        #while we don't have enough word to fill the sentence for a batch
        tokens = []
        empty_streak = 0
        #stop after a full pass that added nothing, so a corpus made only of
        #empty sentences cannot loop forever (the remainder is padded below)
        while len(tokens) < self.seq_len and empty_streak < n_sentences:
            ids = self.get_sentence_idx(index % n_sentences)
            empty_streak = 0 if ids else empty_streak + 1
            tokens.extend(ids)
            index += 1

        #ensure that the sequence is of length seq_len
        tokens = tokens[:self.seq_len]
        tokens.extend([self.IGNORE_IDX] * (self.seq_len - len(tokens))) #PAD ok

        #apply random mask: one independent draw per token
        inputs, targets = [], []
        for w in tokens:
            if random.random() < p_random_mask:
                inputs.append(self.MASK_IDX)
                targets.append(w)
            else:
                inputs.append(w)
                targets.append(self.IGNORE_IDX)

        #build integer tensors directly instead of going through float32
        return {'input': torch.tensor(inputs, dtype=torch.long),
                'target': torch.tensor(targets, dtype=torch.long)}

    #return length
    def __len__(self):
        return len(self.sentences)

    #get words id
    def get_sentence_idx(self, index):
        """Map the words of sentence ``index`` to ids, using OUT_OF_VOCAB_IDX for unknown words."""
        return [self.vocab.get(w, self.OUT_OF_VOCAB_IDX) for w in self.sentences[index]]

In [None]:
# =============================================================================
# Methods / Class
# =============================================================================
def get_batch(loader, loader_iter):
    """Fetch the next batch, restarting the iterator once it is exhausted.

    Args:
        loader: iterable that a fresh iterator can be created from.
        loader_iter: the iterator currently being consumed.

    Returns:
        (batch, loader_iter): the batch and the iterator to use for the next
        call (a new one if ``loader_iter`` had run out).
    """
    exhausted = object()  #private sentinel, cannot collide with a real batch
    batch = next(loader_iter, exhausted)
    if batch is exhausted:
        loader_iter = iter(loader)
        batch = next(loader_iter)
    return batch, loader_iter

In [None]:
# =============================================================================
# #Init
# =============================================================================
print('initializing..')

#data
batch_size = 1024
seq_len = 20
n_vocab = 40000          #keep at most this many distinct words
# n_workers = 12         #DataLoader workers (currently not used)

#model
embed_size = 128
inner_ff_size = 4 * embed_size
n_heads = 8
n_code = 8               #number of encoder layers
dropout = 0.1

#optimizer
optim_kwargs = dict(lr=1e-4, weight_decay=1e-4, betas=(.9, .999))

initializing..


In [None]:
# =============================================================================
# Input
# =============================================================================
#1) load text (one sentence per line, lower-cased)
print('loading text...')
pth = './BERT/training.txt'
# pth = './BERT/small_training.txt'
with open(pth) as f:  #context manager so the handle is closed right away
    sentences = f.read().lower().split('\n')

#2) tokenize sentences (can be done during training, you can also use spacy udpipe)
print('tokenizing sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
#compiled once; the raw replacement string keeps \g<0> a regex group reference
#instead of an invalid Python string escape
special_chars_re = re.compile(f'[{re.escape(special_chars)}]')
#surround every special character with spaces, split on spaces, drop empty tokens
sentences = [[w for w in special_chars_re.sub(r' \g<0> ', s).split(' ') if len(w)] for s in sentences]

#3) create vocab if not already created
print('creating/loading vocab...')
pth = './BERT/vocab.txt'
if not exists(pth):
    word_counts = Counter(w for s in sentences for w in s)
    vocab = [w for w, _ in word_counts.most_common(n_vocab)] #keep the N most frequent words
    with open(pth, 'w') as f:
        f.write('\n'.join(vocab))
else:
    with open(pth) as f:
        vocab = f.read().split('\n')

print(vocab[4], vocab[1], vocab[-1])

#4) create dataset
print('creating dataset...')
dataset = SentencesDataset(sentences, vocab, seq_len)
# kwargs = {'num_workers':n_workers, 'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)

loading text...
tokenizing sentences...
creating/loading vocab...
and . glowed
creating dataset...


In [None]:
# =============================================================================
# Model
# =============================================================================
#init model
print('initializing model...')
#build the network with one output per vocabulary entry (special tags included) and move it to the GPU
model = Transformer(
    n_code=n_code,
    n_heads=n_heads,
    embed_size=embed_size,
    inner_ff_size=inner_ff_size,
    n_embeddings=len(dataset.vocab),
    seq_len=seq_len,
    dropout=dropout,
).cuda()
#last expression of the cell: shows the module summary
model

initializing model...


Transformer(
  (embeddings): Embedding(23948, 128)
  (pe): PositionalEmbedding()
  (encoders): ModuleList(
    (0): EncoderLayer(
      (mha): MultiHeadAttention(
        (linear): Linear(in_features=128, out_features=384, bias=True)
        (out): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): EncoderLayer(
      (mha): MultiHeadAttention(
        (linear): Linear(in_features=128, out_features=384, bias=True)
        (out): Linear(in_features=128, out_featu

In [None]:
# =============================================================================
# Optimizer
# =============================================================================
print('initializing optimizer and loss...')
#positions whose target is IGNORE_IDX do not contribute to the loss
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)
#Adam with the settings from optim_kwargs
optimizer = optim.Adam(params=model.parameters(), **optim_kwargs)

initializing optimizer and loss...


In [None]:
# =============================================================================
# Train
# =============================================================================
print('training...')
print_each = 20       #log every print_each iterations
n_iteration = 5_000
model.train()
batch_iter = iter(data_loader)
for it in range(n_iteration):
    #get batch (the iterator is restarted when the loader runs out)
    batch, batch_iter = get_batch(data_loader, batch_iter)

    #move the batch to the GPU and infer
    masked_input = batch['input'].cuda(non_blocking=True)
    masked_target = batch['target'].cuda(non_blocking=True)
    output = model(masked_input)

    #compute the cross entropy loss over all positions of all sequences
    n_classes = output.shape[-1]
    output_v = output.view(-1, n_classes)
    target_v = masked_target.view(-1, 1).squeeze()
    loss = loss_model(output_v, target_v)

    #compute and apply gradients
    loss.backward()
    optimizer.step()

    #print step (must happen before the gradients are reset below)
    if it % print_each == 0:
        grad_abs_sum = model.embeddings.weight.grad.abs().sum().item()
        print('it:', it,
              ' | loss', np.round(loss.item(),2),
              ' | Δw:', round(grad_abs_sum,3))

    #reset gradients
    optimizer.zero_grad()

# =============================================================================
# Saving the embeddings for future use
# =============================================================================
print('saving embeddings for future use...')
print(f'model.embeddings.weight.shape - {model.embeddings.weight.shape}, len(dataset.rvocab) : {len(dataset.rvocab)}')
N = len(dataset.rvocab)

#one row per vocabulary id, rounded to 2 decimals, tab separated
# NOTE(review): the two output names use different date formats, and the later
# inference cell reads './BERT/values_03242023a.tsv' - confirm which file is intended.
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()[0:N]
np.savetxt('values_03242023b.tsv', np.round(embedding_matrix, 2), delimiter='\t', fmt='%1.2f')

#matching word for every row, in id order
s = [dataset.rvocab[i] for i in range(N)]
with open('names_032423b.tsv', 'w') as f:  #closed (and flushed) on exit
    f.write('\n'.join(s))

print('end')

training...
it: 0  | loss 10.26  | Δw: 4.178
it: 20  | loss 8.14  | Δw: 3.108
it: 40  | loss 7.2  | Δw: 2.912
it: 60  | loss 6.61  | Δw: 2.711
it: 80  | loss 6.05  | Δw: 2.488
it: 100  | loss 5.71  | Δw: 2.427
it: 120  | loss 5.22  | Δw: 2.242
it: 140  | loss 4.85  | Δw: 2.143
it: 160  | loss 4.56  | Δw: 2.096
it: 180  | loss 4.27  | Δw: 2.033
it: 200  | loss 3.89  | Δw: 1.91
it: 220  | loss 3.76  | Δw: 1.931
it: 240  | loss 3.47  | Δw: 1.865
it: 260  | loss 3.17  | Δw: 1.769
it: 280  | loss 3.01  | Δw: 1.747
it: 300  | loss 2.82  | Δw: 1.69
it: 320  | loss 2.66  | Δw: 1.658
it: 340  | loss 2.55  | Δw: 1.646
it: 360  | loss 2.42  | Δw: 1.622
it: 380  | loss 2.28  | Δw: 1.577
it: 400  | loss 2.18  | Δw: 1.553
it: 420  | loss 2.12  | Δw: 1.556
it: 440  | loss 1.96  | Δw: 1.483
it: 460  | loss 1.88  | Δw: 1.454
it: 480  | loss 1.84  | Δw: 1.467
it: 500  | loss 1.7  | Δw: 1.391
it: 520  | loss 1.64  | Δw: 1.365
it: 540  | loss 1.58  | Δw: 1.351
it: 560  | loss 1.51  | Δw: 1.301
it: 580  | 

### **Quick inference with the model weights trained above**

In [None]:
#Loading the Test Input
pth = './BERT/small_training.txt'
with open(pth) as f:  #context manager so the handle is closed right away
    test_sentences = f.read().lower().split('\n')

print('tokenizing test_sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
#raw replacement string: \g<0> is a regex group reference, not a Python string escape
test_sentences = [re.sub(f'[{re.escape(special_chars)}]', r' \g<0> ', s).split(' ') for s in test_sentences]
test_sentences = [[w for w in s if len(w)] for s in test_sentences]
print(f'len(test_sentences) : {len(test_sentences)}')

#creating test_dataset, test_data_loader
print('creating test_dataset and test_data_loader...')
batch_size = 5  #NOTE: overwrites the training batch_size defined earlier
test_dataset = SentencesDataset(test_sentences, vocab, seq_len)
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
test_data_loader = torch.utils.data.DataLoader(test_dataset, **kwargs)

# Creating test_batch and test_input and test_target with 5 sentences
test_batch_iter = iter(test_data_loader)
test_batch, test_batch_iter = get_batch(test_data_loader, test_batch_iter)
print(f"len(test_batch['input']) : {len(test_batch['input'])} len(test_batch['target']) : {len(test_batch['target'])}")
test_input = test_batch['input'].cuda(non_blocking=True)
test_target = test_batch['target'].cuda(non_blocking=True)
print(test_input.shape, test_target.shape)

# Prediction: most likely vocabulary id at every position
model.eval()
with torch.no_grad():
    test_out = model(test_input)
    test_out_f = test_out.argmax(dim=-1)
print(test_out.shape, test_out_f.shape)
print(test_input)
print(test_out_f)

# Using test_dataset.rvocab for decoding the output.
# Iterate over the rows actually present in the batch (no squeeze), so this
# also works when the batch holds a single sentence.
test_input_2 = test_input.tolist()
test_output  = test_out_f.tolist()
for i, (in_ids, out_ids) in enumerate(zip(test_input_2, test_output)):
    print(f'*** Sentence {i} ***')
    s_in = [test_dataset.rvocab[elem] for elem in in_ids]
    print(f'Input : {s_in}')
    s_out = [test_dataset.rvocab[elem] for elem in out_ids]
    print(f'Output: {s_out}')

tokenizing test_sentences...
len(test_sentences) : 5
creating test_dataset and test_data_loader...
len(test_batch['input']) : 5 len(test_batch['target']) : 5
torch.Size([5, 20]) torch.Size([5, 20])
torch.Size([5, 20, 23948]) torch.Size([5, 20])
tensor([[    5,    68,    89,     6,  3234,    11, 23946,  4731, 13220,   190,
          4542, 23946,   189,     1,     5,    93,    12,    19,  1898, 23946],
        [   13,    29, 23946, 23946,     0,    45,    36,   887,    75,     6,
          3263,     9, 23946,  4731, 13220,    13, 23946,     4,    45,    36],
        [ 3650,     0,    45,    36,  2034,   111, 23946,     1,     5,    68,
            89,     6,  3234,    11, 23946,  4731, 13220,   190,  4542, 23946],
        [    5,    93,    12,    19,  1898, 23946,    45,   217, 13220,     1,
            37, 10934,   114,     5,   107,    19,  4542, 23946,    14,    13],
        [   37, 10934,   114,     5,   107,    19,  4542, 23946,    14,    13,
            29, 23946, 23946,     0,    

### **Inference at a later point (without retraining the model), using the embeddings saved from the last training run**

In [None]:
#Loading the Test Input & create vocab if not already created

print('loading testing text...')
pth = './BERT/small_training.txt'
with open(pth) as f:  #context manager so the handle is closed right away
    test_sentences = f.read().lower().split('\n')

print('tokenizing test_sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
#raw replacement string: \g<0> is a regex group reference, not a Python string escape
test_sentences = [re.sub(f'[{re.escape(special_chars)}]', r' \g<0> ', s).split(' ') for s in test_sentences]
test_sentences = [[w for w in s if len(w)] for s in test_sentences]
print(f'len(test_sentences) : {len(test_sentences)}')

print('creating/loading vocab...')
pth = './BERT/vocab.txt'
if not exists(pth):
    # NOTE(review): this fallback needs `sentences` and `n_vocab` from the
    # training cells; on a fresh kernel without vocab.txt it raises NameError.
    words = [w for s in sentences for w in s]
    vocab = [w for w, _ in Counter(words).most_common(n_vocab)] #keep the N most frequent words
    with open(pth, 'w') as f:
        f.write('\n'.join(vocab))
else:
    with open(pth) as f:
        vocab = f.read().split('\n')

loading testing text...
tokenizing test_sentences...
len(test_sentences) : 6
creating/loading vocab...


In [None]:
#build the test dataset and a shuffled loader that yields batches of two sequences

print('creating test_dataset and test_data_loader...')
batch_size = 2
kwargs = dict(shuffle=True, drop_last=True, pin_memory=True, batch_size=batch_size)
test_dataset = SentencesDataset(test_sentences, vocab, seq_len)
test_data_loader = torch.utils.data.DataLoader(test_dataset, **kwargs)

initializing..
creating test_dataset and test_data_loader...


In [None]:
# initializing the model (use this step only if you are not training the model)

print('initializing..')
#same hyper-parameters as the training configuration
batch_size = 2
seq_len = 20
n_vocab = 40000
embed_size = 128
inner_ff_size = 4 * embed_size
n_heads = 8
n_code = 8
dropout = 0.1

model = Transformer(n_code, n_heads, embed_size, inner_ff_size, len(test_dataset.vocab), seq_len, dropout).cuda()

# Read the embedding table written by the LAST training run
saved_embedding = np.genfromtxt(fname='./BERT/values_03242023a.tsv', delimiter='\t')
print(f'saved_embedding.shape : {saved_embedding.shape} type(saved_embedding) : {type(saved_embedding)}')

# Overwrite the freshly initialised token embeddings with the saved values.
# NOTE: only model.embeddings is restored here; every other layer keeps its random initialisation.
pretrained = torch.from_numpy(saved_embedding)
model.embeddings.weight.data.copy_(pretrained)
print(f'model.embeddings.weight.shape - {model.embeddings.weight.shape}')

saved_embedding.shape : (23948, 128) type(saved_embedding) : <class 'numpy.ndarray'>
model.embeddings.weight.shape - torch.Size([23948, 128])


In [None]:
# Draw one batch from the test loader and keep only its first sentence (batch dimension of 1)
test_batch_iter = iter(test_data_loader)
test_batch, test_batch_iter = get_batch(test_data_loader, test_batch_iter)
print(f"len(test_batch['input']) : {len(test_batch['input'])} len(test_batch['target']) : {len(test_batch['target'])}")
# [:1] keeps the leading dimension, so the result is 1 * SEQ_LEN
test_input = test_batch['input'][:1].cuda(non_blocking=True)
test_target = test_batch['target'][:1].cuda(non_blocking=True)
print(test_input.shape, test_target.shape, test_input)

len(test_batch['input']) : 2 len(test_batch['target']) : 2
torch.Size([1, 20]) torch.Size([1, 20]) tensor([[   37, 10934,   114,     5,   107,    19,  4542, 23946,    14,    13,
            29, 23946, 23946,     0,    45,    36,   887,    75,     6,  3263]],
       device='cuda:0')


In [None]:
# Prediction: forward pass in eval mode without gradient tracking
model.eval()
with torch.no_grad():
    test_out = model(test_input)
# most likely vocabulary id at every position
test_out_f = torch.argmax(test_out, dim=-1)
print(test_out.shape, test_out_f.shape)
print(test_input)
print(test_out_f)

torch.Size([1, 20, 23948]) torch.Size([1, 20])
tensor([[   37, 10934,   114,     5,   107,    19,  4542, 23946,    14,    13,
            29, 23946, 23946,     0,    45,    36,   887,    75,     6,  3263]],
       device='cuda:0')
tensor([[ 9884,  1912, 17347,  1885, 17609, 16511, 22379, 18563, 13211, 21702,
         19497, 22380,  8627, 17609,  7169, 18382,  7146, 15389, 17347,  6336]],
       device='cuda:0')


In [None]:
# Map the input ids and the predicted ids back to words with test_dataset.rvocab
test_input_2 = test_input.squeeze(0).tolist()
test_output  = test_out_f.squeeze(0).tolist()
id_to_word = test_dataset.rvocab
s_in = [id_to_word[elem] for elem in test_input_2]
print(s_in)
s_out = [id_to_word[elem] for elem in test_output]
print(s_out)

['what', 'dimensions', 'can', 'i', 'give', 'for', 'text', '<oov>', '?', 'in', 'this', '<oov>', '<oov>', ',', 'we', 'will', 'learn', 'how', 'to', 'create']
['accessary', 'aim', 'joyed', 'victory', 'capacities', 'cannibally', 'spangle', 'minotaurs', 'corporate', 'seymour', 'wheer', 'allots', 'pursuing', 'capacities', 'searching', 'fightest', 'bolts', '123', 'joyed', 'castles']
