In [1]:
import yaml
import torch

from transformers import BertTokenizer, BertForMaskedLM


In [2]:
import archs.data_loader as dl

In [3]:
PATH_TO_CONFIG = './config/config.yml'

In [4]:
BATCH_SIZE = 16
SHUFFLE = True

# loading the config

In [5]:
with open(PATH_TO_CONFIG, 'r') as f:
    config = yaml.safe_load(f)

# loading the job descriptions

In [6]:
d_l = dl.FileLoader(config['train_path'])

In [7]:
d_l.load()

In [8]:
d_l.ds_dict.keys()

dict_keys(['id_competence', 'lbl_competence', 'type_competence'])

In [9]:
sequences = d_l.ds_dict['lbl_competence']

# loading the tokenizer and the model

About 8'30'' to download more than 430 Mo

In [10]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Example

In [11]:
inputs = tokenizer(sequences, return_tensors = 'pt', max_length = 512, truncation = True, padding = 'max_length')


In [12]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
inputs['input_ids'][:2]

tensor([[ 101, 9530, 8566,  ...,    0,    0,    0],
        [ 101, 5461, 2139,  ...,    0,    0,    0]])

In [14]:
inputs['input_ids'][:2].shape

torch.Size([2, 512])

## Creation of the label tensor in the dict

In [15]:
inputs['labels'] = inputs['input_ids'].detach().clone()

this will be our target

detach() means we can't track the gradients of this tensor, which is ok  
clone() returns a copy of the input (pass by value, the original won't be affected)

In [16]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

## Creation of the masks

In [17]:
# create random array of floats with equal dimensions to input_ids tensor by sampling from the uniform law U]0,1[
rand = torch.rand(inputs['input_ids'].shape)


In [18]:
rand.shape

torch.Size([14021, 512])

In [19]:
rand

tensor([[0.4476, 0.1555, 0.2437,  ..., 0.6338, 0.6412, 0.8486],
        [0.8560, 0.4294, 0.2178,  ..., 0.1396, 0.8583, 0.9111],
        [0.6338, 0.3991, 0.3644,  ..., 0.4509, 0.7025, 0.9178],
        ...,
        [0.7766, 0.1855, 0.7548,  ..., 0.7993, 0.5181, 0.1268],
        [0.1580, 0.0079, 0.2531,  ..., 0.5295, 0.3534, 0.6385],
        [0.7431, 0.2713, 0.2097,  ..., 0.6401, 0.0312, 0.3916]])

separative token = 102  
begining token = 101  
padding token = 0  

We want 'False' wherever we have one of these  
We want to mask 15% of the words, that is, each token is masked with probability 1

'True' elements are the one to be masked

In [20]:
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [21]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [22]:
mask_arr[0].nonzero()

tensor([[ 6],
        [12],
        [15],
        [16],
        [20],
        [21],
        [22],
        [23],
        [27],
        [35]])

nonzero() returns the indices of the non False values i.e. the 'True' values

We want this for every sequence, and as a list

In [23]:
torch.flatten(mask_arr[0].nonzero()).tolist()

[6, 12, 15, 16, 20, 21, 22, 23, 27, 35]

In [24]:
selection = []

for row in range(inputs['input_ids'].shape[0]):
    
    selection.append(torch.flatten(mask_arr[row].nonzero()).tolist())

In [25]:
selection[:3]

[[6, 12, 15, 16, 20, 21, 22, 23, 27, 35], [], [5]]

now we set every values that has these indices in input_ids to 103 wich is our mask token

In [26]:
for row in range(inputs['input_ids'].shape[0]):

    inputs['input_ids'][row, selection[row]] = 103

In [27]:
inputs['input_ids']

tensor([[  101,  9530,  8566,  ...,     0,     0,     0],
        [  101,  5461,  2139,  ...,     0,     0,     0],
        [  101, 21183, 24411,  ...,     0,     0,     0],
        ...,
        [  101,  1041,  2497,  ...,     0,     0,     0],
        [  101,   103,  1011,  ...,     0,     0,     0],
        [  101,  3968,  1011,  ...,     0,     0,     0]])

## DataLoader

three methods needed : __init__(), __get_item__(), __length__()

__get_item__() returns a doctionary formatted batch of the inputs  
__length__() so you can check the length of a dataset

In [40]:
class JobDescriptionDataset(torch.utils.data.Dataset):

    def __init__(self, encodings):

        self.encodings = encodings


    def __getitem__(self, idx):

        return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}


    def __len__(self):

        return self.encodings['input_ids'].shape[0]

In [41]:
dataset = JobDescriptionDataset(encodings = inputs)

In [42]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = SHUFFLE)

In [43]:
from tqdm import tqdm

In [44]:
torch.cuda.is_available()

True

In [45]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [46]:
model.to(device)


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [47]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)

In [49]:
torch.cuda.empty_cache()

In [50]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  # Remove the CWD from sys.path while we load stuff.
  0%|          | 0/877 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 5.81 GiB total capacity; 3.93 GiB already allocated; 58.56 MiB free; 3.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF