In [1]:
import yaml
import torch

from transformers import BertTokenizer, BertForMaskedLM


In [2]:
import archs.data_loader as dl

SyntaxError: unexpected EOF while parsing (data_loader.py, line 33)

In [3]:
# Relative path of the YAML configuration file that is parsed further down.
PATH_TO_CONFIG = './config/config.yml'

In [38]:
# Passed as `batch_size` to the torch DataLoader built at the end of the notebook.
BATCH_SIZE = 16
# Passed as `shuffle` to the same DataLoader.
SHUFFLE = True

# loading the config

In [4]:
# Parse the YAML configuration; safe_load only builds plain Python objects.
with open(PATH_TO_CONFIG, 'r') as config_file:
    config = yaml.safe_load(config_file)

# loading the job descriptions

In [5]:
# Project-local loader (archs.data_loader), built from the 'train_path' entry of the config.
d_l = dl.DataLoader(config['train_path'])

In [6]:
# presumably fills d_l.ds_dict, which the following cells read — TODO confirm in archs/data_loader.py
d_l.load()

In [7]:
d_l.ds_dict.keys()

dict_keys(['id_competence', 'lbl_competence', 'type_competence'])

In [8]:
# Texts handed to the tokenizer below; presumably a list of strings — TODO confirm against the loader.
sequences = d_l.ds_dict['lbl_competence']

# loading the tokenizer and the model

The first run takes about 8 min 30 s to download more than 430 MB of tokenizer and model files.

In [10]:

# Tokenizer and model are loaded from the same checkpoint name.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForMaskedLM.from_pretrained(checkpoint_name)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Example

In [11]:
# Tokenize every sequence at once: each one is truncated or padded to exactly
# 512 token ids and the result is returned as PyTorch tensors.
inputs = tokenizer(
    sequences,
    padding = 'max_length',
    truncation = True,
    max_length = 512,
    return_tensors = 'pt',
)


In [13]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [15]:
inputs['input_ids'][:2]

tensor([[ 101, 9530, 8566,  ...,    0,    0,    0],
        [ 101, 5461, 2139,  ...,    0,    0,    0]])

In [16]:
inputs['input_ids'][:2].shape

torch.Size([2, 512])

## Creation of the label tensor in the dict

In [None]:
# Keep an independent, gradient-free copy of the token ids as the prediction target.
# This has to run before input_ids is overwritten in place by the masking step,
# otherwise the labels would already contain the mask id.
inputs['labels'] = inputs['input_ids'].clone().detach()

this will be our target

`detach()` returns a tensor that is cut off from the autograd graph, so no gradients are tracked through it, which is fine for labels  
`clone()` returns a copy of the data (pass by value), so later in-place changes to `input_ids` won't affect the labels

In [17]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

## Creation of the masks

In [21]:
# Draw one float per token position, uniformly from [0, 1), with the same shape
# as the input_ids tensor. These values decide which positions get masked.
# A dedicated, seeded generator keeps the masking reproducible after a kernel
# restart without altering the global torch RNG state.
MASK_SEED = 42
rand = torch.rand(
    inputs['input_ids'].shape,
    generator = torch.Generator().manual_seed(MASK_SEED),
)


In [22]:
rand.shape

torch.Size([14021, 512])

In [23]:
rand

tensor([[0.5709, 0.3009, 0.5886,  ..., 0.8202, 0.6083, 0.5662],
        [0.3738, 0.7536, 0.9477,  ..., 0.0819, 0.6572, 0.0556],
        [0.9984, 0.7048, 0.9634,  ..., 0.7943, 0.6839, 0.5461],
        ...,
        [0.9750, 0.2480, 0.2650,  ..., 0.6227, 0.8955, 0.3383],
        [0.2328, 0.0390, 0.8431,  ..., 0.3585, 0.4164, 0.1794],
        [0.6978, 0.3681, 0.3541,  ..., 0.1726, 0.7106, 0.3240]])

separator token `[SEP]` = 102  
beginning token `[CLS]` = 101  
padding token `[PAD]` = 0  

We want 'False' wherever we have one of these  
We want to mask about 15% of the words, that is, each remaining token is masked independently with probability 0.15

'True' elements are the one to be masked

In [24]:
# Boolean mask of the positions to replace: True where the random draw is below
# 0.15 and the token id is none of 101, 102 or 0.
mask_arr = (
    (rand < 0.15)
    & (inputs['input_ids'] != 101)
    & (inputs['input_ids'] != 102)
    & (inputs['input_ids'] != 0)
)

In [25]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [26]:
mask_arr[0].nonzero()

tensor([[12],
        [19],
        [21],
        [26],
        [33]])

`nonzero()` returns the indices of the non-`False` values, i.e. the positions that are `True`

We want this for every sequence, and as a list

In [27]:
torch.flatten(mask_arr[0].nonzero()).tolist()

[12, 19, 21, 26, 33]

In [28]:
# One list of flagged positions per sequence, as plain Python ints.
selection = [
    torch.flatten(row_mask.nonzero()).tolist()
    for row_mask in mask_arr
]

In [30]:
selection[:3]

[[12, 19, 21, 26, 33], [4], [9]]

Now we set every value at these indices in `input_ids` to 103, which is our mask token

In [34]:
# Overwrite, in place, the selected positions of each sequence with id 103.
for row, masked_positions in enumerate(selection):
    inputs['input_ids'][row, masked_positions] = 103

In [35]:
inputs['input_ids']

tensor([[  101,  9530,  8566,  ...,     0,     0,     0],
        [  101,  5461,  2139,  ...,     0,     0,     0],
        [  101, 21183, 24411,  ...,     0,     0,     0],
        ...,
        [  101,  1041,  2497,  ...,     0,     0,     0],
        [  101,   103,  1011,  ...,     0,     0,     0],
        [  101,  3968,  1011,  ...,     0,     0,     0]])

## DataLoader

Three methods are needed: `__init__()`, `__getitem__()`, `__len__()`

`__getitem__()` returns the inputs at a given index, formatted as a dictionary  
`__len__()` so you can check the length of a dataset

In [40]:
class JobDescriptionDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict-like collection of tokenizer encodings.

    Parameters
    ----------
    encodings : mapping
        Dict-like object exposing ``.items()`` whose values can be indexed
        along their first dimension (tensors or nested lists). It must
        contain an ``'input_ids'`` entry, which defines the dataset length.
    """

    def __init__(self, encodings):

        self.encodings = encodings


    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a ``{key: tensor}`` dictionary.

        The returned tensors are copies, so modifying them does not alter
        the stored encodings.
        """

        sample = {}

        for key, val in self.encodings.items():

            item = val[idx]

            if torch.is_tensor(item):
                # torch.tensor(tensor) emits a copy-construct warning;
                # detach().clone() is the recommended way to copy a tensor.
                sample[key] = item.detach().clone()
            else:
                sample[key] = torch.tensor(item)

        return sample


    # Former (misspelled) name, kept so that explicit calls still work.
    # Indexing and torch's DataLoader only ever look up __getitem__.
    __get_item__ = __getitem__


    def __len__(self):
        """Return the number of samples (size of the first dimension of ``input_ids``)."""

        return len(self.encodings['input_ids'])

In [41]:
# Wrap the tokenized (and masked) encodings so that torch can index them sample by sample.
dataset = JobDescriptionDataset(inputs)

In [42]:
# Batching and shuffling are driven by the BATCH_SIZE and SHUFFLE constants.
dataloader = torch.utils.data.DataLoader(
    dataset,
    shuffle = SHUFFLE,
    batch_size = BATCH_SIZE,
)