In [1]:
import yaml
import torch

from transformers import BertTokenizer, BertForMaskedLM


In [2]:
from transformers import CamembertForMaskedLM
from transformers import CamembertTokenizer

In [7]:
import sys
sys.path.insert(0, "./../archs")


In [8]:
import archs.data_loader as dl

In [9]:
PATH_TO_CONFIG = './../../data/config/config.yml'
PATH_TO_DATA = './../../src/../data/ref_comp.csv'

In [10]:
BATCH_SIZE = 16
SHUFFLE = True

# loading the config

In [11]:
with open(PATH_TO_CONFIG, 'r') as f:
    config = yaml.safe_load(f)

FileNotFoundError: [Errno 2] No such file or directory: './../../data/config/config.yml'

# loading the job descriptions

In [12]:
d_l = dl.FileLoader(PATH_TO_DATA)

In [13]:
d_l.load()

14021it [00:00, 553245.50it/s]


In [14]:
d_l.ds_dict.keys()

dict_keys(['id_competence', 'lbl_competence', 'type_competence'])

In [15]:
sequences = d_l.ds_dict['lbl_competence']

# loading the tokenizer and the model

About 8'30'' to download more than 430 Mo

In [17]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
tokenizer_fr = CamembertTokenizer.from_pretrained("camembert-base")
model_fr = CamembertForMaskedLM.from_pretrained("camembert-base")

In [17]:
model_fr = CamembertForMaskedLM.from_pretrained('camembert-base')

# Example

In [19]:
inputs = tokenizer_fr(sequences, return_tensors = 'pt', max_length = 512, truncation = True, padding = 'max_length')


In [20]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [21]:
inputs['input_ids'][:2]

tensor([[    5,  1331, 10456,  ...,     1,     1,     1],
        [    5, 22015,     8,  ...,     1,     1,     1]])

In [22]:
inputs['input_ids'][:2].shape

torch.Size([2, 512])

## Creation of the label tensor in the dict

In [23]:
inputs['labels'] = inputs['input_ids'].detach().clone()

this will be our target

detach() means we can't track the gradients of this tensor, which is ok  
clone() returns a copy of the input (pass by value, the original won't be affected)

In [24]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

## Creation of the masks

In [25]:
# create random array of floats with equal dimensions to input_ids tensor by sampling from the uniform law U]0,1[
rand = torch.rand(inputs['input_ids'].shape)


In [26]:
rand.shape

torch.Size([14021, 512])

In [27]:
rand

tensor([[0.9470, 0.1504, 0.4359,  ..., 0.8485, 0.2831, 0.7317],
        [0.5547, 0.4782, 0.9859,  ..., 0.1994, 0.8249, 0.8880],
        [0.5799, 0.7499, 0.0296,  ..., 0.1928, 0.9083, 0.4599],
        ...,
        [0.4396, 0.7220, 0.6353,  ..., 0.3495, 0.0502, 0.0050],
        [0.2415, 0.7356, 0.0095,  ..., 0.7183, 0.1983, 0.6903],
        [0.5453, 0.7300, 0.9522,  ..., 0.0193, 0.3576, 0.2386]])

separative token = 102  
begining token = 101  
padding token = 0  

We want 'False' wherever we have one of these  
We want to mask 15% of the words, that is, each token is masked with probability 1

'True' elements are the one to be masked

In [28]:
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [29]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False,  True,  True],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ...,  True, False, False]])

In [30]:
mask_arr[0].nonzero()

tensor([[  8],
        [  9],
        [ 22],
        [ 25],
        [ 28],
        [ 35],
        [ 43],
        [ 54],
        [ 57],
        [ 64],
        [ 67],
        [ 69],
        [ 90],
        [ 99],
        [100],
        [105],
        [116],
        [124],
        [133],
        [136],
        [140],
        [143],
        [149],
        [156],
        [157],
        [165],
        [174],
        [176],
        [185],
        [198],
        [201],
        [227],
        [235],
        [236],
        [250],
        [257],
        [259],
        [261],
        [281],
        [283],
        [285],
        [298],
        [306],
        [308],
        [317],
        [319],
        [338],
        [357],
        [359],
        [364],
        [365],
        [379],
        [381],
        [388],
        [396],
        [399],
        [401],
        [406],
        [418],
        [424],
        [432],
        [435],
        [437],
        [439],
        [446],
        [466],
        [4

nonzero() returns the indices of the non False values i.e. the 'True' values

We want this for every sequence, and as a list

In [31]:
torch.flatten(mask_arr[0].nonzero()).tolist()

[8,
 9,
 22,
 25,
 28,
 35,
 43,
 54,
 57,
 64,
 67,
 69,
 90,
 99,
 100,
 105,
 116,
 124,
 133,
 136,
 140,
 143,
 149,
 156,
 157,
 165,
 174,
 176,
 185,
 198,
 201,
 227,
 235,
 236,
 250,
 257,
 259,
 261,
 281,
 283,
 285,
 298,
 306,
 308,
 317,
 319,
 338,
 357,
 359,
 364,
 365,
 379,
 381,
 388,
 396,
 399,
 401,
 406,
 418,
 424,
 432,
 435,
 437,
 439,
 446,
 466,
 468,
 490,
 491,
 500]

In [32]:
selection = []

for row in range(inputs['input_ids'].shape[0]):
    
    selection.append(torch.flatten(mask_arr[row].nonzero()).tolist())

In [33]:
selection[:3]

[[8,
  9,
  22,
  25,
  28,
  35,
  43,
  54,
  57,
  64,
  67,
  69,
  90,
  99,
  100,
  105,
  116,
  124,
  133,
  136,
  140,
  143,
  149,
  156,
  157,
  165,
  174,
  176,
  185,
  198,
  201,
  227,
  235,
  236,
  250,
  257,
  259,
  261,
  281,
  283,
  285,
  298,
  306,
  308,
  317,
  319,
  338,
  357,
  359,
  364,
  365,
  379,
  381,
  388,
  396,
  399,
  401,
  406,
  418,
  424,
  432,
  435,
  437,
  439,
  446,
  466,
  468,
  490,
  491,
  500],
 [22,
  25,
  27,
  28,
  33,
  43,
  47,
  65,
  66,
  71,
  76,
  79,
  92,
  97,
  118,
  121,
  144,
  155,
  161,
  183,
  188,
  202,
  215,
  225,
  244,
  254,
  260,
  266,
  268,
  286,
  289,
  297,
  316,
  332,
  347,
  362,
  366,
  370,
  371,
  372,
  385,
  388,
  402,
  404,
  410,
  413,
  424,
  425,
  427,
  430,
  457,
  465,
  466,
  467,
  479,
  481,
  485,
  492,
  504],
 [2,
  4,
  14,
  15,
  17,
  25,
  30,
  42,
  54,
  57,
  59,
  69,
  79,
  90,
  94,
  97,
  99,
  105,
  108,
  110,
  11

now we set every values that has these indices in input_ids to 103 wich is our mask token

In [34]:
for row in range(inputs['input_ids'].shape[0]):

    inputs['input_ids'][row, selection[row]] = 103

In [35]:
inputs['input_ids']

tensor([[    5,  1331, 10456,  ...,     1,     1,     1],
        [    5, 22015,     8,  ...,     1,     1,     1],
        [    5, 16423,   103,  ...,     1,     1,     1],
        ...,
        [    5,   360,   385,  ...,     1,   103,   103],
        [    5, 18101,   103,  ...,     1,     1,     1],
        [    5,    21,  5354,  ...,   103,     1,     1]])

## DataLoader

three methods needed : __init__(), __get_item__(), __length__()

__get_item__() returns a doctionary formatted batch of the inputs  
__length__() so you can check the length of a dataset

In [36]:
class JobDescriptionDataset(torch.utils.data.Dataset):

    def __init__(self, encodings):

        self.encodings = encodings


    def __getitem__(self, idx):

        return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}


    def __len__(self):

        return self.encodings['input_ids'].shape[0]

In [37]:
dataset = JobDescriptionDataset(encodings = inputs)

In [38]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = SHUFFLE)

In [39]:
from tqdm import tqdm

In [40]:
torch.cuda.is_available()

True

In [41]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [None]:
model.to(device)


In [42]:
model_fr.to(device)


CamembertForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [44]:
from transformers import AdamW

# activate training mode
model_fr.train()
# initialize optimizer
optim = AdamW(model_fr.parameters(), lr=5e-5)

In [45]:
torch.cuda.empty_cache()

In [46]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model_fr(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  # Remove the CWD from sys.path while we load stuff.
  0%|          | 0/877 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 5.81 GiB total capacity; 3.95 GiB already allocated; 67.94 MiB free; 4.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF