### Installs

In [None]:
%%capture
!pip install transformers==4.28.0;
!pip install datasets;
!pip install wandb;
!pip uninstall -y transformers accelerate
!pip install transformers accelerate
!pip install -U transformers torch-summary
!pip install tqdm


In [None]:
from huggingface_hub import notebook_login

notebook_login()

# Preprocess and Data Load

### Load model

In [None]:
%%capture

from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers.data.data_collator import default_data_collator
from transformers import AutoTokenizer

model_checkpoint = "model_route"

model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.2)


### Load Data

In [None]:
import datasets
from datasets import load_dataset, load_from_disk, Features, Sequence, Value, Dataset, DatasetDict

dataset_csv = load_dataset("csv", data_files={'train': 'your_train_data.csv', 'test': 'your_test_data.csv'})

print(dataset_csv['test']['String'][0])

## Cleanse registers without Time Expression


In [None]:
#Dataset antes de la limpieza
print(f"DATASET BEFORE CLEANSE\n")
print(dataset_csv['train'])
print(dataset_csv['test'])

In [None]:
#Gets all the indices of the registers without a Time Expression
from tqdm.auto import tqdm

values = ['date', 'time', 'duration', 'set']

####TRAIN####
indices_train_positive = []
is_value = False
with tqdm(total=len(dataset_csv['train'])) as pbar:
  for i, row in enumerate(dataset_csv['train']):
    if row['String'] == None:
      is_value = False
    else:
          for value in values:
            if value in row['String'].lower():
              is_value = True
    if is_value == True:
      indices_train_positive.append(i)
    is_value = False
    pbar.update()

####TEST####
indices_test_positive = []
is_value = False
with tqdm(total=len(dataset_csv['test'])) as pbar:
  for j, row in enumerate(dataset_csv['test']):
    if row['String'] == None:
      is_value = False
    else:
          for value in values:
            if value in row['String'].lower():
              is_value = True
    if is_value == True:
      indices_test_positive.append(j)
    is_value = False
    pbar.update()
print('Registers with at least one expression Train: ' + str(len(indices_train_positive)))
print('Registers without expression Train: ' + str(len(dataset_csv['train'])-len(indices_train_positive)))
print('Registers with at least one expression Test: ' + str(len(indices_test_positive)))
print('Registers without expression Test: ' + str(len(dataset_csv['test'])-len(indices_test_positive)))

In [10]:
#Divide and conquer
def binary_search(arr, target):
    left, right = 0, len(arr) - 1

    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return True  # Found the target
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
 
    return False  # Target not found

In [None]:
#Cleans the registers without Time Expresisons using divide and conquer
#TRAIN
with tqdm(total=len(dataset_csv['train'])) as pbar:
  indices_train_positive_aux = []
  for i in range(len(dataset_csv['train'])):
    if binary_search(indices_train_positive, i):
      indices_train_positive_aux.append(i)
    pbar.update()
  dataset_csv['train'] = dataset_csv['train'].select(indices_train_positive_aux)
  del indices_train_positive_aux
#TEST
with tqdm(total=len(dataset_csv['test'])) as pbar:
  indices_test_positive_aux = []
  for i in range(len(dataset_csv['test'])):
    if binary_search(indices_test_positive, i):
      indices_test_positive_aux.append(i)
    pbar.update()
  dataset_csv['test'] = dataset_csv['test'].select(indices_test_positive_aux)
  del indices_test_positive_aux

## Tokenize

In [None]:
#New tokens. Only needed for not fine-tuned models

#DATE and TIME
parts_of_year = ['WE','SP','SU','WI','FA','Q','Q1','Q2','Q3','Q4','Q5','Q6','Q7','Q8','Q9','T','T1','T2','T3','T4','T5','T6','T7','T8','T9','HALF','HALF1','HALF2']
parts_of_day = ['TAF','TMO','TMI','TEV','TNI','TDT']
fuzzy_expresiones = ['PRESENT', 'PAST', 'FUTURE']

#DURATION and SET
parts_of_duration_set = ['Y','M','W','D','H','MIN','S']

#Operations
operations = ['SUMA','RESTA','NEUTRO', 'DAYW', 'MONTHS']

#Extra Numbers
xx_dd = [] #XXDD (XX90 -> 12-05-90)
dd_5 = [] #DD.D (12.5)
dd_25 = [] #DD.D (12.25)
dd_75 = [] #DD.D (12.75)

for i in range(0,100):
    if i < 10:
        xx_dd.append('XX0' + str(i)) #Changes XX01 for XX1
        dd_5.append(str(i) + '.5')
        dd_25.append(str(i) + '.25')
        dd_75.append(str(i) + '.75')
    else:
        xx_dd.append('XX' + str(i))

#MISC
misc = ['X','XX','XXXX']
misc_2 = ['date','time','set','duration']

total_new_tokens = misc + dd_75 + dd_5 + dd_25 + xx_dd + operations + parts_of_duration_set + fuzzy_expresiones + parts_of_day + parts_of_year
total_new_tokens = [s.lower() for s in total_new_tokens] 
total_new_tokens += total_new_tokens + misc_2
total_new_tokens = list(set(total_new_tokens) - set(tokenizer.vocab.keys()))

print(f'New tokenizer length {len(tokenizer)}')

new_tokens = tokenizer.add_tokens(total_new_tokens)

print('New Tokens: ' + str(new_tokens))
print(f'New tokenizer length {len(tokenizer)}')

model.resize_token_embeddings(len(tokenizer))
del new_tokens

In [6]:
#TOKENIZE THE TEXT
def tokenize_function_for_chunk(examples):
    result = tokenizer(examples["String"])#, padding='max_length', truncation=True, max_length=512)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


In [None]:
columns_to_remove = []
tokenized_datasets_for_chunk = dataset_csv.map(
    tokenize_function_for_chunk, remove_columns=columns_to_remove, batched=True
)

print(tokenized_datasets_for_chunk)

## Make chunks

In [None]:
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm

#Sentences are concatenated until the CHUNK_SIZE is reached or exceeded. 
#If CHUNK_SIZE is exceeded, it is filled in with pad

tokenized_chunked_datasets = DatasetDict()
for dataset in tokenized_datasets_for_chunk:
  new_input_ids = [[]]
  new_attention_mask = [[]]
  new_word_ids = [[]]
  new_sentences = [[]]
  act_pos = 0
  act_len = 0
  list_files = dataset_csv[dataset]['File']
  list_sentences = dataset_csv[dataset]['Sentence']
  act_file = list_files[0]
  with tqdm(total = len(tokenized_datasets_for_chunk[dataset])) as pbar:
    for i, row in enumerate(tokenized_datasets_for_chunk[dataset]):
      if len(row['input_ids']) > chunk_size:
          False
      else:
          #If the chunk length is exceeded
          if len(row['input_ids']) + act_len >= chunk_size:
            #Padding
            #The tokens that are missing up to chunk size
            padding = chunk_size - len(new_input_ids[act_pos])
            new_input_ids[act_pos].insert(0, tokenizer.cls_token_id)
            new_input_ids[act_pos].extend([tokenizer.pad_token_id] * (padding - 1))
            new_attention_mask[act_pos].extend([0] * padding)
            #The word id of <pad> is None. That of all special tokens in fact.
            new_word_ids[act_pos].insert(0, None)
            new_word_ids[act_pos].extend([None] * (padding - 1))
            #Next
            act_pos += 1
            new_input_ids.append([])
            new_word_ids.append([])
            new_attention_mask.append([])
            new_sentences.append([])
            #Reset
            act_len = 0
    
          #Save actual register.
          new_input_ids[act_pos].extend(row['input_ids'][1:len(row['input_ids'])])
          new_word_ids[act_pos].extend(row['word_ids'][1:len(row['word_ids'])])
          new_attention_mask[act_pos].extend(row['attention_mask'][1:len(row['attention_mask'])])
          new_sentences[act_pos].append(list_sentences[i])
          act_len += len(row['input_ids'])
      pbar.update()

  #Last register
  if len(new_input_ids[len(new_input_ids) - 1]) != chunk_size:
    #The tokens that are missing up to chunk size
    padding = chunk_size - len(new_input_ids[act_pos])
    new_input_ids[act_pos].insert(0, tokenizer.cls_token_id)
    new_input_ids[act_pos].extend([tokenizer.pad_token_id] * (padding - 1))
    new_attention_mask[act_pos].extend([0] * padding)
    #The word id of <pad> is None. That of all special tokens in fact.
    new_word_ids[act_pos].insert(0, None)
    new_word_ids[act_pos].extend([None] * (padding - 1))
  
  dict_ = dict({'input_ids': new_input_ids, 'word_ids': new_word_ids, 'attention_mask': new_attention_mask, 'sentences': new_sentences})
  del list_sentences
  del list_files
  del new_input_ids
  del new_attention_mask
  del new_word_ids
  del new_sentences
  tokenized_chunked_datasets[dataset] = Dataset.from_dict(dict_)
  del dict_

## Add Labels

In [9]:
#Adds the labels column
import copy
def add_labels(examples):
  examples['labels'] = copy.deepcopy(examples['input_ids'])
  return examples
tokenized_chunked_datasets_labels = tokenized_chunked_datasets.map(add_labels, batched=True, remove_columns=['sentences']) 
tokenized_chunked_datasets_labels_train_dev = tokenized_chunked_datasets_labels.shuffle() #Shuffles the dataset randomly

# TRAIN

## Mask overwritte datacollator XLM

In [13]:
import random
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from random import randint
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
import math
import numpy as np
import torch
from transformers import DataCollatorForLanguageModeling

class MyDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    def __init__(self, tokenizer, mlm=True, mlm_probability=0.15):
        # You can add or modify the constructor if needed
        super().__init__(tokenizer=tokenizer, mlm=mlm, mlm_probability=mlm_probability)
        
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], Mapping):
            word_ids = [d.pop('word_ids') for d in examples] #Quita los word_ids y los guarda
            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            word_ids = [d.pop('word_ids') for d in examples]
            batch = {
                "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
                batch["input_ids"], word_ids, special_tokens_mask=special_tokens_mask
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        return batch

    def torch_mask_tokens(self, inputs: Any, word_ids: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
        import torch
        global counter
        global T #Total training steps
        global T_epoch #Training steps per epoch
        inputs, labels = mask_roberta_cir_xlm(inputs, inputs.clone(), word_ids)
        return inputs, labels

#### With masking decay function

In [5]:
import copy
from transformers.data.data_collator import default_data_collator
import random
#Cir 55%, expresión 30%, type 15%
def mask_roberta_cir_xlm(inputs, labels, word_ids):
      global counter #counter total
      global counter_epoch #counter per epoch/training cycle 
      global counter_eval #counter per test cycle
      global T #Total training steps
      global T_epoch #Training steps per epoch
      global T_eval #Eval steps per epoch
      #global vuelta
      #global vueltas
    
  #try:
      #--------Lineal Function Parameters--------#
      per = 0.5
      max_lineal = 1
      min_lineal = 0.7
      m = (min_lineal-1)/(T*per)
      #--------Logistic Function Parameters--------#
      #min_logistic = 0.2
      #a = 1/min_lineal
      #p = ((1/min_logistic) - a)/(math.e**(per)-1)
      #--------Cosine Function Parameters--------#
      min_cos = 0.2
      max_cos = min_lineal
      a = (max_cos-min_cos)/2
      b = min_cos
      #Flags
      lesser_ = False
      expression_ = False
      masked_type = False 
      masked_expresion = False 
      masked_cir = False 
      masked_act_word = False
      maskable_act_word = False
      word_id_act_word = -1
      input_id_aux = []
      greater_id = tokenizer.encode(' >')[1:len(tokenizer.encode(' >'))-1][0]
      lesser_id = tokenizer.encode('<A')[1:len(tokenizer.encode('<A'))-1][0]
      lesser_id_2 = tokenizer.encode('A<')[1:len(tokenizer.encode('A<'))-1][1]
      types_ids = tokenizer.encode('<date <duration <time <set')[1:len(tokenizer.encode('<date <duration <time <set'))-1] #DATE|DURATION|TIME|SET 
      types_ids = list(filter((lesser_id).__ne__, types_ids))
      len_cir = [9, 8, 10, 8] #DATE|DURATION|TIME|SET #Length of the VIR of each type
      unk_token = -100 
      
      probabilities = [0.4, 0.4, 0.25] #Probs masking type|string|VIR
      prob_mask_word = 0.1 #probs masking regular token
      for i, example in enumerate(inputs):
        input_id_aux = []
        lesser_ = False
        expression_ = False
        
        #Evaluation
        if counter_epoch%T_epoch == 0 and counter_epoch != 0:
            probs_global = 1
        #Training
        else :
            #Lineal function
            if counter <= T * per:
                probs_global = m*counter + max_lineal
            #Coseno / Logistic
            else: 
                t_ = counter - (T*per+1) #- (T*per+1) #Regularizes the current position so that t_=0 when it starts generating probs with the logistic/cosine function
                
                #Cosine
                probs_global = (1+math.cos((math.pi*t_)/((T-(T*per+1))))) * a + b
                #Logistic
                #probs_global = 1/((a-p)+(p*math.e**(t_/T)))
        for j, input_id in enumerate(example):
          #If this token belongs to the same word as the previous token and the previous token was masked, this token is masked as well.
          if maskable_act_word and word_id_act_word == word_ids[i][j]:
              inputs[i][j] = tokenizer.mask_token_id
          else:
              masked_act_word = False
              maskable_act_word = False
              word_id_act_word = -1
              #If I encounter a ‘<’ it may be that I have started an expression
              if (input_id == lesser_id or input_id == lesser_id_2) and expression_ == False:
                lesser_ = True
                labels[i][j] = unk_token
                input_id_aux = [] #Saves the ids of expression tokens
                masked_type = False #To find out if something has already been masked inside the expression
                masked_expresion = False 
                masked_cir = False
              #If I encounter a ‘>’ it means that the expression is over. 
              elif input_id == greater_id and expression_:
                lesser_ = False
                expression_ = False
                #'>' is never masked
                labels[i][j] = unk_token
                  
                #The expression has already been closed, so I mask based on input_id_aux.
                #To know the length of the cir I need to know the type of the expression
                len_cir_aux = [len_cir[n] for n, type_aux in enumerate(types_ids) if input_id_aux[0] == type_aux][0]
                #Probs masking something = 100%. To reduce the overall probability multiply the array of probabilities by the reducing quotient.
                random_value = random.random()
                #Masking probabilities taking into account overall probabilities
                act_probabilities = [num * probs_global for num in probabilities]
                for n, input_id_expresion in enumerate(input_id_aux):
                    #Probs masking type
                    if n == 0:
                        if random_value < act_probabilities[0]:
                            inputs[i][j-len(input_id_aux) + n] = tokenizer.mask_token_id #j is the end position of the expression, subtract the length of the expression and add n, which is the current position in the expression.
                            masked_type = True
                        else:
                          labels[i][j-len(input_id_aux) + n] = unk_token
                    #Probs masking expression
                    if (n >= 1 and n<len(input_id_aux)-len_cir_aux) and masked_type == False:
                        if random_value < act_probabilities[0] + act_probabilities[1]:
                            inputs[i][j-len(input_id_aux) + n] = tokenizer.mask_token_id #j is the end position of the expression, subtract the length of the expression and add n, which is the current position in the expression.
                            masked_expresion = True
                        else:
                          labels[i][j-len(input_id_aux) + n] = unk_token
                    #Probs masking cir
                    elif n >= len(input_id_aux)-len_cir_aux and (masked_expresion == False and masked_type == False)  and (random_value < act_probabilities[0] + act_probabilities[1] + act_probabilities[2]):
                        inputs[i][j-len(input_id_aux) + n] = tokenizer.mask_token_id #j is the end position of the expression, subtract the length of the expression and add n, which is the current position in the expression.
                        masked_cir = True
                    else:
                      labels[i][j-len(input_id_aux) + n] = unk_token   
              #If I have already encountered < and this token is part of [DATE, TIME, DURATION, SET] I am within an expression most probably
              elif (input_id in types_ids and lesser_) or expression_:
                expression_ = True
                input_id_aux.append(input_id)
              else:
                  lesser_ = False
                  expression_ = False
                  input_id_aux = []
                  #If not pad
                  if input_id not in tokenizer.all_special_ids:
                      if word_id_act_word != word_ids[i][j]:
                        masked_act_word = True
                      else:
                        masked_act_word = False
                      random_value = random.random()
                      #Probability taking into account the overall probability
                      act_prob_mask_word = prob_mask_word * probs_global
                      #Do mask the word
                      if random_value < act_prob_mask_word and masked_act_word:
                          inputs[i][j] = tokenizer.mask_token_id
                          maskable_act_word = True
                      #Do not mask the word
                      else:
                          labels[i][j] = unk_token
                  #If pad no mask
                  else:
                      labels[i][j] = unk_token
                  word_id_act_word = word_ids[i][j]
        
        #Training
        if counter_epoch%T_epoch != 0 or counter_epoch == 0:
            counter += 1
            counter_epoch += 1
        #Evaluation
        elif counter_eval%T_eval != 0 or counter_eval == 0:
            counter_eval += 1
        #Finish evaluation continue Training
        if counter_eval == T_eval:
            counter_epoch = 0 #Resets the variable for the next training cycle. The global counter is not reset. This variable is only for knowing when training and when evaluating.
            counter_eval = 0 #Resets the variable for the next evaluation
            
      return inputs, labels
  
  #except Exception as e:
      #print('Error en el proceso de enmascaramiento, step ' + str(counter))
      #print(e)
      #print('INPUTS:')
      #for inp in inputs:
      #    print(inp)
      #print('INPUT:')
      #print(inputs[i])
      #print('------------------------------------------------')
      #return inputs, labels


#### WITHOUT masking decay function

In [14]:
import copy
from transformers.data.data_collator import default_data_collator
import random
#Cir 55%, expresión 30%, type 15%
#SIN FUNCIÓN DE ENMASCARAMIENTO, SIEMPRE AL 100%
def mask_roberta_cir_xlm(inputs, labels, word_ids):
      global counter #counter total
      global counter_epoch #counter por cada epoch /ciclo de entrenamiento
      global counter_eval #counter por cada ciclo de evaluación
      global T #Total training steps
      global T_epoch #Training steps per epoch
      global T_eval #Eval steps per epoch
      #global vuelta
      #global vueltas
    
  #try:
      #Flags
      lesser_ = False
      expression_ = False
      masked_type = False 
      masked_expresion = False
      masked_cir = False 
      masked_act_word = False
      maskable_act_word = False
      word_id_act_word = -1 
      input_id_aux = []
      greater_id = tokenizer.encode(' >')[1:len(tokenizer.encode(' >'))-1][0]
      lesser_id = tokenizer.encode('<A')[1:len(tokenizer.encode('<A'))-1][0]
      lesser_id_2 = tokenizer.encode('A<')[1:len(tokenizer.encode('A<'))-1][1]
      types_ids = tokenizer.encode('<date <duration <time <set')[1:len(tokenizer.encode('<date <duration <time <set'))-1] #DATE|DURATION|TIME|SET 
      types_ids = list(filter((lesser_id).__ne__, types_ids))
      len_cir = [9, 8, 10, 8] #DATE|DURATION|TIME|SET #Length of each cir
      unk_token = -100
      
      probabilities = [0.4, 0.4, 0.25] #Probs masking type, cir, expression
      prob_mask_word = 0.1 #0.05
      probs_global = 0.8 
      probs_global_mask_word = 1 #Probs masking tokens outside expression
      
      for i, example in enumerate(inputs):
        input_id_aux = []
        lesser_ = False
        expression_ = False
        
        
        for j, input_id in enumerate(example):
          #If this token belongs to the same word as the previous token and the previous token was masked, this token is masked as well.
          if maskable_act_word and word_id_act_word == word_ids[i][j]:
              inputs[i][j] = tokenizer.mask_token_id
          else:
              masked_act_word = False
              maskable_act_word = False
              word_id_act_word = -1
              #If I encounter a ‘<’ it may be that I have started an expression
              if (input_id == lesser_id or input_id == lesser_id_2) and expression_ == False:
                lesser_ = True
                labels[i][j] = unk_token
                input_id_aux = [] 
                masked_type = False 
                masked_expresion = False
                masked_cir = False
              #If I encounter a ‘>’ it means that the expression is over. 
              elif input_id == greater_id and expression_:
                lesser_ = False
                expression_ = False
                labels[i][j] = unk_token
                  
                #To know the length of the cir I have to know the type of expression.
                len_cir_aux = [len_cir[n] for n, type_aux in enumerate(types_ids) if input_id_aux[0] == type_aux][0]
                random_value = random.random()
                random_value_global = random.random()
                #Probabilities of masking taking into account the overall probabilities
                act_probabilities = [num * probs_global for num in probabilities]
                #Probs of masking
                for n, input_id_expresion in enumerate(input_id_aux):
                    if n == 0:
                        if random_value < act_probabilities[0]:
                            inputs[i][j-len(input_id_aux) + n] = tokenizer.mask_token_id #j es posición final de la expresión, se le resta la longitud de la expresión y se le va sumando n, que es la posición acutal en la expresión
                            masked_type = True
                        else:
                          labels[i][j-len(input_id_aux) + n] = unk_token
                    if (n >= 1 and n<len(input_id_aux)-len_cir_aux) and masked_type == False:
                        if random_value < act_probabilities[0] + act_probabilities[1]:
                            inputs[i][j-len(input_id_aux) + n] = tokenizer.mask_token_id
                            masked_expresion = True
                        else:
                          labels[i][j-len(input_id_aux) + n] = unk_token
                    elif n >= len(input_id_aux)-len_cir_aux and (masked_expresion == False and masked_type == False)  and (random_value < act_probabilities[0] + act_probabilities[1] + act_probabilities[2]):
                        inputs[i][j-len(input_id_aux) + n] = tokenizer.mask_token_id
                        masked_cir = True
                    else:
                      labels[i][j-len(input_id_aux) + n] = unk_token   
              elif (input_id in types_ids and lesser_) or expression_:
                expression_ = True
                input_id_aux.append(input_id)
              else:
                  lesser_ = False
                  expression_ = False
                  input_id_aux = []
                  if input_id not in tokenizer.all_special_ids:
                      if word_id_act_word != word_ids[i][j]:
                        masked_act_word = True
                      else:
                        masked_act_word = False
                      random_value = random.random()
                      act_prob_mask_word = prob_mask_word * probs_global_mask_word
                      if random_value < act_prob_mask_word and masked_act_word:
                          inputs[i][j] = tokenizer.mask_token_id
                          maskable_act_word = True
                      else:
                          labels[i][j] = unk_token
                  else:
                      labels[i][j] = unk_token
                  word_id_act_word = word_ids[i][j]
            
      return inputs, labels
  
  #except Exception as e:
      #print('Error en el proceso de enmascaramiento, step ' + str(counter))
      #print(e)
      #print('INPUTS:')
      #for inp in inputs:
      #    print(inp)
      #print('INPUT:')
      #print(inputs[i])
      #print('------------------------------------------------')
      #return inputs, labels


## Freezing layers

In [None]:
for name, param in model.named_parameters():
     #if name.startswith("roberta.encoder.layer.1."): # choose whatever you like here
     #    param.requires_grad = False
     #if name.startswith("roberta.encoder.layer.2."): # choose whatever you like here
     #    param.requires_grad = False
     #if name.startswith("roberta.encoder.layer.3."): # choose whatever you like here
     #    param.requires_grad = False
     #if name.startswith("roberta.encoder.layer.4."): # choose whatever you like here
     #    param.requires_grad = False
     #if name.startswith("roberta.encoder.layer.5."): # choose whatever you like here
     #    param.requires_grad = False
     #if name.startswith("roberta.encoder.layer.6."): # choose whatever you like here
     #    param.requires_grad = False
     if name.startswith("roberta.encoder.layer.7."): # choose whatever you like here
         param.requires_grad = False
     if name.startswith("roberta.encoder.layer.8."): # choose whatever you like here
         param.requires_grad = False
     if name.startswith("roberta.encoder.layer.9."): # choose whatever you like here
         param.requires_grad = False
     if name.startswith("roberta.encoder.layer.10."): # choose whatever you like here
         param.requires_grad = False
     if name.startswith("roberta.encoder.layer.11."): # choose whatever you like here
         param.requires_grad = False
     if name.startswith("roberta.encoder.layer.12."): # choose whatever you like here
         param.requires_grad = False

## Params and Train

In [None]:
import math
from math import floor, ceil

num_cards = ! nvidia-smi --query-gpu=name --format=csv,noheader | wc -l

model_name = model_checkpoint.split("/")[-1]
batch_size_ = 10
lr = 8e-5
wd = 0.01
epochs = 3
#Total Training Steps
T = floor((len(tokenized_chunked_datasets_labels_train_dev["train"])*epochs) / (batch_size_*int(num_cards[0])))

args = TrainingArguments(
    f"{model_name}",
    save_strategy = 'epoch',
    evaluation_strategy = "epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size_,
    per_device_eval_batch_size=batch_size_,
    num_train_epochs=epochs,
    weight_decay=wd,
    logging_steps = 1,
    load_best_model_at_end = True,
    remove_unused_columns=False, 
    seed = 42,
    #push_to_hub=True,
    #report_to = 'wandb',
)

In [31]:
counter = 0
counter_epoch = 0
counter_eval = 0

#Total steps
T = ceil((len(tokenized_chunked_datasets_labels_train_dev["train"])) / (batch_size_*int(num_cards[0])))*epochs
#Steps per training epoch
T_epoch = ceil(len(tokenized_chunked_datasets_labels_train_dev["train"])/ (batch_size_*int(num_cards[0])))
#Steps per evaluation epoch
T_eval = ceil(len(tokenized_chunked_datasets_labels_train_dev["test"])/ (batch_size_*int(num_cards[0])))

data_collator = MyDataCollatorForLanguageModeling(tokenizer=tokenizer)

trainer = Trainer(
      model,
      args,
      train_dataset=tokenized_chunked_datasets_labels_train_dev["train"],
      eval_dataset=tokenized_chunked_datasets_labels_train_dev["test"],
      tokenizer=tokenizer,
      data_collator=data_collator,
  )

In [18]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33masdc-s5[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,1.572,1.894682
2,1.0885,1.776431
3,0.8449,1.429577




TrainOutput(global_step=393, training_loss=1.6128436866910707, metrics={'train_runtime': 559.514, 'train_samples_per_second': 13.957, 'train_steps_per_second': 0.702, 'total_flos': 3641726511286272.0, 'train_loss': 1.6128436866910707, 'epoch': 3.0})

In [28]:
#Save the model
model_name = 'XLM_normalization_BEST_MODEL'
trainer.save_model(model_name)

# Predict Tokens (genera los n primeros tokens predichos por el modelo)

In [None]:
#Gets the first n predictions (num_predicts) for an input (double_mask_sentence)
def predict_tokens(double_mask_sentence, model_, tokenizer):
  num_predicts = 1

  double_mask_input = tokenizer.encode(double_mask_sentence, return_tensors="pt").to(device)
  double_mask_input

  double_mask_token_logits = model_(double_mask_input)[0].squeeze().detach()
  is_masked = torch.where(double_mask_input == tokenizer.mask_token_id, 1, 0)
  masked_idxs = torch.nonzero(is_masked)
  probs= torch.softmax(double_mask_token_logits[masked_idxs[:,1]], dim=1)

  top_vocab_idxes = torch.topk(probs, num_predicts)
  lista = []
    
  for token_id in top_vocab_idxes[1]:
      lista.append(tokenizer.decode(token_id))
  return lista

In [None]:
#Load Trainer

args = TrainingArguments(
    f"temp-finetuned",
)
data_collator = MyDataCollatorForLanguageModeling(tokenizer=tokenizer)

trainer = Trainer(
      model,
      args,
      tokenizer=tokenizer,
      data_collator=data_collator,
  )

In [None]:
#Generate a TE value
import torch

sentence = '<duration 5 weeks <mask><mask><mask><mask><mask><mask><mask><mask> >'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ = trainer.model.to(device)
predicciones = predict_tokens(sentence, model_, trainer.tokenizer)
print(' '.join(predicciones))


In [None]:
#Generate a duration for each possibility
list = ['years', 'months', 'weeks', 'days', 'hours', 'minutes', 'seconds']
a = '<duration 12 '
b = ' <mask><mask><mask><mask><mask><mask><mask><mask> >'
for i in list:
    sentence = a + i + b
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_ = trainer.model.to(device)
    predicciones = predict_tokens(sentence, model_, trainer.tokenizer)
    print(f'{i}:{" ".join(predicciones)}')