<a href="https://colab.research.google.com/github/alexisdr/uned-tfg/blob/main/UNED-TFG-3-train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parameters

---



In [24]:
import os

# Root folder (inside the mounted Google Drive) that holds the project corpus.
base_path = '/drive/My Drive/CorpusPFG/'

# Processed datasets
dataset_path = base_path + 'Dataset'

# Model parameters. Exactly one CHECKPOINT assignment is active; the
# commented-out lines are the alternative checkpoints kept for experiments.
#CHECKPOINT = "allenai/led-base-16384"
#CHECKPOINT = "allenai/longformer-base-4096"
#CHECKPOINT = "bert-base-multilingual-cased"
CHECKPOINT = "PlanTL-GOB-ES/bsc-bio-es"
#CHECKPOINT = "PlanTL-GOB-ES/bsc-bio-ehr-es"
#CHECKPOINT = "PlanTL-GOB-ES/longformer-base-4096-biomedical-clinical-es"
NUM_EPOCHS = 1
BATCH_SIZE = 16
# Name of the metric used to select the best model.
METRIC_NAME = "f1"
# SECURITY: the access token must never be written into the notebook. It is
# read from the HUGGING_FACE_TOKEN environment variable and is None when the
# variable is unset. Any token that was previously committed in this file
# should be revoked in the Hugging Face account settings.
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

# Averaging mode passed to the scikit-learn metric functions.
METRIC_AVERGE = "micro"

# Upload the trained model and tokenizer to the Hugging Face Hub.
subir_a_hugging_faces = True
# Shrink the dataset splits for quick test runs.
reducir_tamanyo_dataset_para_pruebas = False

## Set-up environment

First, we install the libraries we'll use: HuggingFace Transformers, Datasets, Evaluate and Accelerate.

In [25]:
# Quietly (-q) install the notebook dependencies: datasets, evaluate,
# transformers (with the sentencepiece extra) and accelerate.
!pip install -q datasets evaluate transformers[sentencepiece] accelerate
# To run the training on TPU, you will need to uncomment the following line:
#!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3021, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 160, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 241, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 499, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 

## Load dataset

Next, let's load a multi-label text classification dataset from files.


In [26]:
from google.colab import drive

# Mount Google Drive at /drive; base_path ('/drive/My Drive/...') lives under it.
drive.mount('/drive')

In [None]:
from datasets import DatasetDict

# Load the DatasetDict stored on disk under dataset_path.
dataset = DatasetDict.load_from_disk(dataset_path)

In [None]:
# Show the loaded DatasetDict as the cell output.
dataset

Let's optionally shrink the splits for quick test runs and then check an example of the training split:

In [None]:
# Optional quick-run mode: replace the train and test splits with the 10 %
# "test" part that train_test_split carves out of each of them. The
# validation split is left untouched.
if reducir_tamanyo_dataset_para_pruebas:
    dataset_train_reducido = dataset['train'].train_test_split(test_size=0.1)
    dataset_test_reducido = dataset['test'].train_test_split(test_size=0.1)

    dataset['train'] = dataset_train_reducido['test']
    dataset['test'] = dataset_test_reducido['test']

In [None]:
# Show the first example of the training split.
dataset['train'][0]

In [None]:
# Show the feature schema of the training split.
dataset['train'].features

The dataset consists of texts, each labeled with one or more classes.

Let's create a list that contains the labels, as well as 2 dictionaries that map labels to integers and back.

In [None]:
from datasets import ClassLabel

# ClassLabel feature that describes the "label" column of the training split.
class2label = dataset['train'].features["label"]
# Build both lookup tables from the public `names` list instead of the private
# `_int2str` / `_str2int` attributes. label2id is a fresh dict, so changing it
# cannot corrupt the ClassLabel's internal state.
id2label = dict(enumerate(class2label.names))
label2id = {label: idx for idx, label in id2label.items()}

print(class2label)
print(id2label)
print(label2id)

In [None]:
# Spot check: show the label name stored for class id 256.
class2label.int2str(256)

## Preprocess data

As models like BERT don't expect text as direct input, but rather `input_ids`, etc., we tokenize the text using the tokenizer. Here I'm using the `AutoTokenizer` API, which will automatically load the appropriate tokenizer based on the checkpoint on the hub.

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

# Load the tokenizer that belongs to CHECKPOINT, authenticating with the token.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, use_auth_token=HUGGING_FACE_TOKEN)

def preprocess_data(example):
    """Tokenize the text and attach multi-hot label vectors.

    Accepts either a single example (``example["text"]`` is a ``str`` and
    ``example["label_list"]`` a list of class ids) or a batch as passed by
    ``dataset.map(..., batched=True)`` (``example["text"]`` is a list of
    strings and ``example["label_list"]`` a list of class-id lists).

    Returns:
        The tokenizer encoding with an added ``"labels"`` entry. For a single
        example this is one vector of length ``class2label.num_classes``; for
        a batch it is one such vector per example. Positions of the example's
        own classes are 1.0, all others 0.0.
    """
    # Take the text and encode it with the tokenizer.
    text = example["text"]
    encoding = tokenizer(text, padding="max_length", truncation=True)

    is_single = isinstance(text, str)
    label_lists = [example["label_list"]] if is_single else example["label_list"]

    # One row per example, one column per class.
    labels_matrix = np.zeros((len(label_lists), class2label.num_classes))
    for row, clases in enumerate(label_lists):
        # Activate only this example's classes. Writing to every row here
        # would give each example the union of all labels in the batch.
        labels_matrix[row, np.asarray(clases, dtype=int)] = 1

    # Store the active classes alongside the encoding.
    encoding["labels"] = (
        labels_matrix[0].tolist() if is_single else labels_matrix.tolist()
    )

    return encoding

In [None]:
# Show the raw text of the first training example.
dataset['train'][0]['text']

In [None]:
# Try the preprocessing function on a single (non-batched) training example.
preprocess_data(dataset['train'][0])

In [None]:
# Run preprocess_data over every split in batches and drop the original
# columns, so only the fields returned by preprocess_data remain.
encoded_dataset = dataset.map(
    preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
# Return PyTorch tensors when the encoded dataset is indexed.
encoded_dataset.set_format("torch")

In [None]:
# Show the encoded DatasetDict as the cell output.
encoded_dataset

In [None]:
# Decode the token ids of the first encoded training example back to text
# to check the tokenization by eye.
example = encoded_dataset['train'][0]
tokenizer.decode(example['input_ids'])

In [None]:
# Show the label data stored for that example.
example['labels']

In [None]:
#class2label.int2str(example['labels'].item())

## Train the model!

We are going to train the model using HuggingFace's Trainer API.

In [None]:
from torch.utils.data import DataLoader

# Collator that pads the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Use BATCH_SIZE, the same per-device batch size given to TrainingArguments,
# so that len(train_dataloader) matches the number of steps the Trainer runs
# per epoch. The scheduler length in training_model() is derived from it.
train_dataloader = DataLoader(
    encoded_dataset["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    encoded_dataset["validation"], batch_size=BATCH_SIZE, collate_fn=data_collator
)

# Sanity check: fetch one training batch and show the shape of each tensor.
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

In [None]:
from tqdm.auto import tqdm
from accelerate import Accelerator
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
import torch
from transformers import get_scheduler  
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction

#metrics
def multi_label_metrics(predictions, labels, threshold=0.9):
    """Score multi-label predictions against the reference labels.

    Args:
        predictions: raw model outputs (logits); a sigmoid is applied to them.
        labels: reference multi-hot labels, passed to scikit-learn as ``y_true``.
        threshold: minimum probability for a class to count as predicted.

    Returns:
        A dict with the precision, recall, F1 and ROC AUC scores (averaged as
        configured by METRIC_AVERGE) and the accuracy, plus the binarized
        predictions under 'y_pred' and the reference labels under 'y_true'.
        The ROC AUC is computed from the binarized predictions, not from the
        probabilities.
    """
    # Logits -> probabilities.
    probabilities = torch.sigmoid(torch.Tensor(predictions))
    # Probabilities -> 0/1 decisions, stored as a float array.
    above_threshold = (probabilities >= threshold).numpy()
    y_pred = above_threshold.astype(np.float64)
    y_true = labels

    return {
        'precision_score': precision_score(y_true, y_pred, average=METRIC_AVERGE),
        'recall_score': recall_score(y_true, y_pred, average=METRIC_AVERGE),
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average=METRIC_AVERGE),
        'roc_auc': roc_auc_score(y_true, y_pred, average=METRIC_AVERGE),
        'accuracy': accuracy_score(y_true, y_pred),
        'y_pred': y_pred,
        'y_true': y_true,
    }

def compute_metrics(p: EvalPrediction):
    """Adapt multi_label_metrics to the Trainer's ``compute_metrics`` hook.

    Args:
        p: evaluation output; ``p.predictions`` holds the model outputs (the
            first element is used when it is a tuple) and ``p.label_ids`` the
            reference labels.

    Returns:
        The scalar entries of the dict produced by multi_label_metrics. The
        array-valued entries ('y_pred', 'y_true') are left out because the
        returned values are logged and stored as evaluation metrics, which
        need plain numbers.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    metrics = multi_label_metrics(predictions=preds, labels=p.label_ids)
    # np.ndim(...) == 0 keeps Python and NumPy scalars and drops the arrays.
    return {name: value for name, value in metrics.items() if np.ndim(value) == 0}

def training_model():
    """Fine-tune CHECKPOINT for multi-label classification with the Trainer API.

    Reads its configuration from module-level names (CHECKPOINT, NUM_EPOCHS,
    BATCH_SIZE, METRIC_NAME, HUGGING_FACE_TOKEN, MODEL_OUTPUT_DIR,
    subir_a_hugging_faces) and the prepared objects (class2label, id2label,
    label2id, encoded_dataset, data_collator, tokenizer). Trains on the
    "train" split, evaluates on "validation" every epoch and, when
    subir_a_hugging_faces is set, also pushes the tokenizer to the Hub.
    """
    import math

    accelerator = Accelerator()

    model = AutoModelForSequenceClassification.from_pretrained(
        CHECKPOINT,
        num_labels=class2label.num_classes,
        id2label=id2label,
        label2id=label2id,
        problem_type="multi_label_classification",  # alternative: "single_label_classification"
        use_auth_token=HUGGING_FACE_TOKEN,
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=3e-5)
    model, optimizer = accelerator.prepare(model, optimizer)

    # Derive the scheduler length from the batch size the Trainer really uses
    # (BATCH_SIZE per process). A dataloader built with a different batch size
    # would give a decay schedule of the wrong length.
    steps_per_epoch = math.ceil(
        len(encoded_dataset["train"]) / (BATCH_SIZE * accelerator.num_processes)
    )
    num_training_steps = NUM_EPOCHS * steps_per_epoch
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # NOTE(review): a ready-made optimizer is handed to the Trainer below, so
    # learning_rate and weight_decay here presumably do not affect it (the
    # optimizer was created with lr=3e-5) - confirm which values are intended.
    training_args = TrainingArguments(
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        output_dir=MODEL_OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=2e-5,
        weight_decay=0.01,
        metric_for_best_model=METRIC_NAME,
        hub_token=HUGGING_FACE_TOKEN,
        hub_private_repo=True,
        push_to_hub=subir_a_hugging_faces,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        compute_metrics=compute_metrics,
        optimizers=(optimizer, lr_scheduler),
    )

    trainer.train()

    if subir_a_hugging_faces:
        tokenizer.push_to_hub(
            MODEL_OUTPUT_DIR, private=True,
            use_auth_token=HUGGING_FACE_TOKEN)

In [None]:
from accelerate import notebook_launcher

# Output directory / Hub repository name; training_model() reads it as a global.
MODEL_OUTPUT_DIR = "uned-tfg-08.19"
# Run the training function through Accelerate's notebook launcher.
notebook_launcher(training_model)