* Veure
https://discuss.huggingface.co/t/most-efficient-multi-label-classifier/9296/2

* The Artificial Guy - MULTI-LABEL TEXT CLASSIFICATION USING BERT AND PYTORCH
https://www.youtube.com/watch?v=f-86-HcYYi8

* Saurabh Anand - BERT for Multi-Label Classification
https://www.youtube.com/watch?v=JjcxZPNZbUY

* KGP Talkie - 5 - Multi-Label Text Classification Model with DistilBERT and Hugging Face Transformers in PyTorch
https://www.youtube.com/watch?v=ZYc9za75Chk

* Fine Tuning BERT for a Multi-Label Classification Problem on Colab - https://medium.com/@abdurhmanfayad_73788/fine-tuning-bert-for-a-multi-label-classification-problem-on-colab-5ca5b8759f3f

* BERT and DistilBERT Models for NLP - https://medium.com/@kumari01priyanka/bert-and-distilbert-model-for-nlp-7352eb16915e

* Choosing the Right Colab Runtime: A Guide for Data Scientists and Analysts - https://drlee.io/choosing-the-right-colab-runtime-a-guide-for-data-scientists-and-analysts-57ee7b7c9638

* distilbert / distilbert-base-uncased - https://huggingface.co/distilbert/distilbert-base-uncased

* "DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter" - https://arxiv.org/abs/1910.01108

In [None]:
from google.colab import drive

# Mount the user's Google Drive inside the Colab VM; later cells copy the
# project files from the mounted folder.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the project's Python modules, text files, CSV files and the EURLEX57K
# JSON documents from the mounted Drive folder into /content.
!cp /content/drive/MyDrive/TFM-MUECIM/*.py /content
!cp /content/drive/MyDrive/TFM-MUECIM/*.txt /content
!cp /content/drive/MyDrive/TFM-MUECIM/*.csv /content
!cp /content/drive/MyDrive/TFM-MUECIM/data/datasets/EURLEX57K/*.json /content

In [None]:
!pip install transformers



In [None]:
import sys

# Directory that holds the project modules and data files copied from Drive.
baseDir = '/content' #/drive/My Drive/TFM-MUECIM'

# Make the project modules importable. The guard keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path.
if baseDir not in sys.path:
    sys.path.append(baseDir)

In [None]:
import os
import torch
from tfm_LabelLoader import LabelLoader
from tfm_EURLEX57KDataset import EURLEX57KDataset
from torch.utils.data import DataLoader, random_split
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [None]:
def countNonZeroItems(items):
    """Return the number of non-zero elements in the tensor ``items`` as a Python int."""
    return int(torch.count_nonzero(items).item())

Segons el notebook:
Fine-tuning BERT (and friends) for multi-label text classification.ipynb

https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=HgpKXDfvKBxn  

In [None]:
# Fix the global PyTorch RNG seed so that later random operations are
# repeatable from one run to the next.
SEED = 0
torch.manual_seed(SEED)

<torch._C.Generator at 0x7d7d86a9be90>

In [None]:
# Load the label set via the project's LabelLoader (rooted at baseDir) and
# display how many labels it exposes.
labelLoader = LabelLoader(baseDir)
len(labelLoader.labels)

7201

In [None]:
# Build the dataset from baseDir and the 'EURLEX57KDataFrame.csv' file.
ds = EURLEX57KDataset(baseDir,'EURLEX57KDataFrame.csv')

# Hold out fixed-size validation and test sets; everything else is training
# data. random_split requires the lengths to sum to len(ds), so the training
# size is derived from the dataset instead of being hard-coded (it is 45000
# when the dataset holds 57000 documents).
VAL_SIZE = 6000
TEST_SIZE = 6000
trainSize = len(ds) - VAL_SIZE - TEST_SIZE
if trainSize <= 0:
    raise ValueError(
        f'Dataset has {len(ds)} items, too few for a {VAL_SIZE}/{TEST_SIZE} '
        'validation/test hold-out')
trainData, valData, testData = random_split(ds, [trainSize, VAL_SIZE, TEST_SIZE])

In [None]:
# Number of samples per batch; the training arguments later in the notebook
# reuse this value.
batchSize = 10

# Shuffled loaders over the three splits, kept in case a more classical
# (hand-written) training pipeline is used.
trainDataLoader, valDataLoader, testDataLoader = (
    DataLoader(split, batch_size=batchSize, shuffle=True)
    for split in (trainData, valData, testData)
)

In [None]:
# Peek at the first validation batch only and print a short summary of each
# sample it contains.
for i, batch in enumerate(valDataLoader):
  print(f'Batch {i}: ')
  samples = zip(
      batch.get('fileName'),
      batch.get('input_ids'),
      batch.get('attention_mask'),
      batch.get('labels'),
  )

  for sampleFile, inputIds, attentionMask, sampleLabels in samples:
    print(f'fileName: {sampleFile}')
    print(f'input_ids (5 first elements):\n{inputIds[:5]}')
    print(f'attention_masks (5 first elements):\n{attentionMask[:5]}')
    print(f'Nonzero labels:{countNonZeroItems(sampleLabels)}\n')

  # a single batch is enough for this sanity check
  break

print('Done!')

Batch 0: 
fileName: data/datasets/EURLEX57K/train/32014D0241.json
input_ids (5 first elements):
tensor([ 2297,  1013, 22343,  1013,  7327])
attention_masks (5 first elements):
tensor([1, 1, 1, 1, 1])
Nonzero labels:6

fileName: data/datasets/EURLEX57K/train/32003R0205.json
input_ids (5 first elements):
tensor([ 3222,  7816,  1006, 14925,  1007])
attention_masks (5 first elements):
tensor([1, 1, 1, 1, 1])
Nonzero labels:6

fileName: data/datasets/EURLEX57K/test/32002R1451.json
input_ids (5 first elements):
tensor([ 3222,  7816,  1006, 14925,  1007])
attention_masks (5 first elements):
tensor([1, 1, 1, 1, 1])
Nonzero labels:3

fileName: data/datasets/EURLEX57K/train/32005D0607.json
input_ids (5 first elements):
tensor([2384, 1013, 3438, 2581, 1013])
attention_masks (5 first elements):
tensor([1, 1, 1, 1, 1])
Nonzero labels:7

fileName: data/datasets/EURLEX57K/test/32005R1989.json
input_ids (5 first elements):
tensor([ 3222,  7816,  1006, 14925,  1007])
attention_masks (5 first elements):

In [None]:
# Number of labels; the model configuration below uses it as num_labels.
len(labelLoader.labels)

7201

In [None]:
# bert huggingface pretrained model
from tfm_LabelIndex import LabelIndex
from transformers import AutoConfig, AutoModelForSequenceClassification

# Hub identifier of the checkpoint; shared by the config and the model load
# so the two can never drift apart.
MODEL_NAME = 'distilbert-base-uncased'

# Project helper that provides the id2label / label2id mappings.
labelIndex = LabelIndex(baseDir)

# Local cache directory for the Hugging Face downloads; created if missing.
cache_dir = os.path.join(baseDir, 'tfm_cache')
os.makedirs(cache_dir, exist_ok=True)

# Configuration for a multi-label classification head with one output per label.
# force_download is intentionally not set: forcing a re-download on every run
# defeats the cache directory configured above.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    cache_dir=cache_dir,
    num_labels=len(labelLoader.labels),
    problem_type='multi_label_classification',
    id2label=labelIndex.id2label,
    label2id=labelIndex.label2id
)

# Load the pretrained weights with the configuration above. The classification
# head is newly initialised and has to be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=config,
    cache_dir=cache_dir
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the module tree of the loaded model.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Sanity check: push one training sample through the not-yet-trained model.
item = trainData[0]

# Keep at most 512 tokens and add a leading batch dimension of size 1.
MAX_TOKENS = 512
outputs = model(
    input_ids=item['input_ids'][:MAX_TOKENS].unsqueeze(0),
    attention_mask=item['attention_mask'][:MAX_TOKENS].unsqueeze(0),
    labels=item['labels'].unsqueeze(0),
)

In [None]:
# Raw (pre-sigmoid) scores for the single sample in the batch.
outputs.logits[0]


tensor([-0.0135,  0.2179, -0.0536,  ...,  0.1056, -0.0316, -0.1581],
       grad_fn=<SelectBackward0>)

In [None]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
# calculate metrics for the single sample passed through the model above

import numpy as np
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(outputs.logits[0])
# NumPy copy of the probabilities, detached from the autograd graph.
probs_np = probs.detach().cpu().numpy()
threshold = 0.5
# A label is predicted when its probability reaches the threshold.
y_pred = (probs_np >= threshold).astype(float)
y_true = item['labels'].cpu().numpy() # Convert y_true to a NumPy array
f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
# ROC AUC is a ranking metric, so it is scored on the probabilities rather
# than on the thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_true, probs_np, average = 'micro')
accuracy = accuracy_score(y_true, y_pred)

metrics = {'f1': f1_micro_average,
           'roc_auc': roc_auc,
           'accuracy': accuracy}


In [None]:
# Show the metrics computed for the single untrained forward pass.
metrics

{'f1': 0.5036800444382724,
 'roc_auc': 0.25201500833796553,
 'accuracy': 0.5036800444382724}

In [None]:
# https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb

import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import EvalPrediction


def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-F1, micro ROC AUC and accuracy for multi-label predictions.

    Args:
        predictions: raw model logits (anything ``torch.Tensor`` accepts).
        labels: ground-truth label indicators matching ``predictions``.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``.
    """
    # Turn logits into per-label probabilities.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    y_true = labels
    y_pred = (probs >= threshold).astype(float)
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric, so it is scored on the probabilities rather
    # than on the thresholded 0/1 predictions.
    roc_auc = roc_auc_score(y_true, probs, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # define dictionary of metrics to return
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    """Metrics callback for the Trainer: score the predictions held in ``p``.

    When ``p.predictions`` is a tuple, only its first element is scored.
    """
    raw = p.predictions
    if isinstance(raw, tuple):
        raw = raw[0]
    return multi_label_metrics(predictions=raw, labels=p.label_ids)

def metricsForTestSet():
    """Predict on the global ``testData`` with the global ``trainer`` and print the metrics dict."""
    output = trainer.predict(testData)
    rawPreds = output.predictions
    # some models return a tuple; the first element is what gets scored
    if isinstance(rawPreds, tuple):
        rawPreds = rawPreds[0]
    print(multi_label_metrics(predictions=rawPreds, labels=output.label_ids))

# Metric used to select the best checkpoint; it is one of the keys of the
# dict returned by compute_metrics.
metricName = 'f1'

# training arguments
trainArgs = TrainingArguments(
    'tfm_oputput',  # output directory (NOTE(review): likely a typo for 'tfm_output'; kept unchanged)
    report_to='none',  # deactivate wandb  reports. Alternative -> TensorBoard
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batchSize,
    per_device_eval_batch_size=batchSize,
    num_train_epochs=1, # 1 epoch
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metricName)

trainer = Trainer(
    model=model,
    args=trainArgs,
    train_dataset=trainData,
    eval_dataset=valData,
    compute_metrics = compute_metrics,
    #data_collator = Data_Processing(),
)

# SECURITY: an API key used to be pasted here in clear text and has been
# removed. Never commit credentials to the notebook; read them from Colab
# secrets or environment variables, and revoke the key that was exposed.

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move the model to the correct device before training.
model.to(device)

# epoch 0 - baseline: validation and test metrics before any fine-tuning
trainer.evaluate()
metricsForTestSet()

# training
trainer.train()




Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
# https://stackoverflow.com/questions/42703500/how-do-i-save-a-trained-model-in-pytorch
import shutil
from datetime import datetime

# File name carries today's date as a prefix, e.g. 20240131_tfm_model.pt
prefixDate = datetime.today().strftime('%Y%m%d')
fileName = f'{prefixDate}_tfm_model.pt'
modelFullPath = os.path.join(baseDir,fileName)
drivePath = '/content/drive/MyDrive/TFM-MUECIM'
destFullPath = os.path.join(drivePath,fileName)
# Save the model locally first, then copy the file to Drive.
# NOTE(review): torch.save(model, ...) pickles the whole module object, so
# loading it later needs the same class definitions to be importable.
torch.save(model, modelFullPath)
shutil.copyfile(modelFullPath, destFullPath)

# Metrics after training. print is a function in Python 3, and a bare call
# takes no trailing colon; both used to be SyntaxErrors in this cell.
print('Evaluate epoch')
trainer.evaluate()
metricsForTestSet()