## Setup

In [1]:
# Shell escape: print the NVIDIA driver/CUDA versions and the current GPU memory and utilisation.
!nvidia-smi

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Mon Jul  3 20:25:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:0A:00.0 Off |                  Off |
|  0%   45C    P8    31W / 450W |      1MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                         

In [2]:
import os

# remove any unwanted garbage using the collector
import gc
gc.collect()

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pytorch_lightning as pl
import torchmetrics
import torch
import torch.nn as nn 
import tqdm
import warnings

from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import BertTokenizerFast as BertTokenizer
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, multilabel_confusion_matrix


2023-07-03 20:25:47.282953: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


References:

- https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/
- https://www.youtube.com/watch?v=vNKIg8rXK6w&ab_channel=rupertai


In [3]:
# Select the "high" float32 matmul precision mode, then seed PyTorch's global
# random generator so that repeated runs start from the same state.
torch.set_float32_matmul_precision("high")
torch.manual_seed(seed=42)

<torch._C.Generator at 0x7f82bada9b70>

In [4]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
# Project directories, all expressed relative to the working directory.
path = "."
os.chdir(path)
data_path, output_path, fig_path = (
    f"{path}/{sub_dir}" for sub_dir in ("data", "outputs", "figs")
)

In [6]:
# Suppression des FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

## Load Dataset

In [7]:
## load data (takes around 1min30s)
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
def _read_split(file_name, title):
    """Unpickle one dataset split found under ``data_path`` and print its shape."""
    frame = pd.read_pickle(os.path.join(data_path, file_name))
    print(title, frame.shape)
    return frame


df_train = _read_split("train_dataset_for_DL.pkl", "Train dataset: ")
df_test = _read_split("test_dataset_for_DL.pkl", "Test dataset: ")
df_valid100 = _read_split("valid100_dataset_for_DL.pkl", "Validation dataset: ")

Train dataset:  (125264, 103022)
Test dataset:  (29244, 103022)
Validation dataset:  (100, 103022)


In [8]:
# Inspect one training example: its description and the labels that are set on it.
row_id = 64
# Every column except the last one is treated as a label column.
label_cols = df_train.columns[:-1]
sample_row = df_train.iloc[row_id]
sample_descr = sample_row["descr"]
sample_labels = sample_row[label_cols]

print("Description: ", sample_descr)
# Keep only the labels whose value is non-zero for this row.
print("Concepts: ", sample_labels[sample_labels.ne(0)].to_dict())


Description:  La bataille mondiale des matières premières Dans le débat sur un nouvel ordre économique international, les marchés mondiaux des matières premières constituent un enjeu de première importance. Ils conditionnent largement les moyens de financement du développement de pays pauvres et sont un des lieux stratégiques où se joue l'indépendance des pays. L'auteur analyse d'abord les mécanismes et les acteurs des marchés libres, mettant en lumière les limites du jeu libéral de l'offre et de la demande. Son examen des divers systèmes de régulation qui ont été expérimentés l'amènent ensuite à émettre de sérieuses réserves sur l'efficacité des stocks régulateurs. De même, les accords compensatoires (type prêts du FMI) se heurtent-ils à des difficultés théoriques et concrètes de mise en place. La régulation de l'offre n'a véritablement réussi que dans le cas du pétrole. Des solutions plus radicales existent en dehors d'un fonctionnement aménagé du marché : ouverture unilatérale des f

## Build the model

In [9]:
# Build Deep Learning Model with BERT/PyTorch
# `BertTokenizer` is the alias bound in the notebook's import cell
# (`BertTokenizerFast as BertTokenizer`). The extra import that used to sit in
# this cell silently replaced that alias with the slow tokenizer class, so it
# has been dropped: all imports live in the import cell.
BERT_MODEL_NAME = 'bert-base-multilingual-cased'
# Cased checkpoint, so the text must not be lower-cased.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME, do_lower_case=False)

#### Tokenization example

In [10]:
# Simple tokenizer call: pad/truncate one French sentence to 20 tokens.
text_example = 'Je regarderai la serie à la télévision avec mes enfants ce soir'
bert_input = tokenizer(
    text_example,
    max_length=20,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)
# Token ids, then the shapes of the ids and of the mask.
print(bert_input["input_ids"])
print(bert_input["input_ids"].shape, bert_input["attention_mask"].shape)
# Segment ids and attention mask.
print(bert_input["token_type_ids"])
print(bert_input["attention_mask"])
# Decode the ids back to text (special tokens included).
print(tokenizer.decode(bert_input["input_ids"][0]))

tensor([[  101, 13796, 42047, 12015, 10116, 10109, 11185,   254, 10109, 33110,
         10460, 17954, 18374, 10794, 50520,   102,     0,     0,     0,     0]])
torch.Size([1, 20]) torch.Size([1, 20])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])
[CLS] Je regarderai la serie à la télévision avec mes enfants ce soir [SEP] [PAD] [PAD] [PAD] [PAD]


In [11]:
# More complex tokenizer call: every encoding option is spelled out explicitly.
encoding = tokenizer.encode_plus(
    text_example,
    add_special_tokens=True,
    max_length=20,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors="pt",
)

# Which fields does the encoding contain?
print(encoding.keys())
# Shapes of the ids and of the mask
print(encoding["input_ids"].shape, encoding["attention_mask"].shape)
# Raw contents of both tensors
print(encoding["input_ids"])
print(encoding["attention_mask"])
# Map the ids back to word pieces
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"][0]))

dict_keys(['input_ids', 'attention_mask'])
torch.Size([1, 20]) torch.Size([1, 20])
tensor([[  101, 13796, 42047, 12015, 10116, 10109, 11185,   254, 10109, 33110,
         10460, 17954, 18374, 10794, 50520,   102,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])
['[CLS]', 'Je', 'regard', '##era', '##i', 'la', 'serie', 'à', 'la', 'télévision', 'avec', 'mes', 'enfants', 'ce', 'soir', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [12]:
# # Number of tokens by description
# token_counts = []
# df_train_sample = df_train.sample(n=20000, random_state=42)
# for _, row in df_train_sample.iterrows():
#   token_count = len(tokenizer.encode(
#     row["descr"],
#     max_length=512,
#     truncation=True
#   ))
#   token_counts.append(token_count)

In [13]:
# # Histplot
# sns.histplot(token_counts)
# plt.xlim([0, 512])

Most of the descriptions contain either fewer than 300 tokens or more than 512, so we'll stick with a limit of 512 tokens.

In [14]:
# Sequence length handed to the datasets as `max_token_len`: descriptions are
# truncated / padded to this many tokens when they are encoded.
MAX_TOKEN_COUNT = 512

In [15]:
class RameauLabelDataset(Dataset):
    """Map-style dataset yielding one tokenized description and its label vector.

    Args:
        data: Frame holding a text column named ``descr`` plus one numeric
            column per label.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face BERT
            tokenizer).
        max_token_len: Length every encoded sequence is padded / truncated to.
        label_cols: Names of the label columns. Defaults to every column of
            ``data`` other than ``descr``, so the dataset no longer depends on
            a notebook-level ``label_cols`` variable being defined first.
    """

    def __init__(self, data: pd.DataFrame, tokenizer: "BertTokenizer", max_token_len: int = 128, label_cols=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        if label_cols is None:
            label_cols = data.columns[data.columns != "descr"]
        # Stored as an Index so that row look-ups by label stay cheap.
        self.label_cols = pd.Index(label_cols)

    def __len__(self):
        """Number of rows (examples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return the ``index``-th example.

        Returns:
            dict with the raw text (``descr``), the 1-D ``input_ids`` and
            ``attention_mask`` tensors of length ``max_token_len``, and the
            float32 ``labels`` tensor (one entry per label column).
        """
        data_row = self.data.iloc[index]
        descr = data_row["descr"]
        # Explicit float32 conversion: the row of a mixed-dtype frame is an
        # object Series, which is not a reliable input for a tensor constructor.
        labels = data_row[self.label_cols].to_numpy(dtype="float32")

        encoding = self.tokenizer.encode_plus(
            descr,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        return dict(
            descr=descr,
            # The tokenizer returns (1, max_token_len) tensors; drop the batch axis.
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.tensor(labels)
        )

In [16]:
# Build the training dataset and look at the fields of its first item.
train_dataset = RameauLabelDataset(
    data=df_train,
    tokenizer=tokenizer,
    max_token_len=MAX_TOKEN_COUNT,
)
sample_item = train_dataset[0]
sample_item.keys()

dict_keys(['descr', 'input_ids', 'attention_mask', 'labels'])

In [17]:
# Show the text, the label vector and the encoded length of the sample item.
for _title, _value in (
    ("Description: ", sample_item["descr"]),
    ("Labels: ", sample_item["labels"]),
    ("Shape: ", sample_item["input_ids"].shape),
):
    print(_title, _value)

Description:  La culture pour vivre Mort de la culture populaire en France. Mutation des institutions culturelles grâce à une technique de mise en relation des oeuvres et d'un public, et qui tend à créer un comportement culturel adapté aux caractéristiques de l'époque
Labels:  tensor([0., 0., 0.,  ..., 0., 0., 0.])
Shape:  torch.Size([512])


In [18]:
# Load pretrained model and pass a sample of batch data
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

# A single batch of 8 examples is enough for a shape check.
sample_batch = next(iter(DataLoader(train_dataset, batch_size=8, num_workers=2)))

# Shape check only: run without autograd so no graph/activations are retained.
with torch.no_grad():
    output = bert_model(
        sample_batch["input_ids"],
        attention_mask=sample_batch["attention_mask"],
    )
print(output.last_hidden_state.shape, output.pooler_output.shape)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([8, 512, 768]) torch.Size([8, 768])


In [19]:
class RameauLabelDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train / test / validation frames.

    Args:
        train_df: Frame used for the training dataset.
        test_df: Frame used for the test dataset.
        val_df: Frame used for the validation dataset.
        tokenizer: Tokenizer forwarded to every ``RameauLabelDataset``.
        batch_size: Mini-batch size used by all three loaders.
        max_token_len: Sequence length forwarded to every dataset.
        train_num_workers: Worker processes of the training loader
            (previously hard-coded to 2).
        eval_num_workers: Worker processes of the test and validation
            loaders (previously hard-coded to 4).
    """

    def __init__(self, train_df, test_df, val_df, tokenizer, batch_size=8, max_token_len=128,
                 train_num_workers=2, eval_num_workers=4):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.val_df = val_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.train_num_workers = train_num_workers
        self.eval_num_workers = eval_num_workers

    def _build_dataset(self, frame):
        """Wrap one frame in a ``RameauLabelDataset`` with the shared settings."""
        return RameauLabelDataset(frame, self.tokenizer, self.max_token_len)

    def setup(self, stage=None):
        """Create the three datasets; ``stage`` is accepted but not used."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.test_dataset = self._build_dataset(self.test_df)
        self.val_dataset = self._build_dataset(self.val_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.train_num_workers
        )

    def test_dataloader(self):
        """Sequential loader over the test dataset."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.eval_num_workers
        )

    def val_dataloader(self):
        """Sequential loader over the validation dataset."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=self.eval_num_workers
        )

In [20]:
# Upper bound on the number of training epochs (passed to the trainer as `max_epochs`).
N_EPOCHS = 10
# Number of examples per mini-batch produced by the data module's loaders.
BATCH_SIZE = 12

In [21]:
# Bundle the three data splits with the shared tokenizer and batching settings.
data_module = RameauLabelDataModule(
    train_df=df_train,
    test_df=df_test,
    val_df=df_valid100,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_COUNT,
)

In [22]:
# Build the train / test / validation datasets on the data module right away.
# NOTE(review): the trainer presumably calls `setup` again during `fit` — confirm this explicit call is still wanted.
data_module.setup()

In [23]:
# Model
class RameauLabelTagger(pl.LightningModule):
  """BERT encoder with a linear multi-label head trained with binary cross-entropy.

  Args:
    n_classes: Size of the output layer (number of labels).
    n_training_steps: Total number of optimizer steps, given to the LR schedule.
    n_warmup_steps: Number of warm-up steps of the linear LR schedule.
    label_names: Optional sequence of label names. When provided, the
      predictions of every training step are kept on the CPU and a per-label
      training AUROC is logged at the end of each epoch. Leave it unset for
      very large label spaces: the stored predictions grow with
      (number of training examples x number of labels).
  """

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None, label_names=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()
    self.label_names = list(label_names) if label_names is not None else None
    # Per-step records of the current epoch; emptied in `on_train_epoch_end`.
    self.training_step_outputs = []

  def forward(self, input_ids, attention_mask, labels=None):
    """Return ``(loss, probabilities)``; ``loss`` is 0 when no labels are given."""
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    # Independent per-label probabilities (multi-label setting).
    output = torch.sigmoid(output)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def _shared_step(self, batch):
    """Run the forward pass on one batch and return ``(loss, probabilities)``."""
    return self(batch["input_ids"], batch["attention_mask"], batch["labels"])

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss of one batch."""
    loss, outputs = self._shared_step(batch)
    # Store detached values only: keeping the graph-attached loss would hold
    # on to every batch's autograd graph for the whole epoch.
    record = {"loss": loss.detach()}
    if self.label_names is not None:
      record["predictions"] = outputs.detach().cpu()
      record["labels"] = batch["labels"].detach().cpu()
    self.training_step_outputs.append(record)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

  def test_step(self, batch, batch_idx):
    """Compute and log the test loss of one batch."""
    loss, outputs = self._shared_step(batch)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss of one batch."""
    loss, outputs = self._shared_step(batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def on_train_epoch_end(self, outputs=None):
    """Log the epoch-average training loss and, if available, per-label AUROC.

    Args:
      outputs: Optional list of step records (dicts with ``loss`` and
        optionally ``predictions`` / ``labels``). The trainer invokes this
        hook without arguments, in which case the records gathered by
        ``training_step`` are used.
    """
    step_outputs = self.training_step_outputs if outputs is None else outputs
    if not step_outputs:
      self.training_step_outputs.clear()
      return

    epoch_average = torch.stack([out["loss"].detach() for out in step_outputs]).mean()
    self.log("training_epoch_average", epoch_average)

    if all("predictions" in out and "labels" in out for out in step_outputs):
      predictions = torch.cat([out["predictions"].detach().cpu() for out in step_outputs])
      labels = torch.cat([out["labels"].detach().cpu() for out in step_outputs]).int()
      # Fall back to column indices when no label names were supplied.
      names = self.label_names if self.label_names is not None else range(predictions.shape[1])
      for i, name in enumerate(names):
        class_roc_auc = torchmetrics.functional.auroc(predictions[:, i], labels[:, i], task="binary")
        self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

    # Start the next epoch with an empty buffer.
    self.training_step_outputs.clear()

  def configure_optimizers(self):
    """AdamW (lr 2e-5) with a linear warm-up / decay schedule stepped every batch."""
    optimizer = AdamW(self.parameters(), lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )
    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

In [24]:
# One optimizer step per full mini-batch (integer division drops a partial
# last batch from the count), repeated for every epoch.
steps_per_epoch = len(df_train) // BATCH_SIZE
total_training_steps = N_EPOCHS * steps_per_epoch

In [25]:
# Reserve one fifth of all training steps for the learning-rate warm-up.
warmup_steps = total_training_steps // 5
# Display both step counts as the cell output.
warmup_steps, total_training_steps

(20876, 104380)

In [26]:
# Instantiate the tagger: one output unit per label column, plus the schedule lengths.
model = RameauLabelTagger(
    n_classes=len(label_cols),
    n_training_steps=total_training_steps,
    n_warmup_steps=warmup_steps,
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
# Sanity check before training: binary cross-entropy of the freshly
# initialised model on the sample batch.
criterion = nn.BCELoss()

_, predictions = model(
    input_ids=sample_batch["input_ids"],
    attention_mask=sample_batch["attention_mask"],
)
criterion(predictions, sample_batch["labels"])

tensor(0.6973, grad_fn=<BinaryCrossEntropyBackward0>)

## Training

In [28]:
# Keep only the single best checkpoint, ranked by validation loss.
# "val_loss" is the metric logged by `validation_step` while fitting;
# "test_loss" is only logged by `test_step`, so it is never available to this
# callback during `trainer.fit`.
checkpoint_callback = ModelCheckpoint(
  dirpath="./checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [29]:
# Log the progress in Tensorboard
# (save directory "lightning_logs", experiment name "Rameau").
logger = TensorBoardLogger("lightning_logs", name="Rameau")

In [30]:
# Early stopping on the validation loss, with a patience of 2.
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=2,
)

In [31]:
# Trainer: single GPU, at most N_EPOCHS epochs, with TensorBoard logging,
# early stopping and best-checkpoint saving.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=N_EPOCHS,
    logger=logger,
    callbacks=[early_stopping_callback, checkpoint_callback],
    enable_progress_bar=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [32]:
# Environment flags set before training starts.
# NOTE(review): TOKENIZERS_PARALLELISM presumably silences the tokenizers fork warning in DataLoader workers — confirm.
# NOTE(review): nothing visible in this notebook reads "require_grad"; confirm it is still needed.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "require_grad": "True",
})

In [33]:
# Fine-tune the model on the loaders provided by the data module.
trainer.fit(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 177 M 
1 | classifier | Linear    | 79.2 M
2 | criterion  | BCELoss   | 0     
-----------------------------------------
257 M     Trainable params
0         Non-trainable params
257 M     Total params
1,028.306 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

In [None]:
# Evaluate on the test split. The data module is passed explicitly so the test
# loader does not depend on state left on the trainer by the previous `fit` call.
trainer.test(datamodule=data_module)

In [None]:
# Predictions: reload the best checkpoint and switch it to inference mode.
# The output size comes from `label_cols`, the same value the model was built
# with (`mlb` is never defined in this notebook).
trained_model = RameauLabelTagger.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  n_classes=len(label_cols)
)
trained_model.eval()
trained_model.freeze()

In [None]:
# Evaluation: run the trained model over the validation split and collect the
# predicted probabilities together with the reference labels.
# MAX_TOKEN_COUNT is the constant defined earlier in the notebook; it is not
# redefined here so the two values cannot drift apart.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)
# `df_valid100` is the validation frame loaded at the top of the notebook.
val_dataset = RameauLabelDataset(
  df_valid100,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []

# Inference only: no autograd bookkeeping needed.
with torch.no_grad():
  # `tqdm` is imported as a module, so the progress bar is `tqdm.tqdm`.
  for item in tqdm.tqdm(val_dataset):
    _, prediction = trained_model(
      item["input_ids"].unsqueeze(dim=0).to(device),
      item["attention_mask"].unsqueeze(dim=0).to(device)
    )
    predictions.append(prediction.flatten().detach().cpu())
    labels.append(item["labels"].int())

# One row per validation example, one column per label.
predictions = torch.stack(predictions)
labels = torch.stack(labels)

## Metrics

In [None]:
# Accuracy over every (example, label) decision: a probability above
# THRESHOLD counts as a predicted label.
THRESHOLD = 0.7
torchmetrics.functional.accuracy(
  predictions,
  labels,
  task="multilabel",
  num_labels=labels.shape[1],
  threshold=THRESHOLD,
  average="micro",
)

In [None]:
# AUROC
print("AUROC per tag")
# AUROC is only defined for labels that have both positive and negative
# examples among the evaluated rows, so all other labels are skipped.
informative = (labels.min(dim=0).values != labels.max(dim=0).values).nonzero().flatten().tolist()
for i in informative:
  name = label_cols[i]
  tag_auroc = torchmetrics.functional.auroc(predictions[:, i], labels[:, i], task="binary")
  print(f"{name}: {tag_auroc}")

In [None]:
# Classification report
y_true = labels.numpy()
# Binarise the probabilities: 1 above THRESHOLD, 0 otherwise.
y_pred = (predictions.numpy() > THRESHOLD).astype(int)

print(classification_report(
  y_true,
  y_pred,
  # One name per label column, in the same order as the prediction columns.
  target_names=[str(name) for name in label_cols],
  zero_division=0
))