# Multilabel text classification using BERT family

Adapted from:
https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=Lk6Cq9duKBkA

and from:
https://colab.research.google.com/drive/1ejBYmu0P5urzghoTTDB-GBUxpbUFX0Gz?usp=sharing#scrollTo=p5Iuv7q7Dtg_


## Setup

In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
import os
import time

# remove any unwanted garbage using the collector
import gc
gc.collect()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pytorch_lightning as pl
import torch.nn as nn 
import tqdm.notebook as tq
import warnings

from torch.utils.data import Dataset, DataLoader
from transformers import training_args, Trainer
from transformers import BertTokenizer, BertModel, AutoTokenizer, AdamW
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!nvidia-smi

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Tue Jul 11 23:06:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:0A:00.0 Off |                  Off |
|  0%   39C    P8    22W / 450W |   1451MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                         

In [4]:
# Configure PyTorch: matmul precision mode, RNG seed, and the compute device
# (CUDA when it is available, CPU otherwise).
torch.set_float32_matmul_precision('high')
torch.manual_seed(42)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [5]:
# Project directories, all built from the working directory `path`
path = "."
os.chdir(path)
data_path, output_path, fig_path = (
    path + sub_dir for sub_dir in ("/data", "/outputs", "/figs")
)

In [6]:
# Suppression des FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [7]:
# Reproducibility: one shared seed for NumPy and PyTorch
RANDOM_SEED = 42
torch.set_float32_matmul_precision('high')
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fdd400a4a30>

In [8]:
# Compute device: CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [9]:
# Process-level settings stored as environment variables.
# NOTE(review): "require_grad" does not look like a variable that PyTorch or
# Transformers reads — confirm whether this entry has any effect.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "False",
    "require_grad": "False",
})

## Load Datasets

In [10]:
# Load the three pickled datasets from data_path (takes around 1min30s).
# Pickle files execute code on load: only read files you produced yourself.
_loaded = {}
for _title, _file_name in (
    ("Train", "train_dataset_for_DL.pkl"),
    ("Test", "test_dataset_for_DL.pkl"),
    ("Validation", "valid100_dataset_for_DL.pkl"),
):
    _loaded[_title] = pd.read_pickle(os.path.join(data_path, _file_name))
    print(f"{_title} dataset: ", _loaded[_title].shape)
df_train, df_test, df_valid100 = _loaded["Train"], _loaded["Test"], _loaded["Validation"]

Train dataset:  (125220, 103022)
Test dataset:  (29227, 103022)
Validation dataset:  (100, 103022)


In [11]:
# Check memory space
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own line after the label instead of being passed to print().
for _name, _frame in (("train", df_train), ("test", df_test), ("validation", df_valid100)):
    print(f"{_name} dataset memory usage:")
    _frame.info()
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125220 entries, 0 to 125219
Columns: 103022 entries, !Xóõ (langue) to descr
dtypes: Sparse[int64, 0](103021), object(1)
memory usage: 4.8+ MB
train dataset memory usage:  None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29227 entries, 0 to 29226
Columns: 103022 entries, !Xóõ (langue) to descr
dtypes: Sparse[int64, 0](103021), object(1)
memory usage: 1.1+ MB
ntest dataset memory usage:  None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 103022 entries, !Xóõ (langue) to descr
dtypes: Sparse[int64, 0](103021), object(1)
memory usage: 4.1+ KB
validation dataset memory usage:  None


In [12]:
## Show columns
print("Features: ", df_train.columns)

Features:  Index(['!Xóõ (langue)', '"Sprach- und Sachatlas Italiens und der Südschweiz"',
       '"Taalatlas van Noord- en Zuid-Nederland"', ''?d', ''?ntokyo',
       ''Are'are (peuple des îles Salomon)', ''Au keto',
       ''Au keto, Musique d'', ''Au ni aau', ''Au ni aau, Musique d'',
       ...
       'évangéliaire de split', 'Ālāp', 'Ārbajo, Musique d'',
       'Đinh pơng, Musique de', 'Ō-tsuzumi, Musique d'', 'ʿArūbi', 'Ḥawfi',
       'Ṭhumrī', 'Ṭār (tambour), Musique de', 'descr'],
      dtype='object', length=103022)


In [13]:
# Look at one training example: its description and the concepts set on it
row_id = 64
label_cols = df_train.columns[:-1]  # every column except the trailing 'descr'
sample_row = df_train.iloc[row_id]
sample_descr = sample_row["descr"]
sample_labels = sample_row[label_cols]

print("Description: ", sample_descr)
print("Concepts: ", sample_labels.loc[sample_labels != 0].to_dict())


Description:  La bataille mondiale des matières premières Dans le débat sur un nouvel ordre économique international, les marchés mondiaux des matières premières constituent un enjeu de première importance. Ils conditionnent largement les moyens de financement du développement de pays pauvres et sont un des lieux stratégiques où se joue l'indépendance des pays. L'auteur analyse d'abord les mécanismes et les acteurs des marchés libres, mettant en lumière les limites du jeu libéral de l'offre et de la demande. Son examen des divers systèmes de régulation qui ont été expérimentés l'amènent ensuite à émettre de sérieuses réserves sur l'efficacité des stocks régulateurs. De même, les accords compensatoires (type prêts du FMI) se heurtent-ils à des difficultés théoriques et concrètes de mise en place. La régulation de l'offre n'a véritablement réussi que dans le cas du pétrole. Des solutions plus radicales existent en dehors d'un fonctionnement aménagé du marché : ouverture unilatérale des f

In [14]:
'''
# Sample datasets
df_train = df_train.sample(n=10000, random_state=42)
print("Train dataset: ", df_train.shape)
#df_test = df_test.sample(n=3000, random_state=42)
print("Test dataset: ", df_test.shape)
'''

'\n# Sample datasets\ndf_train = df_train.sample(n=10000, random_state=42)\nprint("Train dataset: ", df_train.shape)\n#df_test = df_test.sample(n=3000, random_state=42)\nprint("Test dataset: ", df_test.shape)\n'

In [15]:
'''
# Remove useless columns (labels not used in train dataset - Takes > 5min)
cols_to_remove = df_train.columns[df_train.sum() == 0]
len(cols_to_remove)
'''

'\n# Remove useless columns (labels not used in train dataset - Takes > 5min)\ncols_to_remove = df_train.columns[df_train.sum() == 0]\nlen(cols_to_remove)\n'

In [16]:
'''
# Remove useless columns
df_train = df_train.drop(columns=cols_to_remove)
print("Train dataset:", df_train.shape)
df_test = df_test.drop(columns=cols_to_remove)
print("Test dataset:", df_test.shape)
df_valid100 = df_valid100.drop(columns=cols_to_remove)
print("Validation dataset:", df_valid100.shape)
label_cols = df_train.columns[:-1]
print("Nombre de labels:", len(label_cols))
'''

'\n# Remove useless columns\ndf_train = df_train.drop(columns=cols_to_remove)\nprint("Train dataset:", df_train.shape)\ndf_test = df_test.drop(columns=cols_to_remove)\nprint("Test dataset:", df_test.shape)\ndf_valid100 = df_valid100.drop(columns=cols_to_remove)\nprint("Validation dataset:", df_valid100.shape)\nlabel_cols = df_train.columns[:-1]\nprint("Nombre de labels:", len(label_cols))\n'

In [17]:
# Hold out 25% of the training data as a validation set for the model.
# The split is seeded explicitly so the same rows land in each subset on every
# run, whatever amount of global NumPy RNG state earlier cells consumed.
from sklearn.model_selection import train_test_split
data_train, data_val = train_test_split(df_train, test_size=0.25, random_state=RANDOM_SEED)
print(data_train.shape, data_val.shape)

(93915, 103022) (31305, 103022)


In [18]:
# Label vocabulary (all columns but the last) with index <-> name lookups
LABEL_COLUMNS = data_train.columns[:-1]
id2label = dict(enumerate(LABEL_COLUMNS))
label2id = {label: idx for idx, label in id2label.items()}
print(len(LABEL_COLUMNS))

103021


In [19]:
data_train[LABEL_COLUMNS].sum().sort_values(ascending=False)[:10]

Histoire                            4539
Aspect social                       1599
Manuels d'enseignement supérieur    1213
Français (langue)                   1194
Philosophie                         1133
Mathématiques                       1053
Étude et enseignement (primaire)    1047
Étude et enseignement                994
Droit                                936
Bandes dessinées                     928
dtype: int64

In [20]:
'''
# Sample test_set
data_test = df_test.sample(n=5000)
print("data_test shape:", data_test.shape)
'''

'\n# Sample test_set\ndata_test = df_test.sample(n=5000)\nprint("data_test shape:", data_test.shape)\n'

In [21]:
# Number of output labels; label_cols was taken from df_train.columns[:-1] above
n_labels = len(label_cols)

## Build the model

In [22]:
# Select one example
# The index is stored under a descriptive name so the built-in id() is not
# shadowed for the rest of the session.
sample_idx = 56
sample_row = data_train.iloc[sample_idx]
sample_text = sample_row.descr
sample_labels = sample_row[LABEL_COLUMNS][sample_row[LABEL_COLUMNS] == 1]
print(sample_text)
print(sample_labels.to_dict())

Enfants perdus de Roumanie : histoire des orphelinats de Ceausescu Images d'enfants maltraités, mal nourris, privés d'accès aux soins, entassés dans des bâtisses insalubres : en 1989, l'opinion internationale découvrait avec effroi l'enfer des « orphelinats de Ceausescu », au point que leur démantèlement fut une condition sine qua non de l'adhésion de la Roumanie à l'Union européenne. Au-delà des représentations sensationnalistes diffusées par la presse et les organisations internationales, la réalité de ce phénomène reste encore largement méconnue. Une certitude : du fait d'un manque cruel de moyens et de personnel qualifié, ces « enfants de l'État » ont, par dizaines de milliers, subi pendant des années, sans possibilité d'échappatoire, la rudesse des conditions de vie sous le régime socialiste et une violence quotidienne au sein des institutions censées les prendre en charge. En s'appuyant sur des sources nationales et locales inexplorées, sur de nombreux témoignages d'anciens mineu

from https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

In [23]:
# Training configuration: checkpoint name, sequence length, batch sizes,
# number of epochs and learning rate, plus the matching tokenizer
from transformers import CamembertModel, CamembertTokenizer
model_name = 'camembert-base'
MAX_LEN = 512
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-5
tokenizer = CamembertTokenizer.from_pretrained(model_name)

In [24]:
class RameauLabelDataset(Dataset):
    """Torch dataset pairing each description with its multi-hot label vector.

    Args:
        data: frame with a text column ``descr`` and the label columns listed
            in the module-level ``LABEL_COLUMNS``.
        tokenizer: tokenizer used to encode the descriptions.
        max_token_len: length every encoded sequence is truncated/padded to.
    """

    def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_len: int = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item_idx: int):
        """Return ``{'input_ids': LongTensor, 'labels': FloatTensor}`` for one row."""
        data_row = self.data.iloc[item_idx]
        # Collapse any run of whitespace (newlines, tabs, ...) into one space
        descr = " ".join(data_row.descr.split())
        # Convert through an explicit float32 array: handing the pandas Series
        # itself to torch makes it iterate a label-indexed, object-typed row.
        labels = data_row[LABEL_COLUMNS].to_numpy(dtype=np.float32)

        # Tokenization (calling the tokenizer replaces the legacy encode_plus)
        inputs = self.tokenizer(
            descr,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            truncation=True,
            padding="max_length",
            return_attention_mask=False,
            return_tensors="pt"
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'labels': torch.from_numpy(labels)
        }

In [25]:
# Wrap each split in a RameauLabelDataset (sequences limited to MAX_LEN tokens)
training_set, validation_set, testing_set = (
    RameauLabelDataset(frame, tokenizer, MAX_LEN)
    for frame in (data_train, data_val, df_test)
)

In [26]:
# DataLoader settings: training batches are shuffled, evaluation batches keep
# the dataset order; both share the same worker / pinned-memory options.
_shared_params = {'pin_memory': True, 'num_workers': 12}
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, **_shared_params}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, **_shared_params}

training_loader = DataLoader(training_set, **train_params)
validation_loader, testing_loader = (
    DataLoader(split, **test_params) for split in (validation_set, testing_set)
)

In [27]:
'''
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_labels)
model.to(device)
'''

'\nfrom transformers import AutoModelForSequenceClassification\n\nmodel = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_labels)\nmodel.to(device)\n'

In [28]:
# Custom classifier: CamemBERT encoder followed by dropout and one dense layer
# that produces one logit per label.

class BERTClass(torch.nn.Module):
    """CamemBERT encoder with a linear multi-label head.

    The head size is ``len(label_cols)`` and the encoder checkpoint is the
    module-level ``model_name``.
    """

    def __init__(self):
        super().__init__()
        self.bert_model = CamembertModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        # Read the encoder width from its config instead of hard-coding 768
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, len(label_cols))

    def forward(self, input_ids, attention_mask=None):
        """Return the logits for a batch of token ids.

        Args:
            input_ids: token ids, one padded sequence per row.
            attention_mask: optional mask (1 = real token, 0 = padding). When
                omitted it is rebuilt from the encoder's pad token id so that
                padding positions are not attended to.
        """
        if attention_mask is None:
            pad_id = self.bert_model.config.pad_token_id
            if pad_id is not None:
                attention_mask = input_ids.ne(pad_id).long()
        output = self.bert_model(input_ids, attention_mask=attention_mask)
        pooled = self.dropout(output.pooler_output)
        return self.linear(pooled)

# Build the classifier defined above
model = BERTClass()

# # Freezing BERT layers: (tested, weaker convergence)
# for param in model.bert_model.parameters():
#     param.requires_grad = False

# Move the parameters to the selected device (last expression: the cell also displays the model)
model.to(device)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (bert_model): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Laye

In [29]:
# Loss: binary cross-entropy computed directly on the logits
loss_fn = nn.BCEWithLogitsLoss()

In [30]:
# Optimizer (no learning-rate scheduler is created in this cell)
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [31]:
# Training loop
def train_one_epoch(epoch_index, tb_writer, log_every=1000):
    """Run one pass over ``training_loader`` and return the latest average loss.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        epoch_index: zero-based epoch number, used for the TensorBoard x-axis.
        tb_writer: object with ``add_scalar(tag, value, step)``.
        log_every: number of batches averaged for each report.

    Returns:
        Average loss of the last complete ``log_every`` window. If the epoch
        is shorter than one window, the average over the batches seen
        (0.0 only when the loader is empty).
    """
    running_loss = 0.
    last_loss = 0.
    window_batches = 0
    reported = False

    # enumerate() gives the batch index needed for intra-epoch reporting
    for i, data in enumerate(training_loader):
        inputs = data['input_ids'].to(device)
        labels = data['labels'].to(device)

        # Gradients accumulate by default: clear them for every batch
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        window_batches += 1
        if window_batches == log_every:
            last_loss = running_loss / log_every  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            window_batches = 0
            reported = True

    # Short epochs never fill a window; report what was seen instead of 0.0
    if not reported and window_batches:
        last_loss = running_loss / window_batches

    return last_loss

In [32]:
# Full training run with per-epoch validation, TensorBoard logging and
# checkpointing of the best model. The counters below are reset every time
# this cell is executed.
# NOTE(review): the run directory keeps the 'fashion_trainer' name — confirm
# whether it should be renamed for this project.
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    running_vloss = 0.0
    n_vbatches = 0
    # Evaluation mode disables dropout
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for vdata in validation_loader:
            # Batches are dicts: read the tensors by key and move them to the
            # model's device (unpacking the dict would yield its key names).
            vinputs = vdata['input_ids'].to(device)
            vlabels = vdata['labels'].to(device)
            voutputs = model(vinputs)
            # .item() keeps the running sum a plain float
            running_vloss += loss_fn(voutputs, vlabels).item()
            n_vbatches += 1

    avg_vloss = running_vloss / max(n_vbatches, 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:


In [None]:
torch.save(model.state_dict(), 'model_weights.pth')

In [None]:
def compute_metrics(p, threshold=0.5):
    """Multi-label metrics from raw logits.

    Args:
        p: pair ``(logits, labels)`` of equally shaped 2-D arrays
            (examples x labels); ``labels`` holds 0/1 values.
        threshold: probability in (0, 1) above which a label is predicted.

    Returns:
        dict with micro-averaged ``precision``, ``recall`` and ``f1`` and with
        ``accuracy`` as the share of examples whose whole label vector is right.
        A ratio with an empty denominator is reported as 0.0.
    """
    logits, labels = p
    logits = np.asarray(logits)
    labels = np.asarray(labels).astype(bool)

    # sigmoid(x) >= t  <=>  x >= log(t / (1 - t)); comparing in logit space
    # avoids exponentiating large values.
    logit_threshold = np.log(threshold) - np.log1p(-threshold)
    pred = logits >= logit_threshold

    tp = int(np.count_nonzero(pred & labels))
    fp = int(np.count_nonzero(pred & ~labels))
    fn = int(np.count_nonzero(~pred & labels))

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) else 0.0
    accuracy = float(np.mean(np.all(pred == labels, axis=1))) if len(labels) else 0.0

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
from transformers import TrainingArguments, Trainer

# Define Trainer: one epoch, batches of 16, outputs written to ./outputs
# NOTE(review): BERTClass.forward above takes only input_ids and returns only
# logits, while these datasets also yield 'labels' — confirm that Trainer can
# compute a loss with this model before relying on this cell.
args = TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=1,
    per_device_train_batch_size=16

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=validation_set,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model('camembert_v1')

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 targets."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Optimizer and learning rate scheduler
from torch.optim import AdamW
optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# Load model on gpu
model.to(device)

In [None]:
# Create the default (linear) learning-rate scheduler used by Trainer
from transformers import get_scheduler

num_epochs = 10
batch_per_epoch = 30
num_training_steps = num_epochs * batch_per_epoch
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
# Training loop: num_epochs passes, each limited to the first batch_per_epoch
# batches, i.e. num_training_steps optimizer steps in total.
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for i, batch in enumerate(training_loader):
        if i >= batch_per_epoch:
            break
        batch = {k: v.to(device) for k, v in batch.items()}
        # The model takes token ids and returns logits; the loss is computed
        # separately with loss_fn.
        logits = model(batch["input_ids"])
        loss = loss_fn(logits, batch["labels"])
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Evaluation: micro-averaged F1 over the validation loader
import evaluate

# "f1" is the metric identifier; its "multilabel" configuration takes one
# 0/1 vector per example.
metric = evaluate.load("f1", "multilabel")
model.eval()
for batch in validation_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(batch["input_ids"])

    # Multi-label decision: threshold each label's probability independently
    predictions = (torch.sigmoid(logits) >= 0.5).int()
    metric.add_batch(
        predictions=predictions.cpu().numpy(),
        references=batch["labels"].int().cpu().numpy(),
    )

metric.compute(average="micro")

## Fine tune the model

In [None]:
def train(epoch):
    """Run one optimisation step on the batch held in the module-level ``data``.

    Args:
        epoch: index of the current epoch; not used by the step itself.

    Returns:
        The loss of this step as a float.
    """
    model.train()

    # Only 'input_ids' and 'labels' are present in every batch: the datasets
    # are built with return_token_type_ids=False and the loaders above use the
    # dataset variant that also drops the attention mask.
    ids = data['input_ids'].to(device, dtype = torch.long)
    targets = data['labels'].to(device, dtype = torch.float)

    outputs = model(ids)

    optimizer.zero_grad()
    loss = loss_fn(outputs, targets)
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
start_time = time.time()
# Epochs form the outer loop so that every batch is visited once per epoch;
# train() reads the current batch from the module-level name `data`.
for epoch in range(EPOCHS):
    for data in training_loader:
        train(epoch)


stop = time.time()

In [None]:
# NOTE(review): train() takes its batch from the module-level `data`, which
# this loop never updates, and start_time/stop are overwritten here — this
# cell looks like a leftover variant of the previous one; confirm before running.
start_time = time.time()
for epoch in range(EPOCHS):
    train(epoch)

stop = time.time()

In [None]:
print(f"Training time: {stop - start_time}s")

### Validating the model

In [None]:
def validation(epoch):
    """Score ``validation_loader`` with the module-level ``model``.

    Args:
        epoch: accepted for symmetry with ``train``; not used.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per example,
        holding the per-label sigmoid probabilities and the 0/1 targets.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in validation_loader:
            # The batches only carry 'input_ids' and 'labels'
            ids = data['input_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.float)
            outputs = model(ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
def get_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect thresholded predictions.

    Args:
        model: module returning logits. It is called as ``model(input_ids)``,
            or as ``model(input_ids, attention_mask)`` when the batch has an
            ``attention_mask`` entry.
        data_loader: iterable of dict batches with ``input_ids`` and ``labels``.

    Returns:
        ``(predictions, prediction_probs, target_values)``: CPU tensors with
        one row per example. ``predictions`` is ``prediction_probs`` rounded to
        0/1. Three empty tensors are returned when the loader yields nothing.
    """
    model = model.eval()
    # Send the inputs wherever the model's weights live
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    prediction_probs = []
    target_values = []

    with torch.no_grad():
        for data in data_loader:
            ids = data["input_ids"].to(run_device, dtype=torch.long)
            targets = data["labels"].to(dtype=torch.float).cpu()
            mask = data.get("attention_mask")

            if mask is None:
                outputs = model(ids)
            else:
                outputs = model(ids, mask.to(run_device, dtype=torch.long))
            # Sigmoid is applied here; during training it is part of BCEWithLogitsLoss
            probs = torch.sigmoid(outputs).detach().cpu()

            predictions.append(probs.round())  # thresholding at 0.5
            prediction_probs.append(probs)
            target_values.append(targets)

    if not predictions:
        return torch.empty(0), torch.empty(0), torch.empty(0)

    return torch.cat(predictions), torch.cat(prediction_probs), torch.cat(target_values)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# NOTE(review): the star import hides which names come from utils_metrics;
# label_metrics_report is presumably one of them — confirm and import it by name.
from utils_metrics import *
THRESHOLD = 0.05
# validation() ignores its argument and the model is not trained in this cell,
# so a single pass gives the same result as repeating it EPOCHS times.
outputs_prob, targets = validation(0)
outputs = np.array(outputs_prob) >= THRESHOLD
results = label_metrics_report(
    modelName = model_name,
    y_true = targets,
    y_pred = outputs,
    y_prob=None,
    print_metrics=True)

In [None]:
len(outputs[0])

In [None]:
t = outputs[0]
t[t]

In [None]:
model

### TRY CAMEMBERT

In [None]:
tokenizer

In [None]:
class RameauLabelDataset(Dataset):
    """Torch dataset yielding token ids, attention mask and multi-hot labels.

    Args:
        data: frame with a text column ``descr`` and the label columns listed
            in the module-level ``LABEL_COLUMNS``.
        tokenizer: tokenizer used to encode the descriptions.
        max_token_len: length every encoded sequence is truncated/padded to.
    """

    def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_len: int = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item_idx: int):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for one row."""
        data_row = self.data.iloc[item_idx]
        # Collapse any run of whitespace (newlines, tabs, ...) into one space
        descr = " ".join(data_row.descr.split())
        # Convert through an explicit float32 array: handing the pandas Series
        # itself to torch makes it iterate a label-indexed, object-typed row.
        labels = data_row[LABEL_COLUMNS].to_numpy(dtype=np.float32)

        # Tokenization (calling the tokenizer replaces the legacy encode_plus)
        inputs = self.tokenizer(
            descr,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.from_numpy(labels)
        }

In [None]:
# Check
train_dataset = RameauLabelDataset(data_train, tokenizer)
sample_item = train_dataset[0]
print("keys:", sample_item.keys())
#print("description:", sample_item["description"])
print("labels:", sample_item["labels"])
print(sample_item["input_ids"].shape)

In [None]:
# Hyperparameters
MAX_LEN = 512
BATCH_SIZE = 32
N_EPOCHS = 2
LEARNING_RATE = 1e-05
THRESHOLD = 0.08 # threshold for the sigmoid

In [None]:
# Data Module
class RameauLabelDataModule(pl.LightningDataModule):
    """Lightning data module serving the train / validation / test frames.

    Args:
        train_df, val_df, test_df: frames wrapped in ``RameauLabelDataset``.
        tokenizer: tokenizer handed to every dataset.
        batch_size: training batch size.
        max_token_len: sequence length passed to the datasets.
        num_workers: worker processes used by every DataLoader.
        eval_batch_size: batch size for validation, test and prediction;
            defaults to ``batch_size`` (all sequences are padded to the same
            length, so evaluation can be batched like training).
    """

    def __init__(self, train_df, val_df, test_df, tokenizer, batch_size=8, max_token_len=128,
                 num_workers=4, eval_batch_size=None):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.num_workers = num_workers
        self.eval_batch_size = batch_size if eval_batch_size is None else eval_batch_size

    def _make_dataset(self, frame):
        # One dataset per split, all sharing tokenizer and sequence length
        return RameauLabelDataset(frame, self.tokenizer, self.max_token_len)

    def _eval_loader(self, dataset):
        # Ordered (unshuffled) loader shared by validation, test and predict
        return DataLoader(
            dataset,
            batch_size=self.eval_batch_size,
            num_workers=self.num_workers
        )

    def setup(self, stage=None):
        self.train_dataset = self._make_dataset(self.train_df)
        self.val_dataset = self._make_dataset(self.val_df)
        self.test_dataset = self._make_dataset(self.test_df)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers
        )

    def val_dataloader(self):
        return self._eval_loader(self.val_dataset)

    def test_dataloader(self):
        return self._eval_loader(self.test_dataset)

    def predict_dataloader(self):
        # Needed by trainer.predict(model, datamodule=...); predicts on the test split
        return self._eval_loader(self.test_dataset)

In [None]:
# Instantiate and set up the data_module
data_module = RameauLabelDataModule(
    data_train, 
    data_val, 
    df_test, 
    tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len = MAX_LEN)

data_module.setup()

In [None]:
len(data_module.train_dataloader())

## Model

In [None]:
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

In [None]:
# Model
class RameauLabelClassifier(pl.LightningModule):
  """CamemBERT encoder, one hidden layer and a linear multi-label head.

  Args:
    config: dict providing 'model_name', 'n_labels', 'lr', 'weight_decay' and
      'warmup' (fraction of the training steps used for warm-up).
  """

  def __init__(self, config: dict):
    super().__init__()
    self.config = config
    self.pretrained_model = CamembertModel.from_pretrained(config['model_name'], return_dict = True)
    hidden_size = self.pretrained_model.config.hidden_size
    self.hidden = torch.nn.Linear(hidden_size, hidden_size)
    self.classifier = torch.nn.Linear(hidden_size, self.config['n_labels'])
    self.loss_func = nn.BCEWithLogitsLoss(reduction='mean')
    self.dropout = nn.Dropout()

  def forward(self, input_ids, attention_mask, labels=None):
    """Return ``(loss, logits)``; ``loss`` is 0 when ``labels`` is None."""
    output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
    # Mean-pool over real tokens only: padded positions are zeroed out and
    # left out of the divisor.
    token_states = output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
    pooled_output = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
    # final logits
    pooled_output = self.dropout(pooled_output)
    pooled_output = self.hidden(pooled_output)
    pooled_output = F.relu(pooled_output)
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    # calculate loss
    loss = 0
    if labels is not None:
      loss = self.loss_func(logits.view(-1, self.config['n_labels']), labels.view(-1, self.config['n_labels']))

    return loss, logits

  def training_step(self, batch, batch_index):
    loss, outputs = self(**batch)
    self.log("train_loss", loss, prog_bar = True, logger=True)
    # Predictions are detached so the returned dict does not keep the graph alive
    return {"loss":loss, "predictions":outputs.detach(), "labels": batch["labels"]}

  def validation_step(self, batch, batch_index):
    loss, outputs = self(**batch)
    # Logged as 'val_loss', the name the checkpoint / early-stopping callbacks monitor
    self.log("val_loss", loss, prog_bar = True, logger=True)
    return {"val_loss": loss, "predictions":outputs, "labels": batch["labels"]}

  def predict_step(self, batch, batch_index):
    loss, outputs = self(**batch)
    return outputs

  def configure_optimizers(self):
    """AdamW with a cosine schedule and linear warm-up, stepped every batch."""
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
    # Ask the trainer for the real number of optimizer steps of the whole run
    total_steps = int(self.trainer.estimated_stepping_batches)
    warmup_steps = math.floor(total_steps * self.config['warmup'])
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    return {
      "optimizer": optimizer,
      "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
    }

  # def on_validation_epoch_end(self, outputs):
  #   losses = []
  #   for output in outputs:
  #     loss = output['val_loss'].detach().cpu()
  #     losses.append(loss)
  #   avg_loss = torch.mean(torch.stack(losses))
  #   self.log("avg_val_loss", avg_loss)


In [None]:
# Settings consumed by RameauLabelClassifier and by the data module below
config = {
    'model_name': model_name,
    'n_labels': n_labels,
    'batch_size': 128,
    'max_len': 256,
    'lr': 1.5e-6,
    'warmup': 0.2, 
    # len() of a DataLoader is a number of batches, not a number of rows
    'train_size': len(data_module.train_dataloader()),
    'weight_decay': 0.001,
    'n_epochs': 100
}

model = RameauLabelClassifier(config)

In [None]:
idx=0
sample_item = train_dataset[idx]
input_ids = sample_item["input_ids"]
attention_mask = sample_item['attention_mask']
labels = sample_item['labels']
model.cpu()
loss, output = model(input_ids.unsqueeze(dim=0), attention_mask.unsqueeze(dim=0), labels.unsqueeze(dim=0))
print(labels.shape, output.shape, output)

### Train model

In [None]:
# datamodule built with the batch size and sequence length of `config`
data_module = RameauLabelDataModule(
    data_train, 
    data_val, 
    df_test, 
    tokenizer,
    batch_size=config['batch_size'],
    max_token_len = config["max_len"])

data_module.setup() 

# model
model = RameauLabelClassifier(config)
model.to(device)


In [None]:

# trainer and fit
trainer = pl.Trainer(max_epochs=config['n_epochs'], accelerator="gpu", num_sanity_val_steps=50)
trainer.fit(model, data_module)

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs/

### Predict with model

In [None]:
# Turn the batched logits returned by trainer.predict into one probability row
# per example
def classify_raw_comments(model, dm):
  """Predict with the module-level ``trainer`` and return probabilities.

  Args:
    model: module passed to ``trainer.predict``.
    dm: data module providing the prediction data.

  Returns:
    NumPy array with one row of sigmoid probabilities per example.
  """
  predictions = trainer.predict(model, datamodule=dm)
  # Concatenate whole batches at once, then leave the GPU before NumPy conversion
  logits = torch.cat([torch.as_tensor(batch) for batch in predictions])
  flattened_predictions = torch.sigmoid(logits).cpu().numpy()
  return flattened_predictions

In [None]:
predictions = classify_raw_comments(model, data_module)

In [None]:
predictions

### Evaluation

In [None]:
# Step counts are derived from data_train, the frame the data module trains on
# (df_train also contains the rows held out for validation).
steps_per_epoch = len(data_train) // BATCH_SIZE
print("steps per epoch:", steps_per_epoch)
total_training_steps = steps_per_epoch * N_EPOCHS
print("Total training steps:", total_training_steps)
warmup_steps = total_training_steps // 5
print("Warmup steps: ", warmup_steps)

In [None]:
# Create an instance of the model
# RameauLabelClassifier takes a single config dict, so the settings of this
# run are passed by overriding entries of `config`.
model = RameauLabelClassifier({
  **config,
  'n_labels': len(LABEL_COLUMNS),
  'n_epochs': N_EPOCHS,
})

model.to(device)

In [None]:
trainer = pl.Trainer(max_epochs=N_EPOCHS, accelerator="auto", enable_progress_bar=True)

In [None]:
torch.set_grad_enabled(True)

In [None]:
# Train the model
trainer.fit(model, data_module)

In [None]:
# Instance of the current model
# RameauLabelClassifier takes a single config dict
model = RameauLabelClassifier({
  **config,
  'n_labels': len(label_cols),
})

# `device` falls back to the CPU when no GPU is available
model.to(device)

In [None]:
# Keep the three checkpoints with the lowest monitored 'val_loss' under
# ./checkpoints
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    monitor='val_loss',# monitored quantity
    filename='Rameau_BertMultilingualClassifier-{epoch:02d}-{val_loss:.2f}',
    verbose=True,
    save_top_k=3, #  save the top 3 models
    mode='min', # mode of the monitored quantity  for optimization
)

In [None]:
# Log the progress in Tensorboard
from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger("lightning_logs", name="Bert_multilingual")

In [None]:
# Add early stopping
from pytorch_lightning.callbacks import EarlyStopping

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [None]:
# Training
trainer = pl.Trainer(
  logger=logger,
  callbacks=[early_stopping_callback, checkpoint_callback],
  max_epochs=EPOCHS,
  accelerator="gpu",
  enable_progress_bar=True,
)

In [None]:
trainer.fit(model, data_module)

In [None]:
# Initialize the parameters that will be use for training
N_EPOCHS = 12
BATCH_SIZE = 32
MAX_LEN = 300
LR = 2e-05

In [None]:
# Instantiate the Model Trainer
trainer = pl.Trainer(max_epochs =EPOCHS ,  accelerator="gpu", callbacks=[checkpoint_callback], enable_progress_bar=True)
# Train the Classifier Model
trainer.fit(model, data_module)

In [None]:
# Evaluation: loss of the current model on one training batch
# The model returns raw logits, so the logits variant of the loss is required
# (nn.BCELoss expects probabilities between 0 and 1).
criterion = nn.BCEWithLogitsLoss()

sample_batch = next(iter(data_module.train_dataloader()))
with torch.no_grad():
    _, predictions = model(
        sample_batch["input_ids"].to(model.device),
        sample_batch["attention_mask"].to(model.device),
    )
criterion(predictions, sample_batch["labels"].to(model.device))

## Training

In [None]:
# Keep only the best checkpoint. The monitored key has to be one the model
# logs during validation; 'val_loss' matches the other callbacks of this
# notebook ('test_loss' is never logged).
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
  dirpath="./checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [None]:
trainer.test()

In [None]:
# Predictions
# Reload the best checkpoint into the classifier class defined above; its
# constructor needs the config dict, passed through as a keyword argument.
trained_model = RameauLabelClassifier.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  config=config
)
trained_model.eval()
trained_model.freeze()

In [None]:
# Evaluation on the 100-row validation set
MAX_TOKEN_COUNT = 512


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)
val_dataset = RameauLabelDataset(
  df_valid100,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []

# Inference only: no autograd graph is needed
with torch.no_grad():
  for item in tqdm(val_dataset):
    _, logits = trained_model(
      item["input_ids"].unsqueeze(dim=0).to(device),
      item["attention_mask"].unsqueeze(dim=0).to(device)
    )
    # forward() returns raw logits; store probabilities so that the
    # probability thresholds of the metric cells apply
    predictions.append(torch.sigmoid(logits).flatten())
    labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

## Metrics

In [None]:
# Accuracy
from torchmetrics.functional.classification import multilabel_accuracy

THRESHOLD = 0.7
multilabel_accuracy(predictions, labels, num_labels=labels.shape[1], threshold=THRESHOLD)

In [None]:
# AUROC
from torchmetrics.functional.classification import binary_auroc

print("AUROC per tag")
for i, name in enumerate(label_cols):
  # AUROC needs both classes: skip tags whose targets are all 0 or all 1
  if labels[:, i].unique().numel() < 2:
    continue
  tag_auroc = binary_auroc(predictions[:, i], labels[:, i])
  print(f"{name}: {tag_auroc}")

In [None]:
# Classification report
from sklearn.metrics import classification_report

y_true = labels.numpy()
upper, lower = 1, 0
# Binarize the scores at THRESHOLD
y_pred = np.where(predictions.numpy() > THRESHOLD, upper, lower)

print(classification_report(
  y_true,
  y_pred,
  target_names=list(label_cols),  # plain list of names instead of a pandas Index
  zero_division=0
))