## Setup

In [1]:
import os

# remove any unwanted garbage using the collector
import gc
gc.collect()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn 
import tqdm.notebook as tq
import warnings

from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score, classification_report
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, AdamW

2023-07-26 22:47:23.872042: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
os.getcwd()

'/home/aurelie/ABES/labo-indexation-ai/DeepLearning'

In [3]:
# Set paths
# The project root can be overridden with the LABO_INDEXATION_ROOT environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used.
path = os.environ.get("LABO_INDEXATION_ROOT", "/home/aurelie/ABES/labo-indexation-ai")
# Later cells save and load files through paths relative to the project root,
# so make it the working directory.
os.chdir(path)
data_path = os.path.join(path, "data")
output_path = os.path.join(path, "outputs")
fig_path = os.path.join(path, "figs")

In [4]:
os.getcwd()

'/home/aurelie/ABES/labo-indexation-ai'

In [5]:
# Check GPU 
!nvidia-smi

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Wed Jul 26 22:47:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:0A:00.0 Off |                  Off |
| 30%   37C    P2    65W / 450W |  24197MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                         

https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/
https://www.youtube.com/watch?v=vNKIg8rXK6w&ab_channel=rupertai


In [6]:
# Configure torch: matmul precision, RNG seed and compute device
torch.set_float32_matmul_precision('medium')
torch.manual_seed(42)
# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [7]:
# Silence FutureWarning and DeprecationWarning messages for the rest of the session
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

## Load Dataset

In [8]:
## Load the three pickled datasets (takes around 1min30s)
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
train_pkl = os.path.join(data_path, "train_dataset_for_DL.pkl")
test_pkl = os.path.join(data_path, "test_dataset_for_DL.pkl")
valid_pkl = os.path.join(data_path, "valid100_dataset_for_DL.pkl")

df_train = pd.read_pickle(train_pkl)
print("Train dataset: ", df_train.shape)

df_test = pd.read_pickle(test_pkl)
print("Test dataset: ", df_test.shape)

df_valid100 = pd.read_pickle(valid_pkl)
print("Validation dataset: ", df_valid100.shape)

Train dataset:  (125220, 103022)
Test dataset:  (29227, 103022)
Validation dataset:  (100, 103022)


In [9]:
# Check memory space
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line instead of being passed to print() (which would append a
# stray "None" after every report).
for dataset_name, frame in (
    ("train", df_train),
    ("test", df_test),
    ("validation", df_valid100),
):
    print(f"{dataset_name} dataset memory usage: ")
    frame.info()
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125220 entries, 0 to 125219
Columns: 103022 entries, !Xóõ (langue) to descr
dtypes: Sparse[int64, 0](103021), object(1)
memory usage: 4.8+ MB
train dataset memory usage:  None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29227 entries, 0 to 29226
Columns: 103022 entries, !Xóõ (langue) to descr
dtypes: Sparse[int64, 0](103021), object(1)
memory usage: 1.1+ MB
ntest dataset memory usage:  None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 103022 entries, !Xóõ (langue) to descr
dtypes: Sparse[int64, 0](103021), object(1)
memory usage: 4.1+ KB
validation dataset memory usage:  None


In [10]:
# Inspect one record: its description and the concepts attached to it
row_id = 64
# Every column except the last one ("descr") is a label column
label_cols = df_train.columns[:-1]
sample_row = df_train.iloc[row_id]
sample_descr = sample_row["descr"]
sample_labels = sample_row[label_cols]

# keep only the labels that are set for this record
active_labels = sample_labels[sample_labels != 0]
print("Description: ", sample_descr)
print("Concepts: ", active_labels.to_dict())


Description:  La bataille mondiale des matières premières Dans le débat sur un nouvel ordre économique international, les marchés mondiaux des matières premières constituent un enjeu de première importance. Ils conditionnent largement les moyens de financement du développement de pays pauvres et sont un des lieux stratégiques où se joue l'indépendance des pays. L'auteur analyse d'abord les mécanismes et les acteurs des marchés libres, mettant en lumière les limites du jeu libéral de l'offre et de la demande. Son examen des divers systèmes de régulation qui ont été expérimentés l'amènent ensuite à émettre de sérieuses réserves sur l'efficacité des stocks régulateurs. De même, les accords compensatoires (type prêts du FMI) se heurtent-ils à des difficultés théoriques et concrètes de mise en place. La régulation de l'offre n'a véritablement réussi que dans le cas du pétrole. Des solutions plus radicales existent en dehors d'un fonctionnement aménagé du marché : ouverture unilatérale des f

In [11]:
# Set Hyperparameters
# When True, the next cell replaces the train/test frames by random subsets
sampling=False
# Subset sizes used when `sampling` is True
N_TRAIN_SAMPLE = 5000
N_TEST_SAMPLE = 500
# Maximum number of tokens per description (passed to the datasets as max_token_len)
MAX_LEN = 256
# Batch sizes of the three data loaders
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64
# Number of passes over the training loader
EPOCHS = 10
# NOTE(review): the optimizer cell passes lr = 1e-5 directly instead of reading
# this constant - confirm which value is intended.
LEARNING_RATE = 2e-05
THRESHOLD = 0.2 # threshold to convert proba into predictions

In [12]:
# Optionally shrink the train/test frames to fixed-size random subsets
if sampling:
    # a fixed random_state keeps the subsets identical from run to run
    df_train = df_train.sample(N_TRAIN_SAMPLE, random_state=42)
    df_test = df_test.sample(N_TEST_SAMPLE, random_state=42)

train_shape = df_train.shape
test_shape = df_test.shape
print("Train dataset: ", train_shape)
print("Test dataset: ", test_shape)

Train dataset:  (125220, 103022)
Test dataset:  (29227, 103022)


In [13]:
# Separate train dataset into train and validation sets for model
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split reproducible across kernel restarts
# (torch.manual_seed does not seed scikit-learn's shuffling).
# NOTE(review): data_train / data_test are not read by the dataset cells below,
# which are built from df_train / df_test / df_valid100 - confirm this split is
# still needed.
data_train, data_test = train_test_split(df_train, test_size=0.3, random_state=42)
print(data_train.shape, data_test.shape)

(87654, 103022) (37566, 103022)


## Build the model

In [14]:
# Build Deep Learning Model with BERT/PyTorch
import torch
from transformers import CamembertTokenizer, CamembertModel
# Name of the pretrained checkpoint shared by the tokenizer and the encoder
modelname = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(modelname)
# return_dict=True: outputs are read by attribute later on (BERTClass.forward
# uses output.pooler_output)
bert_model = CamembertModel.from_pretrained(modelname, return_dict=True)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Encoding

In [15]:
# Test tokenizer
text_example = 'Je regarderai la serie à la télévision avec mes enfants ce soir'
# Generate encoding: truncated/padded to exactly 20 tokens, returned as PyTorch tensors
encodings = tokenizer.encode_plus(
    text_example,
    add_special_tokens=True,
    max_length=20,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
)

# With these arguments the encoding is a dictionary with two keys, input_ids and
# attention_mask, as printed below (see: https://huggingface.co/transformers/glossary.html)
print(encodings.keys())

dict_keys(['input_ids', 'attention_mask'])


In [16]:
class RameauLabelDataset(Dataset):
    """Torch dataset pairing a tokenized description with its label vector.

    Args:
        data: frame holding the text in a ``descr`` column and one column per label.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_token_len: length every description is truncated/padded to.
        label_cols: columns to use as labels. Defaults to every column of
            ``data`` except ``descr``, so the dataset no longer depends on a
            notebook-level global.
    """

    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int = 128, label_cols=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        if label_cols is None:
            label_cols = data.columns.drop("descr")
        self.label_cols = label_cols

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize row ``index`` and return its tensors, labels and cleaned text."""
        data_row = self.data.iloc[index]
        # collapse every run of whitespace into a single space
        descr = " ".join(data_row["descr"].split())
        labels = data_row[self.label_cols]

        inputs = self.tokenizer.encode_plus(
            descr,
            None,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=True,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            # return_tensors="pt" adds a leading batch axis; flatten() removes it
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            # go through a float32 NumPy array rather than handing the Series to
            # torch as a generic Python sequence (slow, and dependent on
            # positional Series indexing)
            "labels": torch.tensor(np.asarray(labels, dtype=np.float32)),
            "descr": descr
        }

In [17]:
# Wrap the three frames in datasets sharing the same tokenizer and maximum length
train_dataset = RameauLabelDataset(df_train, tokenizer, max_token_len=MAX_LEN)
test_dataset = RameauLabelDataset(df_test, tokenizer, max_token_len=MAX_LEN)
valid_dataset = RameauLabelDataset(df_valid100, tokenizer, max_token_len=MAX_LEN)

In [18]:
# Sanity check: build the training dataset and look at its first item
train_dataset = RameauLabelDataset(df_train, tokenizer, max_token_len=MAX_LEN)
sample_item = train_dataset[0]

sample_ids_shape = sample_item["input_ids"].shape
print("Description: ", sample_item["descr"])
print("Labels: ", sample_item["labels"])
print("Shape: ", sample_ids_shape)

Description:  La culture pour vivre Mort de la culture populaire en France. Mutation des institutions culturelles grâce à une technique de mise en relation des oeuvres et d'un public, et qui tend à créer un comportement culturel adapté aux caractéristiques de l'époque
Labels:  tensor([0., 0., 0.,  ..., 0., 0., 0.])
Shape:  torch.Size([256])


In [19]:
# Test dataset: fetch the first item through iteration and display the whole sample dict
next(iter(train_dataset))

{'input_ids': tensor([    5,    61,  1030,    24,   747,  8527,     8,    13,  1030,  2429,
            22,   184,     9, 16003,   472,    20,  3847,  5993,   435,    15,
            28,   899,     8,   375,    22,   911,    20,  6737,    14,    18,
            11,    59,   525,     7,    14,    31,  4769,    15,   739,    23,
          2379,  3542,  2740,    68,  1830,    10,     8,    17,    11,  1475,
             6,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

### Data Loaders

In [20]:
# Build data loaders
# Only the training loader shuffles its samples; validation and test keep the
# dataset order.
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=16,
)

val_data_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

test_data_loader = DataLoader(
    dataset=test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

### Bert model

In [21]:
class BERTClass(torch.nn.Module):
    """Multi-label classifier: pretrained encoder -> dropout -> linear layer.

    The forward pass returns one raw logit per label (no sigmoid applied).

    Args:
        encoder: encoder module whose output exposes ``pooler_output``.
            Defaults to the notebook-level ``bert_model``.
        n_labels: size of the output layer. Defaults to ``len(label_cols)``.
        dropout: dropout probability applied to the pooled output.
    """

    def __init__(self, encoder=None, n_labels=None, dropout=0.3):
        super().__init__()
        self.bert_model = bert_model if encoder is None else encoder
        # read the width of the pooled output from the encoder configuration
        # when it is available instead of assuming 768
        encoder_config = getattr(self.bert_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 768)
        if n_labels is None:
            n_labels = len(label_cols)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, n_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the logits computed from the encoder's pooled output."""
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        pooled = self.dropout(output.pooler_output)
        return self.linear(pooled)

In [22]:
# Instantiate model and load it to GPU
# NOTE(review): the recorded run failed on this cell with "CUDA error: out of
# memory"; the nvidia-smi output captured earlier in the notebook shows
# 24197MiB of 24564MiB already in use before the model was moved to the GPU.
model = BERTClass()
model.to(device)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Check GPU
!nvidia-smi

In [None]:
# Define loss function
# BCEWithLogitsLoss applies the sigmoid internally, so it is fed the raw model
# outputs (logits); one independent binary decision per label.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Define the optimizer
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW; eps
# and weight_decay are set explicitly to the defaults transformers.AdamW used,
# so the optimisation settings stay the same.
# NOTE(review): LEARNING_RATE (2e-05) is defined with the hyperparameters but
# the value used here is 1e-5 - confirm which one is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Training of the model for one epoch
from tqdm import tqdm

def train_model(model, training_loader, optimizer, criterion):
    """Train ``model`` for one epoch over ``training_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask, token_type_ids)``
            and returning logits.
        training_loader: iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``).
        optimizer: optimizer updating the model parameters.
        criterion: loss taking ``(logits, labels)``.

    Returns:
        Tuple ``(epoch_loss, epoch_prec)``: the loss and the sample-averaged
        precision (probabilities thresholded at 0.5), each averaged over batches.
    """
    # set model to training mode (activate dropout, batch norm)
    model.train()
    print('Training')
    running_loss = 0.0
    running_precision = 0.0
    n_batches = 0

    # initialize the progress bar
    loop = tq.tqdm(enumerate(training_loader), total=len(training_loader),
                   leave=True, colour='steelblue')

    for batch_idx, data in loop:
        n_batches += 1
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        labels = data['labels'].to(device, dtype=torch.float)

        # clear the gradients of the previous batch; without this they
        # accumulate and every step uses the sum of all past gradients
        optimizer.zero_grad()

        # forward pass and loss
        outputs = model(ids, mask, token_type_ids)
        loss = criterion(outputs, labels)

        # backpropagation and parameter update
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # precision of this batch, thresholding the probabilities at 0.5;
        # scikit-learn expects (y_true, y_pred) in that order
        preds = torch.sigmoid(outputs.detach()).round()
        running_precision += precision_score(
            labels.cpu().numpy(), preds.cpu().numpy(),
            average="samples", zero_division=0
        )

    # loss and precision for the complete epoch
    epoch_loss = running_loss / n_batches
    epoch_prec = running_precision / n_batches

    return epoch_loss, epoch_prec

In [None]:
# Validation
def validate(model, validation_loader, criterion):
    """Evaluate ``model`` on ``validation_loader`` without updating it.

    Args:
        model: module called as ``model(input_ids, attention_mask, token_type_ids)``
            and returning logits.
        validation_loader: iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``).
        criterion: loss taking ``(logits, labels)``.

    Returns:
        Tuple ``(epoch_loss, epoch_prec)``: the loss and the sample-averaged
        precision (probabilities thresholded at 0.5), each averaged over batches.
    """
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()
    print('Validation')
    running_loss = 0.0
    running_precision = 0.0
    n_batches = 0

    valid_loop = tq.tqdm(enumerate(validation_loader), total=len(validation_loader),
                         leave=True, colour='green')

    with torch.no_grad():
        for batch_idx, data in valid_loop:
            n_batches += 1
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            labels = data["labels"].to(device, dtype=torch.float)

            # forward pass and loss
            outputs = model(ids, mask, token_type_ids)
            running_loss += criterion(outputs, labels).item()

            # precision of this batch, thresholding the probabilities at 0.5;
            # scikit-learn expects (y_true, y_pred) in that order
            preds = torch.sigmoid(outputs).round()
            running_precision += precision_score(
                labels.cpu().numpy(), preds.cpu().numpy(),
                average="samples", zero_division=0
            )

    # loss and precision for the complete epoch
    epoch_loss = running_loss / n_batches
    epoch_prec = running_precision / n_batches
    return epoch_loss, epoch_prec


In [None]:
class SaveBestModel:
    """
    Class to save the best model while training. If the current epoch's
    validation loss is less than the previous least loss, then save the
    model state.

    Args:
        best_valid_loss: loss to beat before the first checkpoint is written.
        save_path: file the checkpoint is written to (relative paths are
            resolved against the current working directory).
    """
    def __init__(
        self, best_valid_loss=float('inf'),
        save_path='DeepLearning/outputs/best_model.pth'
    ):
        self.best_valid_loss = best_valid_loss
        self.save_path = save_path

    def __call__(
        self, current_valid_loss,
        epoch, model, optimizer, criterion
    ):
        """Write a checkpoint when ``current_valid_loss`` improves on the best one so far."""
        if current_valid_loss < self.best_valid_loss:
            self.best_valid_loss = current_valid_loss
            print(f"\nBest validation loss: {self.best_valid_loss}")
            print(f"\nSaving best model for epoch: {epoch+1}\n")
            # torch.save does not create missing directories
            target_dir = os.path.dirname(self.save_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': criterion,
                }, self.save_path)

In [None]:
def save_model(epochs, model, optimizer, criterion, path='DeepLearning/outputs/final_model.pth'):
    """
    Function to save the trained model to disk.

    Args:
        epochs: number of epochs stored in the checkpoint under ``'epoch'``.
        model: module whose ``state_dict`` is saved.
        optimizer: optimizer whose ``state_dict`` is saved.
        criterion: loss object stored under ``'loss'``.
        path: destination file (relative paths are resolved against the
            current working directory).
    """
    print("Saving final model...")
    # torch.save does not create missing directories
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    torch.save({
                'epoch': epochs,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': criterion,
                }, path)

In [None]:
def save_plots(train_prec, valid_prec, train_loss, valid_loss, out_dir='DeepLearning/outputs'):
    """
    Function to save the precision and loss plots to disk.

    Args:
        train_prec: per-epoch training precision values.
        valid_prec: per-epoch validation precision values.
        train_loss: per-epoch training loss values.
        valid_loss: per-epoch validation loss values.
        out_dir: directory receiving ``precision.png`` and ``loss.png``;
            created when it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)

    # precision plots
    plt.figure(figsize=(10, 7))
    plt.plot(
        train_prec, color='green', linestyle='-',
        label='train precision'
    )
    plt.plot(
        valid_prec, color='blue', linestyle='-',
        label='validation precision'
    )
    plt.xlabel('Epochs')
    plt.ylabel('Precision')
    plt.legend()
    plt.savefig(os.path.join(out_dir, 'precision.png'))

    # loss plots
    plt.figure(figsize=(10, 7))
    plt.plot(
        train_loss, color='orange', linestyle='-',
        label='train loss'
    )
    plt.plot(
        valid_loss, color='red', linestyle='-',
        label='validation loss'
    )
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(os.path.join(out_dir, 'loss.png'))

## Training
https://github.com/dtolk/multilabel-BERT/blob/master/notebooks/multi_label_text_classification_BERT.ipynb

https://debuggercafe.com/saving-and-loading-the-best-model-in-pytorch/

In [None]:
# initialize SaveBestModel class; the instance keeps the lowest validation loss
# seen so far, starting from +inf
save_best_model = SaveBestModel()

In [None]:
# Per-epoch history. The "acc" lists hold the precision values returned by
# train_model / validate.
train_loss, valid_loss = [], []
train_acc, valid_acc = [], []

# start the training
for epoch in range(EPOCHS):
    print(f"[INFO]: Epoch {epoch+1} of {EPOCHS}")

    train_epoch_loss, train_epoch_acc = train_model(
        model, train_data_loader, optimizer, criterion
    )
    valid_epoch_loss, valid_epoch_acc = validate(
        model, val_data_loader, criterion
    )

    # record this epoch in the history lists
    for history, value in (
        (train_loss, train_epoch_loss),
        (valid_loss, valid_epoch_loss),
        (train_acc, train_epoch_acc),
        (valid_acc, valid_epoch_acc),
    ):
        history.append(value)

    print(f"Training loss: {train_epoch_loss:.3f}, training precision: {train_epoch_acc:.3f}")
    print(f"Validation loss: {valid_epoch_loss:.3f}, validation precision: {valid_epoch_acc:.3f}")

    # checkpoint the model whenever this epoch reaches a new lowest validation loss
    save_best_model(valid_epoch_loss, epoch, model, optimizer, criterion)
    print('-'*50)

# after the last epoch: save the final weights, then the precision/loss plots
save_model(EPOCHS, model, optimizer, criterion)
save_plots(train_acc, valid_acc, train_loss, valid_loss)
print('TRAINING COMPLETE')

### Test the model

In [None]:
# map_location lets a checkpoint saved from the GPU be opened on whichever
# device this session uses. weights_only=False is required because the
# checkpoints also contain the pickled criterion object; unpickling can run
# arbitrary code, so only load checkpoint files produced by this notebook.
# load the best model checkpoint
best_model_cp = torch.load(
    'DeepLearning/outputs/best_model.pth', map_location=device, weights_only=False
)
best_model_epoch = best_model_cp['epoch']
print(f"Best model was saved at {best_model_epoch} epochs\n")
# load the last model checkpoint
last_model_cp = torch.load(
    'DeepLearning/outputs/final_model.pth', map_location=device, weights_only=False
)
last_model_epoch = last_model_cp['epoch']
print(f"Last model was saved at {last_model_epoch} epochs\n")

In [None]:
def test(model, test_data_loader):
    """
    Function to test the model

    Args:
        model: module called as ``model(input_ids, attention_mask, token_type_ids)``
            and returning logits.
        test_data_loader: iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``).

    Returns:
        The sample-averaged precision (probabilities thresholded at 0.5),
        averaged over batches.
    """
    # set model to evaluation mode
    model.eval()
    print('Testing')
    running_precision = 0.0
    n_batches = 0

    test_loop = tq.tqdm(enumerate(test_data_loader), total=len(test_data_loader),
                        leave=True, colour='green')
    with torch.no_grad():
        for i, data in test_loop:
            n_batches += 1

            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            labels = data["labels"].to(device, dtype=torch.float)

            # Forward pass
            outputs = model(ids, mask, token_type_ids)

            # precision of this batch, thresholding the probabilities at 0.5;
            # scikit-learn expects (y_true, y_pred) in that order
            preds = torch.sigmoid(outputs).round()
            running_precision += precision_score(
                labels.cpu().numpy(), preds.cpu().numpy(),
                average="samples", zero_division=0
            )

    # Precision for the complete epoch
    final_prec = running_precision / n_batches
    return final_prec

In [None]:
# test the last epoch saved model
def test_last_model(model, checkpoint, test_loader):
    """Load the last-epoch weights from ``checkpoint`` into ``model`` and print its test score.

    The printed value is whatever ``test`` returns for ``test_loader``.
    """
    print('Loading last epoch saved model weights...')
    weights = checkpoint['model_state_dict']
    model.load_state_dict(weights)
    score = test(model, test_loader)
    print(f"Last epoch saved model accuracy: {score:.3f}")
# test the best epoch saved model
def test_best_model(model, checkpoint, test_loader):
    """Load the best-epoch weights from ``checkpoint`` into ``model`` and print its test score.

    The printed value is whatever ``test`` returns for ``test_loader``.
    """
    print('Loading best epoch saved model weights...')
    weights = checkpoint['model_state_dict']
    model.load_state_dict(weights)
    score = test(model, test_loader)
    print(f"Best epoch saved model accuracy: {score:.3f}")

In [None]:
# Both helpers load weights into the same `model` object: after this cell the
# model holds the best-epoch weights, because that checkpoint is loaded last.
test_last_model(model, last_model_cp, test_data_loader)
test_best_model(model, best_model_cp, test_data_loader)

In [None]:
from sklearn.metrics import classification_report

def get_predictions(model, data_loader):
    """
    Run ``model`` over every batch of ``data_loader`` and collect the results.

    Outputs:
      descr - list with the description text of every sample, in loader order
      predictions - tensor of 0/1 predictions (probabilities rounded at 0.5),
        one row per sample
      prediction_probs - tensor of sigmoid probabilities, one row per sample
      target_values - tensor of ground-truth labels, one row per sample

    The three tensors are returned on the CPU: results are moved off the GPU
    batch by batch so that the full result set never has to fit in GPU memory.
    """
    model = model.eval()

    descriptions = []
    predictions = []
    prediction_probs = []
    target_values = []

    with torch.no_grad():
        for data in data_loader:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            # add sigmoid (for the training sigmoid is in BCEWithLogitsLoss)
            outputs = torch.sigmoid(outputs)
            # thresholding at 0.5
            preds = outputs.round()

            # the batch texts go into a separately named list; reusing one
            # name for the batch and the accumulator would discard all
            # previous batches
            descriptions.extend(data["descr"])
            predictions.append(preds.cpu())
            prediction_probs.append(outputs.cpu())
            target_values.append(data["labels"].to(dtype=torch.float))

    # concatenate the per-batch results along the sample axis
    predictions = torch.cat(predictions)
    prediction_probs = torch.cat(prediction_probs)
    target_values = torch.cat(target_values)

    return descriptions, predictions, prediction_probs, target_values


In [None]:
# Predictions on test dataset (~20min)
# Uses the weights currently loaded in `model`.
descr, predictions, prediction_probs, target_values = get_predictions(model, test_data_loader)

In [None]:
# Convert the three result tensors to NumPy arrays on the host
prediction_probs, predictions, target_values = (
    tensor.detach().cpu().numpy()
    for tensor in (prediction_probs, predictions, target_values)
)

## Metrics

In [None]:
# Classical metrics, each one computed per document and then averaged (average="samples")
doc_metrics = (
    ("Average doc precision : ", precision_score),
    ("Average doc recall : ", recall_score),
    ("Average doc F1score : ", f1_score),
    ("Average doc Jaccard index : ", jaccard_score),
)
for caption, metric in doc_metrics:
    print(caption, metric(target_values, predictions, average="samples", zero_division=0))

In [None]:
# Classification report
y_true = target_values
# Apply THRESHOLD to the probabilities. `predictions` is already binarised at
# 0.5, so comparing it with THRESHOLD would leave it unchanged.
y_pred = np.where(prediction_probs > THRESHOLD, 1, 0)

print(classification_report(
  y_true,
  y_pred,
  target_names=label_cols,
  zero_division=0
))

In [None]:
# Prediction: compare the predicted concepts with the reference indexing of one test document
idx = 458
descriptions = test_data_loader.dataset.data.descr
print("Description: ", descriptions.iloc[idx])
# concepts whose predicted probability exceeds 0.01 (the result of print() is
# not stored: print always returns None)
print("Predicted Concept: ", label_cols[prediction_probs[idx] > 0.01])
print("Sudoc Indexation: ", label_cols[target_values[idx] == 1])