## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pytorch_lightning as pl
import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
#from pytorch_lightning.metrics.functional import accuracy, f1, auroc
#from torcheval.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import warnings
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, multilabel_confusion_matrix


from utils_text_processing import *

References:
- https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/
- https://www.youtube.com/watch?v=vNKIg8rXK6w&ab_channel=rupertai


In [None]:
# Select the "high" float32 matmul precision mode (see the PyTorch docs for
# the speed/precision trade-off of each mode).
torch.set_float32_matmul_precision('high')

In [None]:
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Show GPU status (IPython shell escape; only works inside a notebook).
!nvidia-smi

In [None]:
# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(42)

In [None]:
# Work from the project root and derive the data / output / figure folders
# from it.
path = "."
os.chdir(path)
data_path, output_path, fig_path = (
    f"{path}/{name}" for name in ("data", "outputs", "figs")
)

In [None]:
# Silence FutureWarning and DeprecationWarning messages
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', DeprecationWarning)

## Create Dataset

In [None]:
# Import data: the first CSV column is used as the DataFrame index.
df = pd.read_csv(os.path.join(data_path, 'working_data_sans_dewey.csv'), index_col=0)
print(df.shape)
df.head()

In [None]:
# Parse the stringified label lists in "rameau_concepts" into Python objects.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# CSV cell cannot execute arbitrary code the way eval() would.
import ast

df["target"] = df["rameau_concepts"].apply(ast.literal_eval)
df["target"]

In [None]:
# Convert the categorical labels to Multi Label Encodings
mlb = MultiLabelBinarizer()
df_multilabel = pd.DataFrame(mlb.fit_transform(df["target"]), columns=mlb.classes_)
# Assign the text positionally: `df` keeps the index read from the CSV while
# `df_multilabel` has a fresh 0..n-1 index, so an index-aligned assignment
# could silently produce NaN or mismatched descriptions.
df_multilabel["descr"] = df["DESCR"].to_numpy()

In [None]:
# Number of distinct labels found by the binarizer.
len(mlb.classes_)

In [None]:
# Split data: hold out 33% of the rows as val_df; random_state is fixed so
# the split is the same on every run.
train_df, val_df = train_test_split(df_multilabel, test_size=0.33, random_state=42)

In [None]:
# Report the shape of each split
print("train dataset size:", train_df.shape)
print("test dataset size:", val_df.shape)

In [None]:
# Check the label classes
print(f"There are {len(mlb.classes_)} different Rameau PPN")
mlb.classes_

In [None]:
# Pull a single example and show its text followed by its label vector
sample_row = df_multilabel.iloc[16]
sample_descr = sample_row["descr"]
sample_labels = sample_row[mlb.classes_]

print(sample_descr, end="\n\n")
print(sample_labels.to_dict())

## Build the model

In [None]:
# Build Deep Learning Model with BERT/PyTorch
# Load the pretrained tokenizer matching the checkpoint named below.
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)


In [None]:
# Tokenize the sample description into a fixed-length tensor encoding.
# truncation=True makes the 512-token cap explicit (matching what
# RameauLabelDataset does) instead of relying on the tokenizer's fallback.
encoding = tokenizer.encode_plus(
    sample_descr,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
)

In [None]:
# Inspect which fields the tokenizer returned
encoding.keys()

In [None]:
# Shapes of the token-id and attention-mask tensors.
encoding["input_ids"].shape, encoding["attention_mask"].shape

In [None]:
# Map the ids back to tokens and show the first 20.
tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze())[:20]

In [None]:
class RameauLabelDataset(Dataset):
    """Torch dataset pairing tokenized descriptions with multi-hot labels.

    Each item is a dict holding the raw text (``desc``), the flattened
    ``input_ids`` and ``attention_mask`` tensors produced by the tokenizer,
    and a float32 ``labels`` tensor with one entry per label column.

    Args:
        data: Frame with a ``descr`` text column plus one column per label.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_token_len: Length every sequence is truncated/padded to.
        label_columns: Names of the label columns, in output order. Defaults
            to every column of ``data`` except ``descr``, so the dataset no
            longer depends on a module-level binarizer.
    """

    TEXT_COLUMN = "descr"

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "BertTokenizer",
        max_token_len: int = 128,
        label_columns=None,
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

        if label_columns is None:
            label_columns = [
                column for column in data.columns if column != self.TEXT_COLUMN
            ]
        self.label_columns = list(label_columns)

        # Extract texts and labels once: building a wide, object-dtype row
        # Series for every item is slow and gives torch an untyped sequence.
        self._texts = data[self.TEXT_COLUMN].tolist()
        self._labels = data[self.label_columns].to_numpy(dtype="float32")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        # An out-of-range index raises IndexError, which is what lets plain
        # ``for item in dataset`` iteration terminate.
        desc = self._texts[index]

        encoding = self.tokenizer.encode_plus(
            desc,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        return dict(
            desc=desc,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            # torch.tensor copies, so items never alias the cached matrix.
            labels=torch.tensor(self._labels[index])
        )

In [None]:
# Build dataset
# NOTE(review): this is built from the full df_multilabel, not train_df, so
# despite its name it also contains the validation rows — confirm intended.
train_dataset = RameauLabelDataset(df_multilabel, tokenizer)

In [None]:
# Fetch the first item and list the fields it contains.
sample_item = train_dataset[0]
sample_item.keys()

In [None]:
# Raw description text of the sample item.
sample_item["desc"]

In [None]:
# Shape of the sample item's label tensor.
sample_item["labels"].shape

In [None]:
# Load the pretrained BERT encoder; return_dict=True is requested so the
# outputs can be read by attribute below.
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

In [None]:
# Draw one batch of 8 items through a DataLoader and check the tensor shapes.
sample_batch = next(iter(DataLoader(train_dataset, batch_size=8, num_workers=2)))
sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape

In [None]:
# Run the single sample item through BERT (unsqueeze adds the batch
# dimension) and inspect the two output shapes.
prediction = bert_model(sample_item['input_ids'].unsqueeze(dim=0), sample_item["attention_mask"].unsqueeze(dim=0))
prediction.last_hidden_state.shape, prediction.pooler_output.shape

In [None]:
class RameauLabelDataModule(pl.LightningDataModule):
    """Lightning data module serving the training and held-out datasets.

    The held-out frame (``test_df``) backs both the validation and the test
    dataloader.

    Args:
        train_df: Frame used for training.
        test_df: Held-out frame used for validation and testing.
        tokenizer: Tokenizer handed to ``RameauLabelDataset``.
        batch_size: Batch size used by every dataloader.
        max_token_len: Token length handed to ``RameauLabelDataset``.
        num_workers: Worker processes per dataloader.
    """

    def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128, num_workers=4):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Build the train and held-out datasets from the stored frames."""
        self.train_dataset = RameauLabelDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )

        self.test_dataset = RameauLabelDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len
        )

    def _eval_dataloader(self):
        """Return an unshuffled loader over the held-out dataset."""
        # Evaluation needs no gradients, so the training batch size is safe
        # here and much faster than one sample per step.
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers
        )

    def val_dataloader(self):
        return self._eval_dataloader()

    def test_dataloader(self):
        # This hook was previously a second `val_dataloader` definition,
        # which left the module without any test loader.
        return self._eval_dataloader()

In [None]:
# Build the data module with its default batch_size (8) and max_token_len
# (128), then create the datasets right away.
# NOTE(review): BATCH_SIZE (12) is defined in a later cell and is not passed
# here — confirm which batch size is intended.
data_module = RameauLabelDataModule(train_df, val_df, tokenizer)
data_module.setup()

In [None]:
# Training hyper-parameters: maximum number of epochs and nominal batch size.
N_EPOCHS = 10
BATCH_SIZE = 12

In [None]:
class RameauLabelTagger(pl.LightningModule):
    """BERT encoder with a linear head for multi-label Rameau tagging.

    Args:
        n_classes: Number of labels (width of the output layer).
        n_training_steps: Total optimizer steps for the linear LR schedule.
            When ``None`` no scheduler is configured.
        n_warmup_steps: Warm-up steps of the LR schedule (``None`` means 0).
    """

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        # Loss is computed on raw logits: BCEWithLogitsLoss fuses the sigmoid
        # and is numerically more stable than sigmoid followed by BCELoss.
        self.criterion = nn.BCEWithLogitsLoss()
        self.training_step_outputs = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, probabilities)`` for a batch.

        ``loss`` is ``0`` when ``labels`` is not given; ``probabilities`` are
        the sigmoid of the classifier output.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.pooler_output)
        loss = 0
        if labels is not None:
            loss = self.criterion(logits, labels)
        return loss, torch.sigmoid(logits)

    def _step(self, batch, metric_name):
        """Run a forward pass on ``batch`` and log its loss as ``metric_name``."""
        loss, outputs = self(
            batch["input_ids"], batch["attention_mask"], batch["labels"]
        )
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss, outputs

    def training_step(self, batch, batch_idx):
        loss, outputs = self._step(batch, "train_loss")
        # Keep a detached copy only: storing the live loss would retain every
        # step's autograd graph until the list is cleared.
        self.training_step_outputs.append(loss.detach())
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_idx):
        loss, outputs = self._step(batch, "val_loss")
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def test_step(self, batch, batch_idx):
        _, outputs = self._step(batch, "test_loss")
        return outputs

    def on_train_epoch_end(self):
        """Log the mean training loss of the epoch and reset the buffer."""
        if self.training_step_outputs:
            epoch_average = torch.stack(self.training_step_outputs).mean()
            self.log("training_epoch_average", epoch_average)
        # Always clear so the buffer cannot grow across epochs.
        self.training_step_outputs.clear()

    def configure_optimizers(self):
        """Return AdamW, plus a per-step linear warm-up schedule when the
        number of training steps is known."""
        optimizer = AdamW(self.parameters(), lr=2e-5)
        if self.n_training_steps is None:
            # The schedule cannot be built without a step budget (e.g. a
            # model constructed only for inference), so use a constant rate.
            return optimizer
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps or 0,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )

In [None]:
# Visualise the linear warm-up / decay schedule on a throwaway model.
warmup_steps = 20
total_training_steps = 100
dummy_model = nn.Linear(2, 1)
optimizer = AdamW(params=dummy_model.parameters(), lr=0.001)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_steps,
  num_training_steps=total_training_steps,
)

# Step the optimizer, then the scheduler, recording the rate after each update.
learning_rate_history = []
for step in range(total_training_steps):
  optimizer.step()
  scheduler.step()
  learning_rate_history.append(scheduler.get_last_lr()[0])

plt.plot(learning_rate_history, label="learning rate")
plt.axvline(x=warmup_steps, color="red", linestyle=(0, (5, 10)), label="warmup end")
plt.xlabel("Step")
plt.ylabel("Learning rate")
plt.legend()
plt.tight_layout();

In [None]:
# One optimizer step is taken per training batch, so ask the data module's
# own loader for the batch count. Deriving it from BATCH_SIZE undercounts
# when the module was built with a different batch size, which makes the
# linear schedule reach a zero learning rate before training ends; the
# loader length also includes the final partial batch.
steps_per_epoch = len(data_module.train_dataloader())
total_training_steps = steps_per_epoch * N_EPOCHS

In [None]:
# Use one fifth of all training steps for learning-rate warm-up.
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

In [None]:
# Instance of the current model: one output per binarizer class, with the
# schedule lengths computed above.
model = RameauLabelTagger(
  n_classes=len(mlb.classes_),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

In [None]:
# Smoke test: forward pass of the not-yet-trained model on the sample batch
# (no labels passed, so the returned loss is ignored).
_, predictions = model(sample_batch["input_ids"], sample_batch["attention_mask"])
predictions

In [None]:
# Binary cross-entropy between the predictions above and the batch labels.
criterion = nn.BCELoss()
criterion(predictions, sample_batch["labels"])

In [None]:
# Keep only the single best checkpoint, chosen by the lowest "val_loss",
# under ./checkpoints.
checkpoint_callback = ModelCheckpoint(
  dirpath="./checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [None]:
# Write TensorBoard logs under lightning_logs/Rameau.
logger = TensorBoardLogger("lightning_logs", name="Rameau")

In [None]:
# Early stopping on "val_loss" with a patience of 2.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [None]:
# Let Lightning pick the accelerator: a hard-coded "gpu" raises on hosts
# without one, although the rest of the notebook falls back to the CPU.
trainer = pl.Trainer(
  logger=logger,
  callbacks=[early_stopping_callback, checkpoint_callback],
  max_epochs=N_EPOCHS,
  devices=1,
  accelerator="auto",
  enable_progress_bar=True,
)

In [None]:
# Train the model with the loaders provided by the data module.
trainer.fit(model, data_module)

In [None]:
# Run the test loop with the model/data attached by the previous fit.
# NOTE(review): this needs a `test_dataloader` on the data module — confirm
# one is defined.
trainer.test()

In [None]:
# Predictions
# Reload the best checkpoint; n_classes is passed again as a constructor
# argument. The model is then put in eval mode and frozen for inference.
trained_model = RameauLabelTagger.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  n_classes=len(mlb.classes_)
)
trained_model.eval()
trained_model.freeze()

In [None]:
# Evaluation
# The file-level `import tqdm` binds the module, which is not callable;
# import the progress-bar callable itself for the loop below.
from tqdm import tqdm

# NOTE(review): the data module was built with its default max_token_len
# (128) while evaluation pads/truncates to 512 tokens — confirm intended.
MAX_TOKEN_COUNT = 512


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)
val_dataset = RameauLabelDataset(
  val_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
  for item in tqdm(val_dataset):
    _, prediction = trained_model(
      item["input_ids"].unsqueeze(dim=0).to(device),
      item["attention_mask"].unsqueeze(dim=0).to(device)
    )
    predictions.append(prediction.flatten())
    labels.append(item["labels"].int())

# One row per validation item, moved back to the CPU for the metrics.
predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

## Metrics

In [None]:
# Accuracy
# NOTE(review): `accuracy` is not imported in the visible imports (both
# candidate imports are commented out); presumably it comes from
# `from utils_text_processing import *` — confirm.
THRESHOLD = 0.7
accuracy(predictions, labels, threshold=THRESHOLD)

In [None]:
# AUROC
# Computed separately for each label column.
# NOTE(review): `auroc` is not imported in the visible imports (both
# candidate imports are commented out); presumably it comes from
# `from utils_text_processing import *` — confirm.
print("AUROC per tag")
for i, name in enumerate(mlb.classes_):
  tag_auroc = auroc(predictions[:, i], labels[:, i], pos_label=1)
  print(f"{name}: {tag_auroc}")

In [None]:
# Classification report
# Scores above THRESHOLD become 1, everything else 0, before scoring.
upper, lower = 1, 0
y_true = labels.numpy()
y_pred = np.where(predictions.numpy() > THRESHOLD, upper, lower)

print(classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0))