<a href="https://colab.research.google.com/github/antoniobelotti/HVD/blob/main/6_ensemble_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup env

In [None]:
%%capture

!python -m pip install --upgrade
!pip install --no-cache-dir transformers sentencepiece datasets
!pip install accelerate -U
!pip install scikit-multilearn

import functools
import pandas as pd
import time
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas() # to use progress_apply
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import (
  AutoModelForSequenceClassification,
  AutoModel,
  AutoTokenizer,
  TrainingArguments,
  Trainer,
  EarlyStoppingCallback,
  IntervalStrategy,
  EvalPrediction,
  AdamW,
  get_linear_schedule_with_warmup
)
from datasets import (
  load_from_disk,
  DatasetDict,
  concatenate_datasets
)

import torch

from sklearn.model_selection import (
  StratifiedKFold,
  cross_validate,
  train_test_split
)
from sklearn.metrics import (
  make_scorer,
  accuracy_score,
  precision_score,
  recall_score,
  f1_score,
  confusion_matrix,
  classification_report
)

%matplotlib inline

In [None]:
import pathlib
from google.colab import drive

# Mount Google Drive; every project path below lives under the mount point.
drive.mount('/content/gdrive')

# Project root on Drive, with sub-folders for models and data.
BASE_PATH = pathlib.Path("/content/gdrive/MyDrive/human_value_detection")
MODELS_PATH = BASE_PATH.joinpath("models")
DATA_PATH = BASE_PATH.joinpath("data")

Mounted at /content/gdrive


In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Names of the 20 labels, one per line.
LABEL_NAMES = [
  "Self-direction: thought",
  "Self-direction: action",
  "Stimulation",
  "Hedonism",
  "Achievement",
  "Power: dominance",
  "Power: resources",
  "Face",
  "Security: personal",
  "Security: societal",
  "Tradition",
  "Conformity: rules",
  "Conformity: interpersonal",
  "Humility",
  "Benevolence: caring",
  "Benevolence: dependability",
  "Universalism: concern",
  "Universalism: nature",
  "Universalism: tolerance",
  "Universalism: objectivity",
]

## Dataset

In [None]:
from datasets import load_dataset

# Load the "main" configuration of the webis/Touche23-ValueEval dataset.
# The run recorded below generated train, validation and test splits.
dataset = load_dataset("webis/Touche23-ValueEval", "main")

def concatenate(sample):
  """Build the model input text for a batch of arguments.

  For every row of the batch, the premise, stance and conclusion are joined
  (in that order) with single spaces.

  Args:
    sample: batch mapping with parallel "Premise", "Stance" and "Conclusion"
      sequences of strings.

  Returns:
    A dict with a single "text" key holding one joined string per row.
  """
  joined_rows = []
  for premise, stance, conclusion in zip(sample["Premise"], sample["Stance"], sample["Conclusion"]):
    joined_rows.append(" ".join((premise, stance, conclusion)))
  return {"text": joined_rows}

# Replace the raw argument columns with the single concatenated "text" column.
dataset = dataset.map(
  concatenate,
  batched=True,
  remove_columns=["Argument ID", "Conclusion", "Stance", "Premise"],
)

Downloading builder script:   0%|          | 0.00/32.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/254k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/89.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/290k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

### Stratified holdout set

In [None]:
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset

# Carve a multi-label-stratified 5% holdout out of the training split.
# The split overwrites dataset["train"], so it is guarded: re-running this cell
# must not take another 5% out of the already reduced training set.
if "holdout" not in dataset:
  new_X_train, new_y_train, X_holdout, y_holdout = iterative_train_test_split(
    # The texts are reshaped to a single-column 2D array for the splitter.
    np.array(dataset["train"]["text"]).reshape(-1, 1),
    np.array(dataset["train"]["Labels"]),
    test_size=0.05,
  )

  dataset["train"] = Dataset.from_dict({
      'Labels': new_y_train,
      'text': new_X_train.flatten()
  })

  dataset["holdout"] = Dataset.from_dict({
      'Labels': y_holdout,
      'text': X_holdout.flatten()
  })

dataset

DatasetDict({
    train: Dataset({
        features: ['Labels', 'text'],
        num_rows: 5100
    })
    validation: Dataset({
        features: ['Labels', 'text'],
        num_rows: 1896
    })
    test: Dataset({
        features: ['Labels', 'text'],
        num_rows: 1576
    })
    holdout: Dataset({
        features: ['Labels', 'text'],
        num_rows: 293
    })
})

# Classifiers

In [None]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels):
  """Return the best macro-F1 reachable by thresholding the sigmoid of `predictions`.

  Args:
    predictions: raw model outputs, expected as (batch_size, num_labels).
    labels: reference labels, passed to `f1_score` as the true values.

  Returns:
    {"f1_macro": score}, where score is the maximum macro-F1 over the
    thresholds in np.arange(0.1, 1, 0.05).
  """
  probabilities = torch.sigmoid(torch.Tensor(predictions))

  def macro_f1_at(cutoff):
    # 1.0 where the probability reaches the cutoff, 0.0 elsewhere.
    binarized = (probabilities >= cutoff).numpy().astype(float)
    return f1_score(labels, binarized, average="macro")

  return {
    "f1_macro": max(macro_f1_at(cutoff) for cutoff in np.arange(0.1, 1, 0.05))
  }

def compute_metrics(p: EvalPrediction):
  """Metric callback handed to the Trainer.

  Takes the first element when `p.predictions` is a tuple, otherwise uses it
  as is, and scores it against `p.label_ids` with `multi_label_metrics`.

  Returns:
    The metrics dict produced by `multi_label_metrics`.
  """
  raw_outputs = p.predictions
  logits = raw_outputs[0] if isinstance(raw_outputs, tuple) else raw_outputs
  return multi_label_metrics(predictions=logits, labels=p.label_ids)

def model_init(model_name):
  """Create a fresh 20-label multi-label sequence classifier from a checkpoint.

  `get_trainer` binds `model_name` with functools.partial and hands the result
  to the Trainer as its `model_init` factory.

  Args:
    model_name: checkpoint identifier passed to `from_pretrained`.

  Returns:
    The model returned by `AutoModelForSequenceClassification.from_pretrained`.
  """
  classifier_kwargs = {
    "problem_type": "multi_label_classification",
    "num_labels": 20,
  }
  return AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

def get_trainer(base_model_name, encoded_ds, tokenizer, seed=42, batch_size=8, epochs=10):
  """Build a Trainer that fine-tunes `base_model_name` for multi-label classification.

  Args:
    base_model_name: checkpoint to fine-tune; also used to name the output
      directory ("<base_model_name>-HVD").
    encoded_ds: tokenized dataset with at least "train" and "validation" splits.
    tokenizer: tokenizer matching `base_model_name`, passed to the Trainer.
    seed: seed forwarded to TrainingArguments.
    batch_size: per-device train and eval batch size (default 8, as before).
    epochs: number of passes over the training split used to derive the step
      budget (default 10, as before).

  Returns:
    A configured, not yet trained, Trainer with early stopping (patience 2)
    that reloads the checkpoint with the best validation macro-F1 at the end.
  """
  # Step budget: full batches per epoch times epochs; the first fifth is warmup.
  steps_per_epoch = len(encoded_ds["train"]) // batch_size
  total_training_steps = steps_per_epoch * epochs
  warmup_steps = total_training_steps // 5

  args = TrainingArguments(
    f"{base_model_name}-HVD",
    # Evaluate and checkpoint on step intervals; no explicit eval_steps or
    # save_steps is set, so the library's default interval applies.
    evaluation_strategy = IntervalStrategy.STEPS,
    save_strategy = IntervalStrategy.STEPS,
    max_steps=total_training_steps,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps = 1,
    weight_decay=0.01,
    warmup_steps=warmup_steps,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # TrainingArguments fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    seed=seed
  )

  # The Trainer takes a model factory, so the checkpoint name is bound here.
  base_model_init_fn = functools.partial(model_init, base_model_name)

  trainer = Trainer(
    model_init=base_model_init_fn,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
  )

  return trainer

In [None]:
def encode(tokenizer, max_len, batch):
  """Tokenize a batch of texts and attach float labels.

  Args:
    tokenizer: callable tokenizer; invoked with max-length padding and truncation.
    max_len: length every sequence is padded or truncated to.
    batch: mapping with a "text" sequence and a parallel "Labels" sequence.

  Returns:
    A dict with "input_ids", "attention_mask" and "labels" (a float array).
  """
  tokenized = tokenizer(
    batch["text"],
    padding="max_length",
    truncation=True,
    max_length=max_len,
  )
  features = {field: tokenized[field] for field in ("input_ids", "attention_mask")}
  # Labels are cast to float for the multi-label training objective.
  features["labels"] = np.array(batch["Labels"], dtype=float)
  return features

In [None]:
# Base model names: the checkpoints that make up the ensemble.
BMN = [
  "xlm-roberta-base",
  "danschr/roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165",
  "microsoft/deberta-v3-base"
]

# One slow (non-fast) tokenizer per checkpoint, in the same order as BMN.
tokenizers = []
for checkpoint_name in BMN:
  tokenizers.append(
    AutoTokenizer.from_pretrained(checkpoint_name, use_fast=False, ignore_mismatched_sizes=True)
  )

def encode_with(tokenizer):
  """Tokenize every split of the global `dataset` with `tokenizer`.

  The padding length is the longest tokenized training text, capped at 512
  tokens. Every split is then encoded to that fixed length with `encode`.

  Args:
    tokenizer: tokenizer used both to measure lengths and to encode.

  Returns:
    The encoded dataset, holding only the columns produced by `encode`.
  """
  # truncation=True makes the 512-token cap explicit instead of relying on the
  # library's implicit fallback, which logs a warning for every row.
  max_len = (pd.DataFrame(dataset["train"])["text"]
              .progress_apply(lambda x: len(tokenizer.encode(x, max_length=512, truncation=True)))
              .max())

  # pandas returns a numpy integer here; pass a plain int to the tokenizer.
  encode_fn = functools.partial(encode, tokenizer, int(max_len))
  encoded_ds = dataset.map(encode_fn, batched=True, remove_columns=dataset["train"].column_names)
  return encoded_ds

encoded_datasets = [encode_with(t) for t in tokenizers]

n_models = len(BMN)

# 1) train all models
# Each model gets its own seed, derived from a fixed base rather than the wall
# clock, so a re-run trains with the same seeds and the seeds are known.
BASE_SEED = 42
trainers = []
for model_idx, (base_model_name, tok, ds) in enumerate(zip(BMN, tokenizers, encoded_datasets)):
  seed = BASE_SEED + model_idx
  trainer = get_trainer(base_model_name, ds, tok, seed)
  trainer.train()

  trainers.append(trainer)

# 2) predict on holdout
holdout_predictions = []
for trainer, tok, ds in zip(trainers, tokenizers, encoded_datasets):
  holdout_split = ds["holdout"]
  # Every encoding is built from the same holdout rows, so the labels kept from
  # the last iteration apply to all models.
  y_holdout = np.array(holdout_split["labels"], dtype="int")
  X_holdout = holdout_split.remove_columns("labels")

  holdout_predictions.append(trainer.predict(X_holdout).predictions)

# 3) average all predictions (raw model outputs, averaged across models)
avg_holdout_predictions = np.array(holdout_predictions).mean(axis=0)

# 4) select best global threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(torch.Tensor(avg_holdout_predictions))

best_t, best_f1 = 0, 0
for threshold in np.arange(0.1, 1, 0.05):
  # 1.0 where the averaged probability reaches the threshold, 0.0 elsewhere.
  y_pred = (probs >= threshold).numpy().astype(float)

  f1 = f1_score(y_holdout, y_pred, average="macro")
  if f1 > best_f1:
    # Strict comparison: on ties the lowest threshold wins.
    best_t, best_f1 = threshold, f1

# 5) predict on test dataset
test_predictions = []
for trainer, ds in zip(trainers, encoded_datasets):
  test_split = ds["test"]
  y_test = np.array(test_split["labels"], dtype="int")
  X_test = test_split.remove_columns("labels")

  test_predictions.append(trainer.predict(X_test).predictions)

# 6) average all predictions (raw model outputs, averaged across models)
avg_test_predictions = np.array(test_predictions).mean(axis=0)

# 7) binarize using the threshold found at step 4
probs = sigmoid(torch.Tensor(avg_test_predictions))
y_pred = (probs >= best_t).numpy().astype(float)

ensemble_f1_score = f1_score(y_test, y_pred, average="macro")

print(f"Ensemble final Macro F1 score: {ensemble_f1_score}")

  0%|          | 0/5100 [00:00<?, ?it/s]

Map:   0%|          | 0/5100 [00:00<?, ? examples/s]

Map:   0%|          | 0/293 [00:00<?, ? examples/s]

  0%|          | 0/5100 [00:00<?, ?it/s]

Map:   0%|          | 0/5100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Map:   0%|          | 0/293 [00:00<?, ? examples/s]

  0%|          | 0/5100 [00:00<?, ?it/s]

Map:   0%|          | 0/5100 [00:00<?, ? examples/s]

Map:   0%|          | 0/293 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,F1 Macro
500,0.52,0.413042,0.25036
1000,0.4094,0.387741,0.283792
1500,0.3609,0.356044,0.345874
2000,0.3361,0.350253,0.393805
2500,0.3076,0.340541,0.408096
3000,0.2873,0.350701,0.415401
3500,0.2745,0.340894,0.433424
4000,0.2575,0.342567,0.431891
4500,0.2437,0.343691,0.437424
5000,0.2301,0.344962,0.434126




Step,Training Loss,Validation Loss,F1 Macro
500,0.4853,0.360865,0.377553
1000,0.3247,0.314682,0.452119
1500,0.2857,0.30781,0.488332
2000,0.2597,0.307347,0.502387
2500,0.2248,0.309842,0.513654
3000,0.1933,0.317896,0.51847
3500,0.166,0.325309,0.520418
4000,0.1424,0.338274,0.521884
4500,0.122,0.348369,0.52305
5000,0.0992,0.356379,0.520413




Step,Training Loss,Validation Loss,F1 Macro
500,0.5171,0.405925,0.254609
1000,0.3838,0.362787,0.315047
1500,0.3444,0.347543,0.381731
2000,0.3122,0.336895,0.41991
2500,0.2876,0.33892,0.433587
3000,0.2623,0.341323,0.441752
3500,0.247,0.351059,0.448873
4000,0.2254,0.358372,0.451359
4500,0.2071,0.363259,0.449307
5000,0.1935,0.37388,0.448352


Ensemble final Macro F1 score: 0.48150322830669
