<a href="https://colab.research.google.com/github/antoniobelotti/HVD/blob/main/5_other_transformers_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup env

In [None]:
%%capture

!python -m pip install --upgrade
!pip install --no-cache-dir transformers sentencepiece datasets
!pip install accelerate -U

import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas() # to use progress_apply
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import (
  AutoModelForSequenceClassification,
  AutoModel,
  AutoTokenizer,
  TrainingArguments,
  Trainer,
  EarlyStoppingCallback,
  IntervalStrategy,
  EvalPrediction,
  AdamW,
  get_linear_schedule_with_warmup
)
from datasets import (
  load_from_disk,
  DatasetDict,
  concatenate_datasets
)

import torch

from sklearn.model_selection import (
  StratifiedKFold,
  cross_validate,
  train_test_split
)
from sklearn.metrics import (
  make_scorer,
  accuracy_score,
  precision_score,
  recall_score,
  f1_score,
  confusion_matrix,
  classification_report
)

%matplotlib inline

In [None]:
from google.colab import drive
import pathlib

# Mount the user's Google Drive inside the Colab VM.
drive.mount('/content/gdrive')

# Project folders on the mounted drive.
BASE_PATH = pathlib.Path("/content/gdrive/MyDrive/human_value_detection")
MODELS_PATH = BASE_PATH.joinpath("models")
DATA_PATH = BASE_PATH.joinpath("data")

Mounted at /content/gdrive


In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  DEVICE = "cuda"
else:
  DEVICE = "cpu"

# Names of the 20 target labels.
LABEL_NAMES = [
  "Self-direction: thought",
  "Self-direction: action",
  "Stimulation",
  "Hedonism",
  "Achievement",
  "Power: dominance",
  "Power: resources",
  "Face",
  "Security: personal",
  "Security: societal",
  "Tradition",
  "Conformity: rules",
  "Conformity: interpersonal",
  "Humility",
  "Benevolence: caring",
  "Benevolence: dependability",
  "Universalism: concern",
  "Universalism: nature",
  "Universalism: tolerance",
  "Universalism: objectivity",
]

## Original dataset

In [None]:
from datasets import load_dataset

# Load the "main" configuration of the Touche23-ValueEval dataset
# (the splits used below are "train", "validation" and "test").
dataset = load_dataset(path="webis/Touche23-ValueEval", name="main")

def encode(tokenizer, max_len, sample):
  """Tokenize a batch of arguments for multi-label training.

  Each example's text is "Premise Stance Conclusion" (joined with single
  spaces), padded/truncated to `max_len` tokens.

  Args:
    tokenizer: callable invoked as
      `tokenizer(texts, padding=..., truncation=..., max_length=...)` and
      returning a mapping with "input_ids" and "attention_mask".
    max_len: length every sequence is padded or truncated to.
    sample: batch mapping with the columns "Premise", "Stance",
      "Conclusion" and "Labels".

  Returns:
    dict with "input_ids", "attention_mask" and float "labels".
  """
  texts = []
  for premise, stance, conclusion in zip(
    sample["Premise"], sample["Stance"], sample["Conclusion"]
  ):
    texts.append(" ".join((premise, stance, conclusion)))

  tokenized = tokenizer(
    texts,
    padding="max_length",
    truncation=True,
    max_length=max_len,
  )

  # Labels are cast to float (the original did the same cast).
  float_labels = np.array(sample["Labels"], dtype=float)

  return dict(
    input_ids=tokenized["input_ids"],
    attention_mask=tokenized["attention_mask"],
    labels=float_labels,
  )

Downloading builder script:   0%|          | 0.00/32.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/254k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/89.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/290k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# Classifiers

## common

In [None]:
import functools

In [None]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels):
  """Return the best macro-F1 obtained over a sweep of decision thresholds.

  The raw `predictions` (logits of shape (batch_size, num_labels)) are passed
  through a sigmoid, binarized at every threshold in
  `np.arange(0.1, 1, 0.05)`, and scored against `labels` with macro-averaged
  F1. Only the highest score is reported; note that the threshold is
  therefore selected on the very labels passed in.

  Returns:
    {"f1_macro": <highest macro-F1 across the thresholds>}
  """
  probs = torch.nn.Sigmoid()(torch.Tensor(predictions))

  def macro_f1_at(threshold):
    # 1.0 where the probability reaches the threshold, 0.0 elsewhere
    y_pred = (probs >= threshold).numpy().astype(float)
    return f1_score(labels, y_pred, average="macro")

  best_f1 = max(macro_f1_at(threshold) for threshold in np.arange(0.1, 1, 0.05))

  return {"f1_macro": best_f1}

def compute_metrics(p: EvalPrediction):
  """Adapter that scores an `EvalPrediction` with `multi_label_metrics`.

  When `p.predictions` is a tuple only its first element is scored;
  `p.label_ids` supplies the reference labels.
  """
  raw_predictions = p.predictions
  if isinstance(raw_predictions, tuple):
    raw_predictions = raw_predictions[0]

  return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

def model_init(model_name):
  """Load `model_name` with a 20-output multi-label classification setup."""
  classifier_config = {
    "problem_type": "multi_label_classification",
    "num_labels": 20,
  }
  return AutoModelForSequenceClassification.from_pretrained(
    model_name, **classifier_config
  )

def get_trainer(base_model_name, encoded_ds, tokenizer, seed=42, batch_size=8, epochs=10):
  """Build a `Trainer` that fine-tunes `base_model_name` as a multi-label classifier.

  Args:
    base_model_name: checkpoint name/path forwarded to `model_init`; the
      output directory is `"{base_model_name}-HVD"`.
    encoded_ds: mapping with already-encoded "train" and "validation" splits.
    tokenizer: tokenizer handed to the `Trainer`.
    seed: random seed stored in the `TrainingArguments`.
    batch_size: per-device train and eval batch size (default 8).
    epochs: number of passes over the training split (default 10).

  Returns:
    A configured, not yet trained, `Trainer` that evaluates and checkpoints
    on a step schedule, restores the checkpoint with the best "f1_macro" at
    the end, and stops early after 2 evaluations without improvement.
  """
  # Ceiling division: the final, partially filled batch of an epoch is still
  # an optimizer step, so flooring made `max_steps` stop short of `epochs`
  # full epochs.
  steps_per_epoch = -(-len(encoded_ds["train"]) // batch_size)
  total_training_steps = steps_per_epoch * epochs
  # Linear warm-up over the first 20% of the training steps.
  warmup_steps = total_training_steps // 5

  args = TrainingArguments(
    f"{base_model_name}-HVD",
    evaluation_strategy = IntervalStrategy.STEPS,
    save_strategy = IntervalStrategy.STEPS,
    save_steps = 500,
    max_steps=total_training_steps,
    save_total_limit = 1,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps = 1,
    num_train_epochs=epochs,
    weight_decay=0.01,
    warmup_steps=warmup_steps,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Mixed precision needs a CUDA device; requesting it unconditionally made
    # the notebook fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    seed=seed
  )

  # Zero-argument factory so the Trainer can (re)instantiate the model itself.
  base_model_init_fn = functools.partial(model_init, base_model_name)

  trainer = Trainer(
    model_init=base_model_init_fn,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
  )

  return trainer

## DeBERTa v3

In [None]:
n_retry=3

base_model_name = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=False)

# Size max_len on the text the model actually receives. `encode` tokenizes
# "Premise Stance Conclusion", so measuring the premise alone truncated the
# stance/conclusion of the longest training examples.
train_split = dataset["train"]
train_texts = pd.Series([
  " ".join(parts)
  for parts in zip(train_split["Premise"], train_split["Stance"], train_split["Conclusion"])
])
max_len = int(
  train_texts
    .progress_apply(lambda x: len(tokenizer.encode(x, truncation=True, max_length=512)))
    .max()
)

encode_fn = functools.partial(encode, tokenizer, max_len)
encoded_ds = dataset.map(encode_fn, batched=True, remove_columns=dataset["train"].column_names)

# The test inputs and labels are the same for every retry: build them once.
X_test = encoded_ds["test"].remove_columns("labels")
y_test = np.array(encoded_ds["test"]["labels"], dtype="int")

# NOTE(review): multi_label_metrics keeps the best score over its threshold
# sweep, so the test score below is tuned on the test labels themselves.
scores = np.zeros(n_retry)
for i in range(n_retry):
  # Distinct but fixed seed per retry, so the reported mean/std is reproducible
  # (a wall-clock seed was different on every run and never recorded).
  seed = 42 + i

  trainer = get_trainer(base_model_name, encoded_ds, tokenizer, seed)
  trainer.train()

  scores[i] = multi_label_metrics(
    trainer.predict(X_test).predictions,
    y_test
  )["f1_macro"]

scores.mean(), scores.std()

  0%|          | 0/5393 [00:00<?, ?it/s]



Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.39674,0.262237
1,No log,0.345005,0.380774
2,0.389100,0.327746,0.442801
4,0.389100,0.340107,0.443026
4,0.389100,0.337937,0.473974
5,0.234200,0.351846,0.461758
6,0.234200,0.38151,0.474659
8,0.234200,0.412511,0.47569
8,0.119700,0.443716,0.482178
9,0.119700,0.473257,0.477144




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.380234,0.28278
1,No log,0.33709,0.401236
2,0.386000,0.330775,0.454445
4,0.386000,0.328359,0.455414
4,0.386000,0.33951,0.472935
5,0.226900,0.356649,0.48705
6,0.226900,0.391136,0.488864


Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.380234,0.28278
1,No log,0.33709,0.401236
2,0.386000,0.330775,0.454445
4,0.386000,0.328359,0.455414
4,0.386000,0.33951,0.472935
5,0.226900,0.356649,0.48705
6,0.226900,0.391136,0.488864
8,0.226900,0.429338,0.491468
8,0.110900,0.459608,0.491901
9,0.110900,0.490765,0.486319




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.387843,0.263016
1,No log,0.33961,0.418722
2,0.390000,0.330071,0.431617
4,0.390000,0.33097,0.452801
4,0.390000,0.344669,0.470041
5,0.222200,0.364297,0.478267
6,0.222200,0.402366,0.486451
8,0.222200,0.431117,0.483183
8,0.098900,0.472689,0.470602


(0.4889187020457393, 0.0049050040133361205)

In [None]:
# premise only        (0.48085631027824155, 0.00844434062410289)
# premise+stance+conc (0.4889187020457393, 0.0049050040133361205)

## XLNet

In [None]:
n_retry=3

model_name = "xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Size max_len on the text the model actually receives. `encode` tokenizes
# "Premise Stance Conclusion", so measuring the premise alone truncated the
# stance/conclusion of the longest training examples.
train_split = dataset["train"]
train_texts = pd.Series([
  " ".join(parts)
  for parts in zip(train_split["Premise"], train_split["Stance"], train_split["Conclusion"])
])
max_len = int(
  train_texts
    .progress_apply(lambda x: len(tokenizer.encode(x, truncation=True, max_length=512)))
    .max()
)

encode_fn = functools.partial(encode, tokenizer, max_len)
encoded_ds = dataset.map(encode_fn, batched=True, remove_columns=dataset["train"].column_names)

# The test inputs and labels are the same for every retry: build them once.
X_test = encoded_ds["test"].remove_columns("labels")
y_test = np.array(encoded_ds["test"]["labels"], dtype="int")

# NOTE(review): multi_label_metrics keeps the best score over its threshold
# sweep, so the test score below is tuned on the test labels themselves.
scores = np.zeros(n_retry)
for i in range(n_retry):
  # Distinct but fixed seed per retry, so the reported mean/std is reproducible
  # (a wall-clock seed was different on every run and never recorded).
  seed = 42 + i
  trainer = get_trainer(model_name, encoded_ds, tokenizer, seed)

  trainer.train()

  scores[i] = multi_label_metrics(
    trainer.predict(X_test).predictions,
    y_test
  )["f1_macro"]

scores.mean(), scores.std()

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

  0%|          | 0/5393 [00:00<?, ?it/s]

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.395403,0.256904
1,No log,0.344726,0.37733
2,0.391100,0.33419,0.419514
4,0.391100,0.336243,0.434721
4,0.391100,0.337039,0.448197
5,0.248100,0.347893,0.454769


Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.395403,0.256904
1,No log,0.344726,0.37733
2,0.391100,0.33419,0.419514
4,0.391100,0.336243,0.434721
4,0.391100,0.337039,0.448197
5,0.248100,0.347893,0.454769
6,0.248100,0.361583,0.456348
8,0.248100,0.372706,0.460333
8,0.152200,0.393465,0.450515
9,0.152200,0.403604,0.451642




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.391761,0.265807
1,No log,0.343561,0.386214
2,0.383100,0.340796,0.415174
4,0.383100,0.329406,0.456945
4,0.383100,0.335228,0.453879
5,0.244000,0.350656,0.452676




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.386724,0.283253
1,No log,0.342179,0.403007
2,0.383800,0.329571,0.436728
4,0.383800,0.331026,0.452245
4,0.383800,0.332787,0.463501
5,0.242800,0.347652,0.461472
6,0.242800,0.359962,0.455482


(0.45389181789886884, 0.011488460619142956)

In [None]:
# premise only  (0.44536362641990657, 0.011691843766579908)
# all           (0.45389181789886884, 0.011488460619142956)

## danschr/roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165

In [None]:
n_retry=3

model_name = "danschr/roberta-large-BS_16-EPOCHS_8-LR_5e-05-ACC_GRAD_2-MAX_LENGTH_165"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Size max_len on the text the model actually receives. `encode` tokenizes
# "Premise Stance Conclusion", so measuring the premise alone truncated the
# stance/conclusion of the longest training examples.
train_split = dataset["train"]
train_texts = pd.Series([
  " ".join(parts)
  for parts in zip(train_split["Premise"], train_split["Stance"], train_split["Conclusion"])
])
max_len = int(
  train_texts
    .progress_apply(lambda x: len(tokenizer.encode(x, truncation=True, max_length=512)))
    .max()
)

encode_fn = functools.partial(encode, tokenizer, max_len)
encoded_ds = dataset.map(encode_fn, batched=True, remove_columns=dataset["train"].column_names)

# The test inputs and labels are the same for every retry: build them once.
X_test = encoded_ds["test"].remove_columns("labels")
y_test = np.array(encoded_ds["test"]["labels"], dtype="int")

# NOTE(review): multi_label_metrics keeps the best score over its threshold
# sweep, so the test score below is tuned on the test labels themselves.
scores = np.zeros(n_retry)
for i in range(n_retry):
  # Distinct but fixed seed per retry, so the reported mean/std is reproducible
  # (a wall-clock seed was different on every run and never recorded).
  seed = 42 + i
  trainer = get_trainer(model_name, encoded_ds, tokenizer, seed)

  trainer.train()

  scores[i] = multi_label_metrics(
    trainer.predict(X_test).predictions,
    y_test
  )["f1_macro"]

scores.mean(), scores.std()

Downloading (…)okenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

  0%|          | 0/5393 [00:00<?, ?it/s]

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



Step,Training Loss,Validation Loss,F1 Macro
500,0.4372,0.334463,0.408132
1000,0.3099,0.319045,0.481937
1500,0.2936,0.325581,0.464743
2000,0.2782,0.317941,0.487402
2500,0.2377,0.32952,0.481882
3000,0.2224,0.334976,0.495263
3500,0.1915,0.340839,0.507616
4000,0.164,0.351423,0.504583
4500,0.1306,0.364103,0.50845


Step,Training Loss,Validation Loss,F1 Macro
500,0.4372,0.334463,0.408132
1000,0.3099,0.319045,0.481937
1500,0.2936,0.325581,0.464743
2000,0.2782,0.317941,0.487402
2500,0.2377,0.32952,0.481882
3000,0.2224,0.334976,0.495263
3500,0.1915,0.340839,0.507616
4000,0.164,0.351423,0.504583
4500,0.1306,0.364103,0.50845
5000,0.1054,0.377503,0.509142




Step,Training Loss,Validation Loss,F1 Macro
500,0.4353,0.337469,0.418707
1000,0.3088,0.319261,0.456015
1500,0.2953,0.31852,0.478932
2000,0.2774,0.315229,0.483823
2500,0.2449,0.320904,0.495561
3000,0.2211,0.334018,0.494164
3500,0.1995,0.338496,0.51046
4000,0.1709,0.343781,0.520383
4500,0.1351,0.362931,0.506109
5000,0.1122,0.378371,0.505105




Step,Training Loss,Validation Loss,F1 Macro
500,0.4275,0.332399,0.409781
1000,0.3088,0.31776,0.474346
1500,0.2969,0.32137,0.480448
2000,0.2752,0.309032,0.499881
2500,0.2428,0.321608,0.506176
3000,0.2212,0.332905,0.504426
3500,0.1994,0.339578,0.501163


(0.4898307493838356, 0.009832350237969521)

In [None]:
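# NOTE: the "premise only" figure below is identical to the XLNet one and looks copy-pasted - re-run to confirm.
# premise only  (0.44536362641990657, 0.011691843766579908)
# all           (0.4898307493838356, 0.009832350237969521)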