<a href="https://colab.research.google.com/github/antoniobelotti/HVD/blob/main/2_only_premise_vs_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [None]:
%%capture

!python -m pip install --upgrade
!pip install --no-cache-dir transformers sentencepiece datasets
!pip install accelerate -U

import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas() # to use progress_apply
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import (
  AutoModelForSequenceClassification,
  AutoModel,
  AutoTokenizer,
  TrainingArguments,
  Trainer,
  EarlyStoppingCallback,
  IntervalStrategy,
  EvalPrediction,
  AdamW,
  get_linear_schedule_with_warmup
)
from datasets import (
  load_from_disk,
  load_dataset,
  DatasetDict,
  concatenate_datasets
)

import torch

from sklearn.model_selection import (
  StratifiedKFold,
  cross_validate,
  train_test_split
)
from sklearn.metrics import (
  make_scorer,
  accuracy_score,
  precision_score,
  recall_score,
  f1_score,
  confusion_matrix,
  classification_report
)

%matplotlib inline

In [None]:
# Checkpoint name shared by the tokenizer and by every classifier built in model_init().
MODEL_NAME = "distilbert-base-uncased"

# use_fast=False asks transformers for the Python tokenizer implementation.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  DEVICE = "cuda"
else:
  DEVICE = "cpu"

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Dataset

In [None]:
# Human-readable names of the 20 value categories, one per line.
# NOTE(review): presumably in the same order as the dataset's "Labels" vector — confirm.
LABEL_NAMES = [
  "Self-direction: thought",
  "Self-direction: action",
  "Stimulation",
  "Hedonism",
  "Achievement",
  "Power: dominance",
  "Power: resources",
  "Face",
  "Security: personal",
  "Security: societal",
  "Tradition",
  "Conformity: rules",
  "Conformity: interpersonal",
  "Humility",
  "Benevolence: caring",
  "Benevolence: dependability",
  "Universalism: concern",
  "Universalism: nature",
  "Universalism: tolerance",
  "Universalism: objectivity",
]

In [None]:
# Load the "main" configuration of the Touché23-ValueEval dataset.
dataset = load_dataset("webis/Touche23-ValueEval", "main")

# Token count of every training premise, capped at 512 tokens.
# Truncation is requested explicitly: passing `max_length` alone relies on an
# implicit fallback inside the tokenizer and logs a warning.
token_lens = pd.DataFrame(dataset["train"])["Premise"].progress_apply(
  lambda x: len(TOKENIZER.encode(x, truncation=True, max_length=512))
)

# Longest training premise (in tokens); used as the fixed padding/truncation
# length by the encoders below. Cast to a plain Python int so downstream code
# does not receive a numpy scalar.
MAX_TOKENS_LEN = int(token_lens.max())

def encode_premise(sample):
  """Tokenize the ``Premise`` field of a batch and attach float labels.

  Args:
    sample: batch mapping with at least the ``"Premise"`` and ``"Labels"`` keys
      (called through ``dataset.map(..., batched=True)``).

  Returns:
    dict with ``input_ids`` and ``attention_mask`` (padded/truncated to
    ``MAX_TOKENS_LEN``) and ``labels`` as a float numpy array.
  """
  encoding = TOKENIZER(
    sample["Premise"],
    max_length=MAX_TOKENS_LEN,
    truncation=True,
    padding="max_length",
  )

  features = {key: encoding[key] for key in ("input_ids", "attention_mask")}
  # Labels are cast to float because the model is configured for multi-label classification.
  features["labels"] = np.array(sample["Labels"], dtype=float)
  return features

def encode_all(sample):
  """Tokenize ``Premise``, ``Stance`` and ``Conclusion`` joined into one string.

  The three fields of each example are concatenated with single spaces, in that
  order, before tokenization.

  NOTE(review): the result is truncated to ``MAX_TOKENS_LEN``, which is derived
  from premise lengths only, so the end of the joined text (stance/conclusion)
  is cut off whenever the concatenation is longer than that — confirm this is
  intended.

  Args:
    sample: batch mapping with the ``"Premise"``, ``"Stance"``,
      ``"Conclusion"`` and ``"Labels"`` keys.

  Returns:
    dict with ``input_ids``, ``attention_mask`` and float ``labels``.
  """
  fields = zip(sample["Premise"], sample["Stance"], sample["Conclusion"])
  texts = list(map(" ".join, fields))

  encoding = TOKENIZER(
    texts,
    max_length=MAX_TOKENS_LEN,
    truncation=True,
    padding="max_length",
  )

  features = {key: encoding[key] for key in ("input_ids", "attention_mask")}
  features["labels"] = np.array(sample["Labels"], dtype=float)
  return features

# Encode every split twice: once from the premise alone, once from
# premise + stance + conclusion. The raw columns are dropped so that only the
# encoder outputs (input_ids, attention_mask, labels) remain.
encoded_premise = dataset.map(
  encode_premise,
  batched=True,
  remove_columns=dataset["train"].column_names,
)
encoded_all = dataset.map(
  encode_all,
  batched=True,
  remove_columns=dataset["train"].column_names,
)

Downloading builder script:   0%|          | 0.00/32.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

Downloading and preparing dataset touche23-value_eval/main to /root/.cache/huggingface/datasets/webis___touche23-value_eval/main/0.0.2/109738f7f54e5a68f95e3d0b4d07797f6b7e558edce5e29c71cf0668208bfa43...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/254k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/89.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/290k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset touche23-value_eval downloaded and prepared to /root/.cache/huggingface/datasets/webis___touche23-value_eval/main/0.0.2/109738f7f54e5a68f95e3d0b4d07797f6b7e558edce5e29c71cf0668208bfa43. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5393 [00:00<?, ?it/s]

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Map:   0%|          | 0/5393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1896 [00:00<?, ? examples/s]

Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

# Classifiers

## Common

In [None]:
from sklearn.metrics import f1_score

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels):
  """Return the best macro-F1 over a grid of decision thresholds.

  The logits are passed through a sigmoid, binarised at every threshold in
  ``np.arange(0.1, 1, 0.05)``, and the highest macro-averaged F1 is reported.

  NOTE(review): the threshold is selected on the very labels being scored, so
  the returned value is an optimistic estimate when used on a held-out split.

  Args:
    predictions: raw logits of shape (batch_size, num_labels).
    labels: binary indicator matrix with the same shape.

  Returns:
    dict with the single key ``"f1_macro"``.
  """
  # Logits -> independent per-label probabilities.
  probs = torch.sigmoid(torch.Tensor(predictions))

  def f1_at(threshold):
    y_pred = (probs >= threshold).numpy().astype(float)
    # zero_division=0 yields the same score as the default ("warn" also scores 0)
    # but avoids an UndefinedMetricWarning for every label with no positive
    # prediction at high thresholds.
    return f1_score(labels, y_pred, average="macro", zero_division=0)

  return {
    "f1_macro": max(f1_at(threshold) for threshold in np.arange(0.1, 1, 0.05))
  }

def compute_metrics(p: EvalPrediction):
  """Metric callback for ``Trainer``: macro-F1 from an ``EvalPrediction``.

  Args:
    p: evaluation output; ``p.predictions`` may be an array or a tuple whose
      first element holds the logits, ``p.label_ids`` holds the targets.

  Returns:
    The dict produced by ``multi_label_metrics``.
  """
  logits = p.predictions
  if isinstance(logits, tuple):
    # Only the first element of a tuple output is scored.
    logits = logits[0]

  return multi_label_metrics(predictions=logits, labels=p.label_ids)

def model_init():
  """Build a fresh multi-label sequence classifier from ``MODEL_NAME``.

  The number of output labels is read from the length of the first training
  example's ``Labels`` vector.
  """
  n_labels = len(dataset["train"][0]["Labels"])

  return AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=n_labels,
    problem_type="multi_label_classification",
  )

def get_trainer(name, ds, seed=42):
  """Create a ``Trainer`` for one fine-tuning run.

  Args:
    name: output directory for checkpoints and logs.
    ds: encoded dataset with ``"train"`` and ``"validation"`` splits.
    seed: random seed forwarded to ``TrainingArguments``.

  Returns:
    A ``Trainer`` that evaluates and saves once per epoch, reloads the
    checkpoint with the best validation ``f1_macro`` at the end, and stops
    early after 3 evaluations without improvement.
  """
  args = TrainingArguments(
    name,
    evaluation_strategy = IntervalStrategy.EPOCH,
    save_strategy = IntervalStrategy.EPOCH,
    save_total_limit = 1,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # 8 samples x 2 accumulation steps = effective batch of 16 per device.
    gradient_accumulation_steps = 2,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # the notebook fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    seed=seed
  )

  trainer = Trainer(
    # model_init (rather than a model instance) so each Trainer starts from a
    # freshly loaded checkpoint.
    model_init=model_init,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=TOKENIZER,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
  )

  return trainer

In [None]:
def integer_predictions(predictions, thresholds):
  """Turn raw logits into 0/1 integer predictions.

  Args:
    predictions: logits, anything ``torch.Tensor(...)`` accepts.
    thresholds: probability cut-off(s); a scalar or a sequence that broadcasts
      against the logits (e.g. one value per label).

  Returns:
    Integer numpy array with the same shape as ``predictions``: 1 where the
    sigmoid probability is >= the threshold, 0 elsewhere.
  """
  probabilities = torch.sigmoid(torch.Tensor(predictions))
  cutoffs = torch.tensor(thresholds)

  is_positive = (probabilities >= cutoffs).numpy()
  return is_positive.astype("int")

## Premise only

In [None]:
n_retry = 5

# The test split is the same for every restart, so prepare it once.
X_test = encoded_premise["test"].remove_columns("labels")
y_test = np.array(encoded_premise["test"]["labels"], dtype="int")

scores = np.zeros(n_retry)
for i in range(n_retry):
  # Fixed, distinct seed per restart: the runs still differ from each other,
  # but a Restart & Run All reproduces them (wall-clock seeds did not).
  seed = 42 + i

  premise_only_trainer = get_trainer("premise_only", encoded_premise, seed=seed)
  premise_only_trainer.train()

  # NOTE(review): multi_label_metrics reports the best F1 over its threshold
  # grid, here computed directly on the test labels.
  scores[i] = multi_label_metrics(
    premise_only_trainer.predict(X_test).predictions,
    y_test
  )["f1_macro"]

# Mean and standard deviation of test macro-F1 across restarts.
scores.mean(), scores.std()

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.372512,0.320077
2,0.438000,0.336561,0.407201
2,0.303700,0.338172,0.435564
4,0.303700,0.345915,0.443529
4,0.228600,0.37648,0.437416
6,0.169500,0.400369,0.441353
6,0.169500,0.428581,0.440243




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.378216,0.311484
2,0.443200,0.339399,0.417221
2,0.307500,0.336946,0.43453
4,0.307500,0.352631,0.43762
4,0.229600,0.371605,0.442127
6,0.169800,0.401422,0.443534
6,0.169800,0.428708,0.438774
8,0.117200,0.446763,0.436496
8,0.088900,0.46738,0.435731




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.376352,0.294913
2,0.440600,0.347415,0.381472
2,0.309400,0.339105,0.431482
4,0.309400,0.351017,0.438359
4,0.235500,0.369878,0.442037
6,0.177700,0.400157,0.44066
6,0.177700,0.426985,0.436074
8,0.124800,0.448098,0.43654




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.378779,0.298681
2,0.441600,0.347928,0.391232
2,0.309300,0.338143,0.42953
4,0.309300,0.349229,0.438204
4,0.235100,0.376953,0.438669
6,0.176600,0.401344,0.436023
6,0.176600,0.42715,0.43385
8,0.123400,0.445544,0.436277




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.372742,0.306183
2,0.437100,0.339908,0.411318
2,0.308000,0.339951,0.431193
4,0.308000,0.349141,0.443738
4,0.234400,0.368098,0.446054
6,0.175600,0.396172,0.440941
6,0.175600,0.423966,0.439829
8,0.124700,0.440551,0.443831


(0.42679440107446087, 0.0042260050951904386)

## Premise+Stance+Conclusion

In [None]:
n_retry = 5

# The test split is the same for every restart, so prepare it once.
X_test = encoded_all["test"].remove_columns("labels")
y_test = np.array(encoded_all["test"]["labels"], dtype="int")

scores = np.zeros(n_retry)
for i in range(n_retry):
  # Fixed, distinct seed per restart: the runs still differ from each other,
  # but a Restart & Run All reproduces them (wall-clock seeds did not).
  seed = 42 + i

  # Named for what it trains on; this trainer was previously called
  # `premise_only_trainer` although it uses premise + stance + conclusion.
  all_fields_trainer = get_trainer("premise_stance_conclusion", encoded_all, seed=seed)
  all_fields_trainer.train()

  # NOTE(review): multi_label_metrics reports the best F1 over its threshold
  # grid, here computed directly on the test labels.
  scores[i] = multi_label_metrics(
    all_fields_trainer.predict(X_test).predictions,
    y_test
  )["f1_macro"]

# Mean and standard deviation of test macro-F1 across restarts.
scores.mean(), scores.std()



Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.383134,0.284149
2,0.441500,0.348318,0.381352
2,0.309800,0.337394,0.44058
4,0.309800,0.348696,0.43224
4,0.234000,0.377988,0.432859
6,0.176300,0.397304,0.441471
6,0.176300,0.42498,0.439687
8,0.124100,0.445428,0.443752
8,0.094200,0.464565,0.437897
9,0.094200,0.467663,0.43964




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.381347,0.302372
2,0.441600,0.338317,0.415795
2,0.305700,0.335122,0.412397
4,0.305700,0.34521,0.450194
4,0.229600,0.37095,0.440362
6,0.172700,0.399579,0.437579
6,0.172700,0.423809,0.445015




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.382917,0.296504
2,0.438300,0.34261,0.382299
2,0.309000,0.336181,0.421429
4,0.309000,0.350188,0.440586
4,0.236700,0.374706,0.430582
6,0.179700,0.40047,0.429451
6,0.179700,0.420999,0.43534




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.383282,0.292994
2,0.445100,0.340368,0.395809
2,0.306300,0.337721,0.411773
4,0.306300,0.35391,0.432029
4,0.228500,0.375491,0.436356
6,0.169700,0.405206,0.426552
6,0.169700,0.430755,0.424076
8,0.119500,0.454174,0.424949




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.380623,0.293756
2,0.442700,0.344674,0.393054
2,0.309900,0.342415,0.424634
4,0.309900,0.355347,0.425351
4,0.234800,0.375116,0.440299
6,0.175600,0.408071,0.439166
6,0.175600,0.43051,0.433703
8,0.124400,0.452459,0.438244


(0.4285676546042173, 0.005906577105715771)