<a href="https://colab.research.google.com/github/antoniobelotti/HVD/blob/main/3_augmented_vs_original_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup env

In [None]:
%%capture

!python -m pip install --upgrade
!pip install --no-cache-dir transformers sentencepiece datasets
!pip install accelerate -U

import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas() # to use progress_apply
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import (
  AutoModelForSequenceClassification,
  AutoModel,
  AutoTokenizer,
  TrainingArguments,
  Trainer,
  EarlyStoppingCallback,
  IntervalStrategy,
  EvalPrediction,
  AdamW,
  get_linear_schedule_with_warmup
)
from datasets import (
  load_from_disk,
  DatasetDict,
  concatenate_datasets
)

import torch

from sklearn.model_selection import (
  StratifiedKFold,
  cross_validate,
  train_test_split
)
from sklearn.metrics import (
  make_scorer,
  accuracy_score,
  precision_score,
  recall_score,
  f1_score,
  confusion_matrix,
  classification_report
)

%matplotlib inline

In [None]:
# Storage switch: True mounts Google Drive and uses the project folder stored
# there; False uses /content/human_value_detection on the Colab machine instead.
USE_DRIVE = True

In [None]:
import pathlib

# Resolve the project root: either a folder on the Colab machine (datasets must
# be uploaded by hand) or the project folder inside a mounted Google Drive.
if not USE_DRIVE:
  BASE_PATH = pathlib.Path("/content/human_value_detection")
  print("Manually upload the encoded datasets produced in notebook 1")
else:
  from google.colab import drive
  drive.mount('/content/gdrive')
  BASE_PATH = pathlib.Path("/content/gdrive/MyDrive/human_value_detection")

# Encoded datasets are read from the "data" folder under the project root.
DATA_PATH = BASE_PATH.joinpath("data")

Mounted at /content/gdrive


In [None]:
# Checkpoint name used for the tokenizer, the classifier and the dataset folder.
MODEL_NAME = "distilbert-base-uncased"
TOKENIZER = AutoTokenizer.from_pretrained(
  MODEL_NAME,
  use_fast=False,
)
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
# One entry per output label, in label-index order.
LABEL_NAMES = [
  "Self-direction: thought",
  "Self-direction: action",
  "Stimulation",
  "Hedonism",
  "Achievement",
  "Power: dominance",
  "Power: resources",
  "Face",
  "Security: personal",
  "Security: societal",
  "Tradition",
  "Conformity: rules",
  "Conformity: interpersonal",
  "Humility",
  "Benevolence: caring",
  "Benevolence: dependability",
  "Universalism: concern",
  "Universalism: nature",
  "Universalism: tolerance",
  "Universalism: objectivity",
]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

# Classifiers

In [None]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels):
  """Return the best macro-F1 obtained over a sweep of decision thresholds.

  The raw scores are passed through a sigmoid and binarised at every cut-off
  in np.arange(0.1, 1, 0.05). Macro-averaged F1 against `labels` is computed
  for each cut-off and only the highest value is reported.

  Args:
    predictions: raw model outputs, expected as (batch_size, num_labels).
    labels: ground-truth label indicators scored against the binarised
      predictions.

  Returns:
    dict with the single key "f1_macro" holding the best score of the sweep.
  """
  probabilities = torch.sigmoid(torch.Tensor(predictions))

  def macro_f1_at(cutoff):
    # 1.0 where the probability reaches the cut-off, 0.0 elsewhere
    binarised = (probabilities >= cutoff).numpy().astype(float)
    return f1_score(labels, binarised, average="macro")

  best_score = max(macro_f1_at(cutoff) for cutoff in np.arange(0.1, 1, 0.05))
  return {"f1_macro": best_score}

def compute_metrics(p: EvalPrediction):
  """Adapt an EvalPrediction to `multi_label_metrics`.

  Args:
    p: evaluation output; when `p.predictions` is a tuple only its first
      element is scored.

  Returns:
    The metrics dict produced by `multi_label_metrics`.
  """
  logits = p.predictions
  if isinstance(logits, tuple):
    logits = logits[0]
  return multi_label_metrics(predictions=logits, labels=p.label_ids)

def model_init():
  """Return a freshly loaded multi-label sequence classifier.

  Loads the MODEL_NAME checkpoint with one output per entry of LABEL_NAMES, so
  the size of the classification head follows the label list instead of a
  separately maintained number. The label names are also stored in the model
  config (id2label / label2id).

  Returns:
    The model built by AutoModelForSequenceClassification.from_pretrained.
  """
  id2label = dict(enumerate(LABEL_NAMES))
  return AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=len(LABEL_NAMES),
    id2label=id2label,
    label2id={label: idx for idx, label in id2label.items()},
  )

def get_trainer(name, ds, seed=42):
  """Assemble a Trainer for one fine-tuning run.

  Evaluation and checkpointing happen once per epoch, the checkpoint with the
  best "f1_macro" is reloaded at the end, and an early-stopping callback with
  a patience of 3 is attached.

  Args:
    name: passed to TrainingArguments as its first positional argument.
    ds: mapping providing the "train" and "validation" splits.
    seed: seed forwarded to TrainingArguments.

  Returns:
    The configured Trainer; training is not started here.
  """
  hyperparameters = {
    "evaluation_strategy": IntervalStrategy.EPOCH,
    "save_strategy": IntervalStrategy.EPOCH,
    "save_total_limit": 1,
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 2,
    "num_train_epochs": 10,
    "weight_decay": 0.01,
    "warmup_steps": 500,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "fp16": True,
    "push_to_hub": False,
    "seed": seed,
  }
  early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

  return Trainer(
    model_init=model_init,
    args=TrainingArguments(name, **hyperparameters),
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=TOKENIZER,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
  )

In [None]:
# Train on each dataset separately, n_retry times, and keep the mean
# test-set macro-F1 per dataset in `reports`.
n_retry = 5
base_seed = 42
reports = {}

for ds_name in ["original", "summarized", "masked", "backtranslated"]:
  dataset = load_from_disk(DATA_PATH / MODEL_NAME / ds_name)

  # The test split is the same for every retry, so prepare it once per dataset.
  X_test = dataset["test"].remove_columns("labels")
  y_test = np.array(dataset["test"]["labels"], dtype="int")

  scores = np.zeros(n_retry)
  for i in range(n_retry):
    # Fixed, distinct seed per retry so the averaged score can be reproduced;
    # a wall-clock seed gives different numbers on every execution.
    seed = base_seed + i

    trainer = get_trainer(ds_name, dataset, seed=seed)

    trainer.train()

    # NOTE(review): multi_label_metrics keeps the best F1 of its threshold
    # sweep, and here the sweep is scored against the test labels, so this
    # number is an optimistic estimate.
    scores[i] = multi_label_metrics(
      trainer.predict(X_test).predictions,
      y_test
    )["f1_macro"]

  reports[ds_name] = scores.mean()



Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.383802,0.295328
2,0.441100,0.349216,0.408594
2,0.313900,0.341492,0.411459
4,0.313900,0.355742,0.427921
4,0.238000,0.38632,0.423343
6,0.176000,0.41546,0.431947
6,0.176000,0.44938,0.423053
8,0.122900,0.469632,0.430176
8,0.092500,0.484553,0.428276




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.380752,0.299585
2,0.438800,0.344567,0.412963
2,0.312500,0.340822,0.423341
4,0.312500,0.363554,0.422111
4,0.237900,0.381055,0.430369
6,0.178900,0.413175,0.422849
6,0.178900,0.437915,0.420249
8,0.124600,0.462586,0.423538




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.380375,0.297318
2,0.438100,0.342,0.416185
2,0.307900,0.349234,0.419679
4,0.307900,0.358555,0.430876
4,0.235100,0.381938,0.424664
6,0.175100,0.414116,0.418206
6,0.175100,0.442546,0.42093




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.386443,0.291739
2,0.441100,0.346139,0.386201
2,0.312200,0.344689,0.398895
4,0.312200,0.352596,0.424577
4,0.237900,0.374085,0.437825
6,0.177500,0.405845,0.423504
6,0.177500,0.430136,0.432385
8,0.123100,0.454485,0.430169




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.381083,0.302293
2,0.437500,0.3432,0.381353
2,0.314100,0.340486,0.422086
4,0.314100,0.356165,0.428282
4,0.237700,0.379116,0.426598
6,0.176600,0.417245,0.417488
6,0.176600,0.435547,0.420473




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.443061,0.276646
2,0.441800,0.358745,0.346214
2,0.322600,0.342151,0.389184
4,0.322600,0.346656,0.41119
4,0.246800,0.359257,0.411706
6,0.183500,0.375113,0.419605
6,0.183500,0.39819,0.412584
8,0.127800,0.417768,0.414336
8,0.095400,0.43718,0.406099




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.438034,0.273928
2,0.440200,0.371866,0.354103
2,0.322700,0.349748,0.388344
4,0.322700,0.343505,0.41481
4,0.241600,0.359398,0.416322
6,0.179500,0.376735,0.420026
6,0.179500,0.401781,0.415896
8,0.122900,0.419205,0.415437
8,0.092500,0.435659,0.41564




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.416029,0.286338
2,0.451100,0.361679,0.363311
2,0.325500,0.356686,0.396434
4,0.325500,0.346585,0.405145
4,0.248800,0.352816,0.420285
6,0.186800,0.37308,0.412712
6,0.186800,0.40271,0.409679
8,0.129500,0.416644,0.406436




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.422899,0.291484
2,0.444200,0.356892,0.359368
2,0.323100,0.341115,0.399676
4,0.323100,0.339441,0.420423
4,0.244300,0.34404,0.430677
6,0.182800,0.36673,0.427032
6,0.182800,0.392719,0.424762
8,0.124900,0.414883,0.423161




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.420899,0.284922
2,0.444600,0.363259,0.36758
2,0.322500,0.344151,0.393383
4,0.322500,0.341271,0.397429
4,0.243100,0.343045,0.425455
6,0.178300,0.357922,0.426759
6,0.178300,0.380517,0.412379
8,0.123200,0.399414,0.415599
8,0.091800,0.410743,0.417969




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.373322,0.31263
2,0.438000,0.340603,0.395562
2,0.306900,0.334954,0.437132
4,0.306900,0.351127,0.443695
4,0.228300,0.37681,0.43785
6,0.172000,0.403154,0.437382
6,0.172000,0.434188,0.433181




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.376118,0.289575
2,0.439800,0.338237,0.408239
2,0.306200,0.336266,0.437358
4,0.306200,0.360433,0.422344
4,0.231000,0.379436,0.441764
6,0.172200,0.411454,0.43884
6,0.172200,0.43981,0.436372
8,0.121100,0.458589,0.437025




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.377587,0.285477
2,0.439400,0.343556,0.400854
2,0.308700,0.342232,0.414658
4,0.308700,0.356749,0.431125
4,0.230900,0.375304,0.43601
6,0.171100,0.405611,0.438677
6,0.171100,0.43623,0.432256
8,0.117400,0.452145,0.435094
8,0.090500,0.468253,0.432754




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.37267,0.320631
2,0.441200,0.342319,0.399456
2,0.307100,0.336966,0.443717
4,0.307100,0.348283,0.436341
4,0.231900,0.369378,0.44119
6,0.172500,0.399953,0.438475




Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,0.379751,0.302931
2,0.439000,0.341307,0.400201
2,0.310400,0.33887,0.427006
4,0.310400,0.349386,0.434378
4,0.234700,0.375114,0.443857
6,0.174300,0.400703,0.438038
6,0.174300,0.431134,0.437231
8,0.122400,0.45381,0.439512


In [None]:
# Mean test-set macro-F1 per dataset, averaged over the n_retry training runs.
reports

{'summarized': 0.40605220815372894,
 'masked': 0.3959099820166097,
 'backtranslated': 0.41934826711927614}

# All datasets

In [None]:
import time

In [None]:
# Train on the original data extended with augmented data, n_retry times per
# step, and keep the mean test-set macro-F1 in `reports`.
n_retry = 5
base_seed = 42
reports = {}

dataset = load_from_disk(DATA_PATH / MODEL_NAME / "original")

# Only the train split is modified below, so the test split is prepared once.
X_test = dataset["test"].remove_columns("labels")
y_test = np.array(dataset["test"]["labels"], dtype="int")

for augmented_ds_name in ["summarized", "masked", "backtranslated"]:
  new_ds = load_from_disk(DATA_PATH / MODEL_NAME / augmented_ds_name)
  # NOTE(review): the train split grows cumulatively, so the run reported as
  # "original-masked" also contains the summarized data, and the last run
  # contains all three augmentations. Confirm this is intended.
  dataset["train"] = concatenate_datasets([dataset["train"], new_ds["train"]])

  run_name = "original-" + augmented_ds_name

  scores = np.zeros(n_retry)
  for i in range(n_retry):
    # Fixed, distinct seed per retry so the averaged score can be reproduced;
    # a wall-clock seed gives different numbers on every execution.
    seed = base_seed + i

    trainer = get_trainer(run_name, dataset, seed=seed)

    trainer.train()

    # NOTE(review): multi_label_metrics keeps the best F1 of its threshold
    # sweep, and here the sweep is scored against the test labels, so this
    # number is an optimistic estimate.
    scores[i] = multi_label_metrics(
      trainer.predict(X_test).predictions,
      y_test
    )["f1_macro"]

  reports[run_name] = scores.mean()

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,F1 Macro
0,0.4348,0.346433,0.377523
2,0.3074,0.345617,0.437352
2,0.1923,0.399004,0.435172
4,0.1336,0.476115,0.425763
4,0.0975,0.542776,0.428585




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.4414,0.342338,0.401748
2,0.3059,0.357122,0.433282
2,0.1927,0.396788,0.439708
4,0.1336,0.475365,0.427451
4,0.0963,0.538812,0.437534
6,0.051,0.595165,0.440375
6,0.0389,0.642109,0.430707
8,0.0311,0.669001,0.438215
8,0.0235,0.691702,0.433118




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.4453,0.343701,0.3802
2,0.303,0.342787,0.445801
2,0.1924,0.397946,0.442397
4,0.1308,0.467299,0.436261
4,0.0965,0.536221,0.441386




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.4411,0.342281,0.397305
2,0.3082,0.343169,0.445864
2,0.1922,0.401515,0.436268
4,0.1328,0.461019,0.441441
4,0.0964,0.533712,0.442884




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.4446,0.342331,0.402899
2,0.3034,0.344681,0.439602
2,0.1906,0.394869,0.443942
4,0.1305,0.459708,0.440729
4,0.0951,0.527642,0.43909
6,0.05,0.588356,0.438912




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3197,0.341291,0.43092
2,0.2245,0.382488,0.435482
2,0.1315,0.478888,0.437105
4,0.0704,0.579853,0.433635
4,0.0413,0.677346,0.421828
6,0.0268,0.737825,0.425812




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.318,0.341742,0.419656
2,0.229,0.371195,0.439231
2,0.1339,0.462139,0.440257
4,0.0722,0.569718,0.434385
4,0.042,0.650111,0.438184
6,0.0267,0.719999,0.437133




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3205,0.338837,0.420132
2,0.2259,0.373065,0.432662
2,0.1313,0.480712,0.425478
4,0.0713,0.584853,0.426006
4,0.0408,0.678013,0.413822




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3199,0.339446,0.427838
2,0.2255,0.376326,0.43308
2,0.1332,0.479352,0.429094
4,0.0719,0.578085,0.429568
4,0.0424,0.669371,0.431208




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3215,0.340344,0.418814
2,0.2293,0.376824,0.439941
2,0.1308,0.47326,0.438027
4,0.0721,0.575013,0.436806
4,0.0415,0.66187,0.426017




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3168,0.339172,0.44171
2,0.185,0.412385,0.445968
2,0.0884,0.544328,0.440784
4,0.0498,0.666656,0.442491
4,0.03,0.746053,0.432971




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3173,0.342718,0.427037
2,0.1847,0.419611,0.432416
2,0.0849,0.559244,0.430487
4,0.0489,0.681703,0.421229
4,0.0277,0.764938,0.424805




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3202,0.344961,0.430604
2,0.188,0.422956,0.42931
2,0.0872,0.550503,0.429501
4,0.0492,0.675063,0.42919




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3155,0.344691,0.424195
2,0.1848,0.419385,0.436658
2,0.0866,0.551387,0.437059
4,0.0499,0.67047,0.435948
4,0.0289,0.756898,0.432959
6,0.0192,0.820029,0.432194




Epoch,Training Loss,Validation Loss,F1 Macro
0,0.3189,0.340209,0.435177
2,0.1854,0.427473,0.432801
2,0.0879,0.557355,0.432919
4,0.0494,0.669812,0.437961
4,0.0286,0.737075,0.441044
6,0.0199,0.817395,0.43728
6,0.0149,0.860699,0.435561
8,0.0117,0.892253,0.434898


In [None]:
# Mean test-set macro-F1 for each "original-<augmentation>" run, averaged over
# the n_retry training runs.
reports

{'original-summarized': 0.42100764828944726,
 'original-masked': 0.4125954491651499,
 'original-backtranslated': 0.42191733925661473}