# Train a Language Model to Detect Human Values in Arguments

In [1]:
!pip install transformers torchmetrics



In [2]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn

from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import pickle


RANDOM_SEED = 99
torch.manual_seed(99)

<torch._C.Generator at 0x7f1f346b8110>

In [3]:
from model.BertDataModule import BertDataModule, BertDataset
from model.BertFineTuner import BertFineTuner, train

### Define Parameters

In [4]:
PARAMS = {
    # Language Model and Hyperparameters
    "MODEL_PATH": 'roberta-base',
    "BATCH_SIZE": 16,
    "ACCUMULATE_GRAD_BATCHES": 1,
    "LR": 1e-5,
    "EPOCHS": 20,
    "OPTIMIZER": 'AdamW',
    "DEVICE": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    "NUM_TRAIN_WORKERS": 4,
    "NUM_VAL_WORKERS": 4,
    "MAX_TOKEN_COUNT":128,
    "RANDOM_SEED": RANDOM_SEED,


    # Early Stopping Params
    "PATIENCE": 3,
    "VAL_CHECK_INTERVAL": 300,

    # The metric we optimize for. Alternative "custom_f1/Val" and "max"
    "MAX_THRESHOLD_METRIC": "custom", #The f1-score that should maximized (custom = formula for the task evaluation)
    "EARLY_STOPPING_METRIC": "avg_val_loss",
    "EARLY_STOPPING_MODE": "min",

    # DATA
    "VALIDATION_SET_SIZE":500,

    "TRAIN_PATH" : "./data/data_training_full.csv", #
    "LEAVE_OUT_DATA_PATH": "./data/leave_out_dataset_300.csv",
    "SAVE_PATH": "./model/best_model.pt"

}


## Data Loading


In [5]:
train_df = pd.read_csv(PARAMS["TRAIN_PATH"], index_col=0)
LABEL_COLUMNS = train_df.columns.tolist()[6:]

leave_out_df = pd.read_csv(PARAMS["LEAVE_OUT_DATA_PATH"], index_col=0)

## Model Training

### Linear Learning Rate Schedule

In [6]:
steps_per_epoch=len(train_df) // PARAMS['BATCH_SIZE']
total_training_steps = steps_per_epoch * PARAMS['EPOCHS']
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(1716, 8580)

### Prepare Data Modules for the Training

In [7]:
train_df, val_df = train_test_split(train_df, test_size=PARAMS["VALIDATION_SET_SIZE"], random_state=PARAMS["RANDOM_SEED"])
TOKENIZER = AutoTokenizer.from_pretrained(PARAMS["MODEL_PATH"])

data_module = BertDataModule(
    train_df=train_df,
    val_df=val_df,
    tokenizer=TOKENIZER,
    params=PARAMS,
    label_columns=LABEL_COLUMNS
)

data_module.setup()
train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()



In [8]:
print(LABEL_COLUMNS)
print(len(LABEL_COLUMNS))

['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']
20


In [9]:
# Print some examples from the train loader
print("Train Loader Examples:")
for batch in train_loader:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    # Print the input examples
    for i in range(len(input_ids)):
        print("Example", i+1)
        print("Input IDs:", input_ids[i])
        print("Attention Mask:", attention_mask[i])
        print("Labels:", labels[i])
        print()

    # Stop after printing a few examples
    break

# Print some examples from the val loader
print("Val Loader Examples:")
for batch in val_loader:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    # Print the input examples
    for i in range(len(input_ids)):
        print("Example", i+1)
        print("Input IDs:", input_ids[i])
        print("Attention Mask:", attention_mask[i])
        print("Labels:", labels[i])
        print()

    # Stop after

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
           28, 14267,    11,  4402,     9,   166,   197,  1032,    13,     5,
        34117,     9,  1748,  2398,     2,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1])
Attention Mask: tensor([1, 1, 1, 1, 

### Calculate weights to address imbalance problem

In [10]:
from sklearn.utils import class_weight

def calculate_class_weights(no_of_classes, samples_per_cls, power=1):
    weights_for_samples = 1.0/np.array(np.power(samples_per_cls,power))
    weights_for_samples = weights_for_samples / np.sum(weights_for_samples) * no_of_classes
    return weights_for_samples

temp_df = pd.concat([train_df, val_df], ignore_index=True)
class_labels = temp_df[LABEL_COLUMNS]
num_ones = class_labels.eq(1).sum()
class_weights = calculate_class_weights(20,num_ones)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print(class_weights)

[0.60873996 0.4023066  1.9535921  2.66267368 0.36530584 0.99989137
 0.98753007 1.51034011 0.27439767 0.34780933 1.04040795 0.47705501
 2.80828865 1.44072524 0.38486183 0.72253457 0.275449   1.34127219
 0.85382648 0.54299237]


### Create Model

In [11]:
model = BertFineTuner(params=PARAMS, label_columns=LABEL_COLUMNS, n_training_steps=total_training_steps, n_warmup_steps=warmup_steps)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
trained_weights = torch.load(PARAMS['SAVE_PATH'])
model.load_state_dict(trained_weights)

train(
    model = model,
    train_loader = train_loader,
    val_loader = val_loader,
    num_epochs = PARAMS['EPOCHS'],
    learning_rate = PARAMS['LR'],
    n_warmup_steps = warmup_steps,
    n_training_steps = total_training_steps,
    save_path = PARAMS['SAVE_PATH'],
    class_weights= class_weights_tensor.to(PARAMS['DEVICE'])
)

  predictions = torch.tensor(predictions).clone().detach().requires_grad_(True)
  labels = torch.tensor(labels).clone().detach().requires_grad_(True)


Epoch 1/20 - Train Loss: 0.2379 - Val Loss: 0.2643
Per Class Accuracies: [0.8059999942779541, 0.722000002861023, 0.9340000152587891, 0.972000002861023, 0.6439999938011169, 0.8579999804496765, 0.8619999885559082, 0.8759999871253967, 0.4399999976158142, 0.5479999780654907, 0.9179999828338623, 0.6859999895095825, 0.9660000205039978, 0.9419999718666077, 0.5, 0.7559999823570251, 0.4339999854564667, 0.9559999704360962, 0.8560000061988831, 0.671999990940094]
F1 Score: 0.4711 - Precision: 0.3834 - Recall: 0.6108
Saved the best model!




Epoch 2/20 - Train Loss: 0.2327 - Val Loss: 0.2672
Per Class Accuracies: [0.8240000009536743, 0.7120000123977661, 0.9319999814033508, 0.9660000205039978, 0.6679999828338623, 0.8920000195503235, 0.871999979019165, 0.8859999775886536, 0.46000000834465027, 0.5659999847412109, 0.8920000195503235, 0.6740000247955322, 0.9660000205039978, 0.9340000152587891, 0.5180000066757202, 0.7760000228881836, 0.4399999976158142, 0.9459999799728394, 0.843999981880188, 0.6639999747276306]
F1 Score: 0.4590 - Precision: 0.3836 - Recall: 0.5713




Epoch 3/20 - Train Loss: 0.2321 - Val Loss: 0.2667
Per Class Accuracies: [0.7739999890327454, 0.7120000123977661, 0.9279999732971191, 0.9639999866485596, 0.6359999775886536, 0.8759999871253967, 0.8579999804496765, 0.878000020980835, 0.44600000977516174, 0.550000011920929, 0.9100000262260437, 0.6800000071525574, 0.9639999866485596, 0.9419999718666077, 0.47600001096725464, 0.765999972820282, 0.4359999895095825, 0.949999988079071, 0.8500000238418579, 0.6140000224113464]
F1 Score: 0.4662 - Precision: 0.3748 - Recall: 0.6167




Epoch 4/20 - Train Loss: 0.2404 - Val Loss: 0.2659
Per Class Accuracies: [0.8180000185966492, 0.7059999704360962, 0.9340000152587891, 0.972000002861023, 0.6439999938011169, 0.8740000128746033, 0.8640000224113464, 0.871999979019165, 0.4560000002384186, 0.5619999766349792, 0.8980000019073486, 0.671999990940094, 0.9639999866485596, 0.9440000057220459, 0.49799999594688416, 0.7639999985694885, 0.44999998807907104, 0.9419999718666077, 0.8379999995231628, 0.6740000247955322]
F1 Score: 0.4729 - Precision: 0.3840 - Recall: 0.6156




Epoch 5/20 - Train Loss: 0.2348 - Val Loss: 0.2703
Per Class Accuracies: [0.8040000200271606, 0.7179999947547913, 0.9340000152587891, 0.9599999785423279, 0.6819999814033508, 0.878000020980835, 0.8740000128746033, 0.878000020980835, 0.4519999921321869, 0.5820000171661377, 0.9079999923706055, 0.6880000233650208, 0.9620000123977661, 0.9440000057220459, 0.5239999890327454, 0.777999997138977, 0.46000000834465027, 0.9520000219345093, 0.8500000238418579, 0.6859999895095825]
F1 Score: 0.4658 - Precision: 0.3907 - Recall: 0.5767




Epoch 6/20 - Train Loss: 0.2299 - Val Loss: 0.2685
Per Class Accuracies: [0.7739999890327454, 0.7080000042915344, 0.9300000071525574, 0.9620000123977661, 0.6880000233650208, 0.871999979019165, 0.8640000224113464, 0.8700000047683716, 0.4560000002384186, 0.5820000171661377, 0.9020000100135803, 0.6660000085830688, 0.9620000123977661, 0.9419999718666077, 0.49399998784065247, 0.7739999890327454, 0.43799999356269836, 0.9539999961853027, 0.8420000076293945, 0.6639999747276306]
F1 Score: 0.4723 - Precision: 0.3837 - Recall: 0.6144




Epoch 7/20 - Train Loss: 0.2241 - Val Loss: 0.2680
Per Class Accuracies: [0.7799999713897705, 0.7139999866485596, 0.9319999814033508, 0.9599999785423279, 0.656000018119812, 0.8700000047683716, 0.8679999709129333, 0.8640000224113464, 0.44999998807907104, 0.5580000281333923, 0.9100000262260437, 0.6840000152587891, 0.9620000123977661, 0.9419999718666077, 0.5120000243186951, 0.7559999823570251, 0.47999998927116394, 0.9559999704360962, 0.8460000157356262, 0.6660000085830688]
F1 Score: 0.4806 - Precision: 0.3877 - Recall: 0.6321




Epoch 8/20 - Train Loss: 0.2190 - Val Loss: 0.2687
Per Class Accuracies: [0.8159999847412109, 0.7459999918937683, 0.9380000233650208, 0.9700000286102295, 0.6740000247955322, 0.8600000143051147, 0.8799999952316284, 0.8679999709129333, 0.46399998664855957, 0.6200000047683716, 0.9259999990463257, 0.7059999704360962, 0.9639999866485596, 0.9480000138282776, 0.46399998664855957, 0.7580000162124634, 0.46799999475479126, 0.9520000219345093, 0.8659999966621399, 0.6859999895095825]
F1 Score: 0.4777 - Precision: 0.3983 - Recall: 0.5967




Epoch 9/20 - Train Loss: 0.2160 - Val Loss: 0.2679
Per Class Accuracies: [0.8220000267028809, 0.7020000219345093, 0.9359999895095825, 0.9599999785423279, 0.6359999775886536, 0.8659999966621399, 0.8619999885559082, 0.8700000047683716, 0.4339999854564667, 0.5839999914169312, 0.9139999747276306, 0.6899999976158142, 0.9620000123977661, 0.9459999799728394, 0.5040000081062317, 0.7739999890327454, 0.49000000953674316, 0.9539999961853027, 0.8619999885559082, 0.6420000195503235]
F1 Score: 0.4837 - Precision: 0.3911 - Recall: 0.6338




Epoch 10/20 - Train Loss: 0.2118 - Val Loss: 0.2690
Per Class Accuracies: [0.8019999861717224, 0.7260000109672546, 0.9380000233650208, 0.9580000042915344, 0.656000018119812, 0.8220000267028809, 0.8759999871253967, 0.8560000061988831, 0.46000000834465027, 0.5979999899864197, 0.9079999923706055, 0.7139999866485596, 0.9620000123977661, 0.9440000057220459, 0.45399999618530273, 0.7360000014305115, 0.5059999823570251, 0.949999988079071, 0.8360000252723694, 0.6520000100135803]
F1 Score: 0.4821 - Precision: 0.3876 - Recall: 0.6374




Epoch 11/20 - Train Loss: 0.2076 - Val Loss: 0.2707
Per Class Accuracies: [0.8180000185966492, 0.7360000014305115, 0.9300000071525574, 0.9580000042915344, 0.656000018119812, 0.8560000061988831, 0.8600000143051147, 0.8619999885559082, 0.44999998807907104, 0.5960000157356262, 0.9039999842643738, 0.7120000123977661, 0.9620000123977661, 0.949999988079071, 0.4620000123977661, 0.7400000095367432, 0.49799999594688416, 0.9539999961853027, 0.8420000076293945, 0.6520000100135803]
F1 Score: 0.4807 - Precision: 0.3894 - Recall: 0.6279




Epoch 12/20 - Train Loss: 0.2034 - Val Loss: 0.2727
Per Class Accuracies: [0.8180000185966492, 0.7319999933242798, 0.9300000071525574, 0.9580000042915344, 0.6539999842643738, 0.8519999980926514, 0.871999979019165, 0.8700000047683716, 0.46000000834465027, 0.6140000224113464, 0.906000018119812, 0.7179999947547913, 0.9639999866485596, 0.9380000233650208, 0.5339999794960022, 0.7580000162124634, 0.5239999890327454, 0.9580000042915344, 0.843999981880188, 0.6639999747276306]
F1 Score: 0.4834 - Precision: 0.3998 - Recall: 0.6114




Epoch 13/20 - Train Loss: 0.2020 - Val Loss: 0.2707
Per Class Accuracies: [0.8240000009536743, 0.7260000109672546, 0.9380000233650208, 0.9620000123977661, 0.6380000114440918, 0.8460000157356262, 0.8759999871253967, 0.8659999966621399, 0.45399999618530273, 0.621999979019165, 0.9240000247955322, 0.7039999961853027, 0.9639999866485596, 0.949999988079071, 0.5180000066757202, 0.7599999904632568, 0.527999997138977, 0.9599999785423279, 0.8500000238418579, 0.6700000166893005]
F1 Score: 0.4868 - Precision: 0.4015 - Recall: 0.6179




Epoch 14/20 - Train Loss: 0.1996 - Val Loss: 0.2769
Per Class Accuracies: [0.828000009059906, 0.7379999756813049, 0.9359999895095825, 0.9580000042915344, 0.6919999718666077, 0.8740000128746033, 0.8820000290870667, 0.8759999871253967, 0.4480000138282776, 0.6399999856948853, 0.9259999990463257, 0.6919999718666077, 0.9639999866485596, 0.9459999799728394, 0.5580000281333923, 0.7720000147819519, 0.5419999957084656, 0.9580000042915344, 0.8539999723434448, 0.7080000042915344]
F1 Score: 0.4878 - Precision: 0.4154 - Recall: 0.5908




Epoch 15/20 - Train Loss: 0.1960 - Val Loss: 0.2781
Per Class Accuracies: [0.8339999914169312, 0.7400000095367432, 0.9399999976158142, 0.9620000123977661, 0.6880000233650208, 0.8740000128746033, 0.8840000033378601, 0.878000020980835, 0.4699999988079071, 0.6460000276565552, 0.9120000004768372, 0.7260000109672546, 0.9639999866485596, 0.9459999799728394, 0.5419999957084656, 0.7459999918937683, 0.5360000133514404, 0.9559999704360962, 0.8600000143051147, 0.7020000219345093]
F1 Score: 0.4921 - Precision: 0.4176 - Recall: 0.5991




Epoch 16/20 - Train Loss: 0.1949 - Val Loss: 0.2800
Per Class Accuracies: [0.8479999899864197, 0.7480000257492065, 0.9419999718666077, 0.9679999947547913, 0.6819999814033508, 0.843999981880188, 0.8840000033378601, 0.8859999775886536, 0.4620000123977661, 0.6420000195503235, 0.921999990940094, 0.7239999771118164, 0.9639999866485596, 0.949999988079071, 0.5299999713897705, 0.7319999933242798, 0.5740000009536743, 0.9520000219345093, 0.8759999871253967, 0.7160000205039978]
F1 Score: 0.4972 - Precision: 0.4218 - Recall: 0.6055




Epoch 17/20 - Train Loss: 0.1935 - Val Loss: 0.2773
Per Class Accuracies: [0.8379999995231628, 0.7300000190734863, 0.9380000233650208, 0.9660000205039978, 0.6740000247955322, 0.8519999980926514, 0.8840000033378601, 0.8799999952316284, 0.46799999475479126, 0.628000020980835, 0.9200000166893005, 0.7039999961853027, 0.9639999866485596, 0.9480000138282776, 0.5580000281333923, 0.7379999756813049, 0.5740000009536743, 0.9539999961853027, 0.8619999885559082, 0.6959999799728394]
F1 Score: 0.4962 - Precision: 0.4167 - Recall: 0.6132




Epoch 18/20 - Train Loss: 0.1918 - Val Loss: 0.2776
Per Class Accuracies: [0.8379999995231628, 0.7459999918937683, 0.9380000233650208, 0.9620000123977661, 0.6779999732971191, 0.8500000238418579, 0.8840000033378601, 0.8700000047683716, 0.45399999618530273, 0.6240000128746033, 0.9279999732971191, 0.7160000205039978, 0.9639999866485596, 0.9459999799728394, 0.5580000281333923, 0.7360000014305115, 0.5680000185966492, 0.9539999961853027, 0.8640000224113464, 0.6880000233650208]
F1 Score: 0.4937 - Precision: 0.4153 - Recall: 0.6085




Epoch 19/20 - Train Loss: 0.1908 - Val Loss: 0.2759
Per Class Accuracies: [0.8420000076293945, 0.7400000095367432, 0.9359999895095825, 0.9639999866485596, 0.6819999814033508, 0.8420000076293945, 0.8880000114440918, 0.8640000224113464, 0.4580000042915344, 0.6200000047683716, 0.9259999990463257, 0.7039999961853027, 0.9639999866485596, 0.9480000138282776, 0.550000011920929, 0.734000027179718, 0.5659999847412109, 0.9539999961853027, 0.8579999804496765, 0.6959999799728394]
F1 Score: 0.4921 - Precision: 0.4129 - Recall: 0.6091




Epoch 20/20 - Train Loss: 0.1901 - Val Loss: 0.2793
Per Class Accuracies: [0.8420000076293945, 0.7440000176429749, 0.9359999895095825, 0.9639999866485596, 0.6779999732971191, 0.8600000143051147, 0.8880000114440918, 0.8679999709129333, 0.4620000123977661, 0.6520000100135803, 0.9259999990463257, 0.7200000286102295, 0.9639999866485596, 0.9459999799728394, 0.5540000200271606, 0.7360000014305115, 0.5820000171661377, 0.9520000219345093, 0.8679999709129333, 0.6980000138282776]
F1 Score: 0.4937 - Precision: 0.4204 - Recall: 0.5979
Training complete!


Now we are done with the training. This process is repeated with several different configurations for the model. More information can be found in the system description paper.

# Evaluation

The predictions for the final submissions are done based on an ensemble.
Hence for ensembling, please continue with the ensemble_eval_and_predict.ipynb notebook.
However, for simplicity or if you are interested, you may want to continue here to evaluate the model performance.

1. We determine the decision threshold to decide when a certain label should be counted as 1, based on the val_data
2. We predict the test_data with it (if splitted above)

We load the model from the best_checkpoint in order to get the model that performed best with respect to the early stopping metric.

In [13]:
trained_weights = torch.load(PARAMS['SAVE_PATH'])
model.load_state_dict(trained_weights)

model.eval()
for param in model.parameters():
    param.requires_grad = False

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = model.to(device)

predictions = []
labels = []

with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels_tensor = batch["labels"].to(device)

        prediction = trained_model(input_ids, attention_mask)

        predictions.append(prediction)
        labels.append(labels_tensor)

predictions = torch.cat(predictions, dim=0).cpu()
labels = torch.cat(labels, dim=0).cpu()

  0%|          | 0/32 [00:00<?, ?it/s]

In [15]:
print(predictions.shape)
print(labels.shape)
print(predictions[:10])
print(labels[:10])

torch.Size([500, 20])
torch.Size([500, 20])
tensor([[0.0849, 0.2030, 0.0188, 0.0184, 0.1299, 0.1023, 0.0306, 0.0213, 0.3433,
         0.4290, 0.0364, 0.4177, 0.0103, 0.0194, 0.1760, 0.1199, 0.5689, 0.0227,
         0.0515, 0.0907],
        [0.0341, 0.1101, 0.0027, 0.0027, 0.0885, 0.0371, 0.0057, 0.0110, 0.3463,
         0.1957, 0.0130, 0.2258, 0.0024, 0.0050, 0.1735, 0.0497, 0.4962, 0.0059,
         0.0171, 0.0433],
        [0.2255, 0.2006, 0.0688, 0.0160, 0.4520, 0.1263, 0.6081, 0.0185, 0.2809,
         0.3952, 0.0981, 0.1166, 0.0062, 0.0382, 0.3195, 0.1070, 0.4843, 0.0268,
         0.0327, 0.3060],
        [0.0579, 0.0959, 0.0244, 0.0107, 0.2435, 0.0715, 0.0256, 0.0558, 0.1895,
         0.1039, 0.0560, 0.1684, 0.0085, 0.0821, 0.2833, 0.2062, 0.3282, 0.5569,
         0.0904, 0.1647],
        [0.0464, 0.1421, 0.0042, 0.0039, 0.1145, 0.0455, 0.0198, 0.0045, 0.2656,
         0.3314, 0.0166, 0.2062, 0.0011, 0.0073, 0.1590, 0.0398, 0.5333, 0.0037,
         0.0081, 0.0636],
        [0.0801,

Select optimal Threshold on Val Dataset

In [16]:
predictions = predictions.detach().cpu()
labels = labels.detach().cpu()

In [17]:
from toolbox.bert_utils import calculate_best_threshold

THRESHOLD, best_f1_score = calculate_best_threshold(labels.numpy(), predictions.numpy())

Threshold: 100%|██████████| 100/100 [00:58<00:00,  1.71it/s]


In [28]:
torch.save(model.state_dict(), "./model/best_model.pt")
print("Saved the best model!")


Saved the best model!


In [29]:
print(THRESHOLD)
print(best_f1_score)

0.33333333333333337
0.6658007563117249


In [None]:
print(THRESHOLD)
print(best_f1_score)

Alternatively if you just one to load a model from checkpoint

binarize the predictions with the optimal threshold

In [None]:
y_pred = predictions.numpy()
y_true = labels.numpy()

upper, lower = 1, 0

y_pred = np.where(y_pred > 0.25, upper, lower)

In [None]:
print(f"Threshold: {THRESHOLD}")
print(classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
))

class_rep = classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
    output_dict=True
)

# Use Threshold to predict on Test Data
If we want to predict on the test-data (if you have split it apart, alternatively you could use the leave-out-dataset). For a single Model.

In [None]:
test_df = leave_out_df

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

test_dataset = BertDataset(
    test_df,
    tokenizer=TOKENIZER,
    max_token_count=PARAMS["MAX_TOKEN_COUNT"],
    label_columns=LABEL_COLUMNS
)

predictions = []
labels = []

with torch.no_grad():
    for item in tqdm(test_dataset):
        input_ids = item["input_ids"].unsqueeze(dim=0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(dim=0).to(device)

        prediction = trained_model(input_ids, attention_mask)

        predictions.append(prediction.flatten().cpu())
        labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

y_pred = predictions.numpy()
y_true = labels.numpy()

In [23]:
print(y_pred[:10])
print(y_true[:10])

[[0.19768946 0.47861814 0.05621206 0.08963757 0.27469584 0.17602919
  0.05605317 0.3262488  0.5223407  0.29802653 0.12079002 0.490611
  0.5982055  0.04795987 0.37662542 0.22822349 0.31384373 0.08800869
  0.30438533 0.17389989]
 [0.09726778 0.19540162 0.01156007 0.00370659 0.40483856 0.19481866
  0.08383829 0.04050161 0.4722789  0.28713363 0.02585992 0.10060598
  0.0052846  0.03234348 0.28127682 0.25726908 0.302832   0.01817109
  0.02224549 0.16202512]
 [0.12248838 0.22477956 0.02015953 0.01389778 0.19697957 0.23180543
  0.05856963 0.03659924 0.43355596 0.56057143 0.04870175 0.2877665
  0.01181335 0.05106555 0.21285364 0.3358183  0.49392682 0.02238621
  0.07907804 0.10171212]
 [0.050573   0.12528972 0.00487865 0.0009754  0.17347924 0.06420155
  0.01376368 0.00793553 0.41136676 0.2746455  0.01309359 0.14136909
  0.00167432 0.0061621  0.17932628 0.10424852 0.30782622 0.00559521
  0.01367295 0.08282189]
 [0.1638837  0.16435964 0.01019481 0.00919462 0.11023044 0.04939653
  0.01429625 0.0087

###  Binarize the model predictions with Threshold


In [24]:
upper, lower = 1, 0

y_pred = np.where(y_pred > 0.5, upper, lower)

In [25]:
print(f"Threshold: {THRESHOLD}")
print(classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
))

class_rep = classification_report(
    y_true,
    y_pred,
    target_names=LABEL_COLUMNS,
    zero_division=0,
    output_dict=True
)

Threshold: 0.33333333333333337
                            precision    recall  f1-score   support

   Self-direction: thought       0.75      0.06      0.11        49
    Self-direction: action       0.64      0.09      0.16        75
               Stimulation       0.00      0.00      0.00        14
                  Hedonism       0.25      0.20      0.22         5
               Achievement       0.56      0.10      0.18        86
          Power: dominance       0.00      0.00      0.00        33
          Power: resources       0.67      0.46      0.55        26
                      Face       0.00      0.00      0.00        25
        Security: personal       0.45      0.17      0.25       103
        Security: societal       0.64      0.19      0.29        85
                 Tradition       1.00      0.03      0.06        31
         Conformity: rules       0.50      0.04      0.07        81
 Conformity: interpersonal       0.50      0.27      0.35        11
                

# Calculate F1-Score

In [26]:
test_custom_f1 = -1
test_macro_recall = class_rep["macro avg"]["recall"]
test_macro_precision = class_rep["macro avg"]["precision"]
if (test_macro_precision + test_macro_recall) != 0:
    test_custom_f1 = (2*test_macro_recall*test_macro_precision/(test_macro_recall+test_macro_precision))
else:
    test_custom_f1 = 0
print(test_custom_f1)

0.2077796434873278


## Test model

In [27]:
test_df_input = pd.read_csv('./data/arguments-test.tsv', sep='\t')
test_df_input["text"] = test_df_input["Premise"]+" " + test_df_input["Stance"]+ " " + test_df_input["Conclusion"]
test_df_input.head()

FileNotFoundError: ignored

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

test_df_dataset = BertDataset(
    data=test_df_input,
    tokenizer=TOKENIZER,
    max_token_count=PARAMS["MAX_TOKEN_COUNT"],
)

predictions = []

for item in tqdm(test_df_dataset):
    _, prediction = trained_model(
        item["input_ids"].unsqueeze(dim=0).to(device),
        item["attention_mask"].unsqueeze(dim=0).to(device)
    )
    predictions.append(prediction.flatten())

predictions = torch.stack(predictions).detach().cpu()

In [None]:
y_pred = predictions.numpy()
upper, lower = 1, 0
y_pred = np.where(y_pred > THRESHOLD, upper, lower)

In [None]:
prediction_dictionary = {}
prediction_dictionary["Argument ID"] = test_df_input["Argument ID"]
for idx, l_name in enumerate(LABEL_COLUMNS):
  prediction_dictionary[l_name]=y_pred[:,idx]

test_prediction_df = pd.DataFrame(prediction_dictionary)
test_prediction_df.head()

In [None]:
if RUN_ID:
    test_prediction_df.to_csv(f"submissions/{RUN_ID}-submission_test.txt", sep="\t", index=False)
else:
    test_prediction_df.to_csv(f"submissions/{NAME}-submission_test.txt", sep="\t", index=False)

# Looking at single predictions

In [None]:
def print_example_prediction(record, show_all_probs=False, THRESHOLD=0.3):

    print(record["Argument ID"])
    print(record["text"])
    print(f"True Label: {record.category}")


    encoding = TOKENIZER.encode_plus(
        record.text,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors='pt',
    )

    _, test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
    test_prediction = test_prediction.flatten().numpy()

    res = {}
    if show_all_probs:
        for label, prediction in zip(LABEL_COLUMNS, test_prediction):
            print(f"{label}: {prediction}")
            res[label] = prediction

    else:
        print(f"Predictions:")
        for label, prediction in zip(LABEL_COLUMNS, test_prediction):
            if prediction < THRESHOLD:
                continue
            print(f"{label}: {prediction}")
            res[label] = prediction
    return res

In [None]:
# 13 whaling is good one
trained_model.to("cpu")
test_record = test_df.iloc[6]
print_example_prediction(test_record, show_all_probs=False, THRESHOLD=THRESHOLD)
